From e59073243d6efdf1e469925bc813ac2e7c104d4e Mon Sep 17 00:00:00 2001 From: Brent Rager Date: Wed, 3 Jun 2026 18:57:25 -0400 Subject: [PATCH] =?UTF-8?q?Pearl=20th-491e0c:=20PiDriver=20=E2=80=94=20dri?= =?UTF-8?q?ve=20@earendil-works/pi-coding-agent=20via=20tmux?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces canceled ClaudeCodeDriver (pearl th-36145e) per user direction 2026-06-03: "we are more in the business of opencode and pi". Mirrors OpenCodeDriver / SmoothDriver shape — interactive TUI in a tmux pane so all three backends measure agentic discipline through the same surface (boot, paste, idle, auto-coach, idle, capture). Pi setup the bench expects: - `pi` on PATH (or under `~/.nvm/versions/node/*/bin/pi`) - `~/.pi/agent/models.json` declaring a `smooai` provider pointing at `https://llm.smoo.ai/v1` with the API key inline (chmod 600). which_pi() falls back to nvm install dirs because the bench's tmux subshell doesn't always inherit the user's nvm-shimmed PATH. Shell command shape: `pi --no-session --provider smooai [--model X]`. `--no-session` keeps runs ephemeral; pinning the smooai provider keeps routing identical to how opencode + smooth hit the same llm.smoo.ai endpoint. First run — 4 fixtures × `pi` × smooai/deepseek-v4-flash, strict coach: cleanup-disk-bloat : 1.000 cleanup-impossible-task : 1.000 (honest refusal) cleanup-node-modules-orphans : 1.000 (freed 3,686,400 bytes) cleanup-pycache-debris : 1.000 AGGREGATE : 1.000 For comparison on the same fixtures+model+coach: Mock baseline : 1.000 (all 4, bash scripts) Pi : 1.000 (all 4, first try) OpenCode : ≥0.93 (verified on 3, expect ~1.000 sweep) Smooth : 0.789 aggregate (run-to-run variance; pycache flakes between 0.43 and 1.00) Pi and OpenCode are now reference points for what smooth needs to match. Next phase: extract observable behaviors that make pi + opencode reliably succeed (text-plan enumeration, robust inter-turn context, picker-free confirmations) and apply them to smooth-code. --- crates/smooth-bench/src/agent_driver.rs | 161 ++++++++++++++++++++++++ crates/smooth-bench/src/main.rs | 14 ++- 2 files changed, 171 insertions(+), 4 deletions(-) diff --git a/crates/smooth-bench/src/agent_driver.rs b/crates/smooth-bench/src/agent_driver.rs index 4dc7b9a..a16eb1e 100644 --- a/crates/smooth-bench/src/agent_driver.rs +++ b/crates/smooth-bench/src/agent_driver.rs @@ -907,6 +907,167 @@ fn drive_smooth_via_tmux( }) } +// ── PiDriver: drive `pi` (Earendil's coding agent) through tmux ───── + +/// Driver that spawns `pi` (`@earendil-works/pi-coding-agent`) inside a +/// tmux pane. Pearl `th-491e0c`. Drives the interactive TUI for +/// apples-to-apples parity with [`OpenCodeDriver`] + [`SmoothDriver`]. +/// +/// Pi has a native print mode (`pi -p "…"`) that would let us skip +/// tmux entirely, but driving all three backends through the same +/// surface (tmux + paste + idle) is what guarantees the scores stay +/// comparable. Differences in agentic behavior shouldn't be +/// confounded by surface-specific quirks. +/// +/// Spawned command: +/// +/// ```bash +/// pi --no-session --provider smooai [--model ] +/// ``` +/// +/// `--no-session` keeps the run ephemeral (no `~/.pi/agent/sessions/` +/// pollution). `--provider smooai` selects the custom provider in +/// `~/.pi/agent/models.json` that points at `llm.smoo.ai`. Model is +/// passed through from the bench's `--model` flag verbatim; `pi` +/// accepts either `provider/model` or `model` if the provider is +/// pre-selected. +/// +/// Pre-flight: requires `pi` on PATH and a `~/.pi/agent/models.json` +/// declaring the `smooai` provider with credentials. If `pi` isn't +/// installed the driver returns an `agent_error` rather than killing +/// the sweep — same shape as `OpenCodeDriver`. +pub struct PiDriver { + /// Path to the `pi` binary. Resolved via `which_pi()` at + /// construction; falls back to `pi` (bare name on PATH) when + /// resolution fails — the spawn step will surface the failure as + /// an agent_error per task. + binary: PathBuf, +} + +impl PiDriver { + /// Construct from PATH. Falls back to bare `pi` if resolution + /// fails so the dispatch path can surface a clean error. + #[must_use] + pub fn from_path() -> Self { + Self { + binary: which_pi().unwrap_or_else(|| PathBuf::from("pi")), + } + } + + /// Construct from an explicit `pi` binary path. Intended for + /// tests and for benching specific installs (e.g. an nvm-managed + /// version pinned at `~/.nvm/versions/node/v22.19.0/bin/pi`). + #[must_use] + pub fn with_binary(binary: PathBuf) -> Self { + Self { binary } + } +} + +impl Default for PiDriver { + fn default() -> Self { + Self::from_path() + } +} + +/// Walk PATH for the `pi` binary, falling back to known nvm install +/// directories. We check nvm explicitly because nvm's shell-init +/// hooks don't propagate cleanly across the bench's spawned `sh -c` +/// invocations — even when `pi` is "on PATH" for the user's +/// interactive shell, the bench's tmux subshell may not see it. +fn which_pi() -> Option { + if let Some(path) = std::env::var_os("PATH") { + for dir in std::env::split_paths(&path) { + let candidate = dir.join("pi"); + if candidate.is_file() { + return Some(candidate); + } + } + } + // Fallback: scan nvm's installed Node versions for a `bin/pi`. + if let Some(home) = dirs_next::home_dir() { + let nvm_node = home.join(".nvm").join("versions").join("node"); + if let Ok(entries) = std::fs::read_dir(&nvm_node) { + for entry in entries.flatten() { + let candidate = entry.path().join("bin").join("pi"); + if candidate.is_file() { + return Some(candidate); + } + } + } + } + None +} + +#[async_trait] +impl AgentDriver for PiDriver { + fn name(&self) -> &'static str { + "pi" + } + + async fn dispatch(&self, req: DispatchRequest<'_>) -> Result { + let binary = self.binary.clone(); + let task_id = req.task_id.to_string(); + let workspace = req.workspace.to_path_buf(); + let prompt = req.prompt.to_string(); + let model = req.model.map(str::to_string); + let timeout = req.timeout; + let coach = req.coach; + tokio::task::spawn_blocking(move || drive_pi_via_tmux(&binary, &task_id, &workspace, &prompt, model.as_deref(), timeout, coach)) + .await + .context("pi driver join") + } +} + +/// Build the `sh -c` command tmux runs to launch pi's TUI. +/// +/// We pin `--provider smooai` so the run goes through `llm.smoo.ai` +/// (configured in `~/.pi/agent/models.json` — same auth path the user +/// configures once interactively). `--no-session` keeps the run +/// ephemeral. +fn pi_shell_cmd(binary: &Path, model: Option<&str>) -> String { + let mut cmd = shell_escape(&binary.to_string_lossy()); + cmd.push_str(" --no-session --provider smooai"); + if let Some(m) = model { + cmd.push_str(" --model "); + cmd.push_str(&shell_escape(m)); + } + cmd +} + +/// Sync core of the pi driver. Hands off to [`drive_tmux_agent`] with +/// the pi-flavored spec. +fn drive_pi_via_tmux( + binary: &Path, + task_id: &str, + workspace: &Path, + prompt: &str, + model: Option<&str>, + timeout: Duration, + coach: CoachMode, +) -> AgentRunArtifacts { + drive_tmux_agent(TmuxAgentSpec { + driver_name: "pi", + shell_cmd: pi_shell_cmd(binary, model), + // Pi's TUI is a Node.js process — boot is fast (~1-3s on a + // warm node_modules cache, longer on a cold one). 30s is the + // same conservative ceiling as OpenCode; we'd rather fail + // fast on a broken install than burn the per-task budget. + boot_timeout: Duration::from_secs(30), + paste_warmup: Duration::from_millis(800), + // Pi shows visible token-streaming when the model responds, + // so the 8s dwell that works for OpenCode should work here + // too. If pi turns out to have a static spinner state like + // smooth's `Thinking...`, bump to 15-20s. + first_idle_dwell: Duration::from_secs(8), + post_coach_dwell: Duration::from_secs(5), + task_id, + workspace, + prompt, + timeout, + coach, + }) +} + /// Return the substring of `pane` AFTER the last occurrence of a /// stable prefix of `prompt`. If the prompt can't be found in the /// pane (TUI reflow ate it), returns the whole pane as a fallback. diff --git a/crates/smooth-bench/src/main.rs b/crates/smooth-bench/src/main.rs index ea91cd3..d9ecf8c 100644 --- a/crates/smooth-bench/src/main.rs +++ b/crates/smooth-bench/src/main.rs @@ -422,9 +422,12 @@ struct ScoreCleanupArgs { enum AgentDriverKind { Mock, Opencode, - /// Smooth's own `th code`. Pearl th-754512 — not wired yet. + /// Smooth's own `th code`. Pearl th-754512. Smooth, - /// Claude Code's `claude -p`. Pearl th-36145e — not wired yet. + /// Earendil's `pi` coding agent. Pearl th-491e0c. + Pi, + /// Claude Code's `claude -p`. Pearl th-36145e (canceled per user + /// directive 2026-06-03 — Pi takes its slot). ClaudeCode, } @@ -577,7 +580,7 @@ async fn run_score_replay(args: ScoreReplayArgs) -> Result<()> { } async fn run_score_cleanup(args: ScoreCleanupArgs) -> Result<()> { - use smooth_bench::agent_driver::{AgentDriver, DispatchRequest, MockAgentDriver, OpenCodeDriver, SmoothDriver}; + use smooth_bench::agent_driver::{AgentDriver, DispatchRequest, MockAgentDriver, OpenCodeDriver, PiDriver, SmoothDriver}; use smooth_bench::score_cleanup::{ aggregate, destroyed_paths, discover_tasks, load_manifest, measure_bytes, run_setup, score_one_task, sweep_passed, AgentRunArtifacts, }; @@ -617,7 +620,10 @@ async fn run_score_cleanup(args: ScoreCleanupArgs) -> Result<()> { } AgentDriverKind::Opencode => Box::new(OpenCodeDriver::from_path()), AgentDriverKind::Smooth => Box::new(SmoothDriver::from_path()), - AgentDriverKind::ClaudeCode => anyhow::bail!("--driver=claude-code not wired yet — pearl th-36145e"), + AgentDriverKind::Pi => Box::new(PiDriver::from_path()), + AgentDriverKind::ClaudeCode => { + anyhow::bail!("--driver=claude-code canceled per user direction 2026-06-03 (pearl th-36145e); use --driver=pi instead (th-491e0c)") + } }; eprintln!("score-cleanup: driver = {}", driver.name()); if let Some(m) = args.model.as_deref() {