SmooAI · brentrager · Jun 3, 2026 · Jun 3, 2026 · Jun 4, 2026 · Jun 4, 2026
diff --git a/.smooth/dolt/pearls/.dolt/noms/journal.idx b/.smooth/dolt/pearls/.dolt/noms/journal.idx
diff --git a/.smooth/dolt/pearls/.dolt/noms/vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv b/.smooth/dolt/pearls/.dolt/noms/vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
diff --git a/crates/smooth-bigsmooth/src/server.rs b/crates/smooth-bigsmooth/src/server.rs
@@ -21,8 +21,30 @@ use tower_http::trace::TraceLayer;
 
 use crate::events::{ClientEvent, ServerEvent};
 
-/// Default idle timeout: 30 minutes.
-const DEFAULT_IDLE_TIMEOUT_SECS: u64 = 30 * 60;
+/// Default idle timeout: 24 hours.
+///
+/// Was 30 minutes. Bumped under pearl `th-1b9b3e` after bench evidence
+/// showed Big Smooth was silently shutting itself down mid-session —
+/// pi + opencode (the bench's reference backends) have no daemon and
+/// therefore no auto-shutdown, so smooth's 30-min cliff was a
+/// competitive-parity loss masquerading as a "crashes unprompted"
+/// symptom.
+///
+/// 24h keeps a safety net for forgotten-running dev sessions but
+/// doesn't fire during a single work session. Override at boot via
+/// `SMOOTH_BIGSMOOTH_IDLE_TIMEOUT_SECS=<seconds>` (set to `0` to
+/// disable entirely; only honored when set in the daemon process's
+/// own environment, which in sandboxed mode is the safehouse VM —
+/// see project memory on env propagation).
+const DEFAULT_IDLE_TIMEOUT_SECS: u64 = 24 * 60 * 60;
+
+/// Read the idle-timeout env override. `None` = unset or not a valid
+/// `u64` (caller falls back to the default). Zero = timeout disabled.
+fn idle_timeout_from_env() -> Option<Duration> {
+    let raw = std::env::var("SMOOTH_BIGSMOOTH_IDLE_TIMEOUT_SECS").ok()?; // unset / non-Unicode -> None
+    let secs: u64 = raw.parse().ok()?; // no trimming: stray whitespace, a negative value, or a unit suffix -> None
+    Some(Duration::from_secs(secs))
+}
 
 /// Default broadcast channel capacity.
 const BROADCAST_CHANNEL_CAPACITY: usize = 256;
@@ -289,7 +311,7 @@ impl AppState {
             session_store,
             start_time: Instant::now(),
             last_activity: Arc::new(Mutex::new(Instant::now())),
-            idle_timeout: Duration::from_secs(DEFAULT_IDLE_TIMEOUT_SECS),
+            idle_timeout: idle_timeout_from_env().unwrap_or_else(|| Duration::from_secs(DEFAULT_IDLE_TIMEOUT_SECS)),
             event_tx,
             safehouse: None,
             diver: None,
@@ -597,23 +619,33 @@ pub async fn start(mut state: AppState, addr: SocketAddr) -> anyhow::Result<()>
         }
     }
 
-    // Spawn idle timeout checker
-    let idle_state = state.clone();
-    tokio::spawn(async move {
-        loop {
-            tokio::time::sleep(Duration::from_secs(60)).await;
-            let elapsed = {
-                let Ok(last) = idle_state.last_activity.lock() else {
-                    continue;
+    // Spawn idle timeout checker (pearl th-1b9b3e). Skip entirely when
+    // the timeout is zero — bench harness + long-running dev sessions
+    // set `SMOOTH_BIGSMOOTH_IDLE_TIMEOUT_SECS=0` to opt out of the
+    // idle auto-shutdown (24 h by default, formerly 30 min). Pi +
+    // OpenCode (the bench's reference backends) have no daemon and so
+    // no such timeout; before this knob existed, any idle stretch
+    // longer than 30 minutes killed the smooth daemon.
+    if state.idle_timeout.is_zero() {
+        tracing::info!("Idle timeout disabled (SMOOTH_BIGSMOOTH_IDLE_TIMEOUT_SECS=0)");
+    } else {
+        let idle_state = state.clone();
+        tokio::spawn(async move {
+            loop {
+                tokio::time::sleep(Duration::from_secs(60)).await;
+                let elapsed = {
+                    let Ok(last) = idle_state.last_activity.lock() else {
+                        continue;
+                    };
+                    last.elapsed()
                 };
-                last.elapsed()
-            };
-            if elapsed > idle_state.idle_timeout {
-                tracing::info!("Idle timeout reached ({:.0}s), shutting down", idle_state.idle_timeout.as_secs_f64());
-                std::process::exit(0);
+                if elapsed > idle_state.idle_timeout {
+                    tracing::info!("Idle timeout reached ({:.0}s), shutting down", idle_state.idle_timeout.as_secs_f64());
+                    std::process::exit(0);
+                }
             }
-        }
-    });
+        });
+    }
 
     // Spawn orchestrator loop — continuously picks up ready pearls and
     // dispatches operators. Skipped in direct mode: the orchestrator

diff --git a/crates/smooth-cli/src/main.rs b/crates/smooth-cli/src/main.rs
@@ -3190,7 +3190,11 @@ async fn cmd_code(
     // the classifier per-message.
     let working_dir = std::env::current_dir()?;
     let _ = agent_name; // keep the typo-validation call; value isn't used in TUI mode
-    smooth_code::app::run_with_session(working_dir, resumed_session, agent).await
+    // Pearl th-20574a: thread the user's --model flag into the TUI
+    // path. Before this, `model` was parsed by clap then silently
+    // dropped here — every TaskStart picked the default smooth-coding
+    // alias regardless of what the user asked for.
+    smooth_code::app::run_with_session(working_dir, resumed_session, agent, model).await
 }
 
 fn cmd_hooks(cmd: HooksCommands) -> Result<()> {

diff --git a/crates/smooth-code/src/app.rs b/crates/smooth-code/src/app.rs
@@ -102,7 +102,7 @@ fn write_bench_cost_sidecar_to(path: &std::path::Path, total_cost_usd: f64, iter
 /// thread holding the lock).
 #[allow(clippy::unused_async)] // async required for caller ergonomics and tokio::spawn inside
 pub async fn run(working_dir: PathBuf) -> anyhow::Result<()> {
-    run_with_session(working_dir, None, None).await
+    run_with_session(working_dir, None, None, None).await
 }
 
 /// Run the TUI, optionally preloading a persisted session.
@@ -118,7 +118,7 @@ pub async fn run(working_dir: PathBuf) -> anyhow::Result<()> {
 /// # Errors
 /// Same as [`run`].
 #[allow(clippy::unused_async)]
-pub async fn run_with_session(working_dir: PathBuf, resume: Option<crate::session::Session>, agent: Option<String>) -> anyhow::Result<()> {
+pub async fn run_with_session(working_dir: PathBuf, resume: Option<crate::session::Session>, agent: Option<String>, model: Option<String>) -> anyhow::Result<()> {
     tui_debug(format!("app::run start, cwd={}", working_dir.display()));
 
     // TTY pre-flight. If stdin or stdout isn't a TTY, the TUI will enter
@@ -213,6 +213,13 @@ pub async fn run_with_session(working_dir: PathBuf, resume: Option<crate::sessio
         // classifier override the operator's deliberate choice.
         initial_state.agent_pinned = true;
     }
+    // Pearl th-20574a: thread the CLI's `--model` flag through to
+    // every TaskStart so bench harnesses (and any user who passes
+    // --model) actually get the requested model instead of silently
+    // falling back to smooth-coding's default alias.
+    if let Some(m) = model {
+        initial_state.model_override = Some(m);
+    }
 
     let state = Arc::new(Mutex::new(initial_state));
 
@@ -1201,7 +1208,17 @@ async fn run_agent_streaming(message: &str, tx: mpsc::UnboundedSender<AgentEvent
         out
     };
 
-    let mut events = client.run_task(message, None, None, cwd.as_deref(), agent.as_deref(), prior_messages).await?;
+    // Pearl th-20574a: read the user's --model override from AppState
+    // so it actually reaches Big Smooth's routing layer. Was a literal
+    // `None` here; every TaskStart fell back to the smooth-coding alias
+    // regardless of CLI flag.
+    let model_override = {
+        let s = state.lock().unwrap_or_else(|e| e.into_inner());
+        s.model_override.clone()
+    };
+    let mut events = client
+        .run_task(message, model_override.as_deref(), None, cwd.as_deref(), agent.as_deref(), prior_messages)
+        .await?;
 
     // Per-tool-name queues of (id, started_at, args). The runner emits
     // a ToolCallStart, then the tool runs, then a ToolCallComplete —

diff --git a/crates/smooth-code/src/state.rs b/crates/smooth-code/src/state.rs
@@ -297,6 +297,17 @@ pub struct AppState {
     pub user_scrolled: bool,
     /// Display name of the current LLM model.
     pub model_name: String,
+    /// User-supplied model override from `th code --model <X>`. When
+    /// `Some(_)`, every `TaskStart` dispatched to Big Smooth carries
+    /// this value verbatim in its `model` field; Big Smooth's routing
+    /// then resolves it against the configured providers (a smooth-*
+    /// alias OR a concrete model id like `deepseek-v4-flash`). Pearl
+    /// `th-20574a` — before this field existed, the CLI's `--model`
+    /// flag was silently dropped on the TUI path and every run used
+    /// the default smooth-coding alias regardless of what the user
+    /// asked for. `None` (the default) preserves the legacy behavior
+    /// of letting Big Smooth pick.
+    pub model_override: Option<String>,
     /// Active lead role name (`fixer` / `mapper` / `oracle` / `heckler`).
     /// Flows into every `TaskStart` so the runner applies the right
     /// clearance set; rendered on the status bar so the user can see
@@ -385,6 +396,7 @@ impl AppState {
             scroll_offset: 0,
             user_scrolled: false,
             model_name: "claude-sonnet-4".to_string(),
+            model_override: None,
             agent_name: "fixer".to_string(),
             agent_pinned: false,
             verbose: false,

diff --git a/crates/smooth-operator/src/cast/prompts/fixer.txt b/crates/smooth-operator/src/cast/prompts/fixer.txt
@@ -1,3 +1,23 @@
+You are a general-purpose technical agent working inside a sandboxed workspace. You can write code, edit files, run shell commands, run tests, perform filesystem cleanup, investigate issues, answer questions — whatever the user actually asks for.
+
+## Hard rule #0: DO THE THING THE USER ASKED FOR
+
+This rule beats every other rule in this document.
+
+The user's message — taken at face value, on its own merits — defines your task for this turn. Period.
+
+- If the user says "delete __pycache__ dirs", your job is to delete __pycache__ dirs. NOT to run tests, NOT to look for failing tests, NOT to invent tests that aren't there.
+- If the user says "tell me what this regex does", your job is to explain a regex. NOT to refactor it, NOT to write tests for it.
+- If the user says "clean up the tmp/ directory", your job is filesystem cleanup. NOT a code-review of nearby files.
+
+**Do NOT pivot.** Do NOT introduce a task the user didn't ask for. Do NOT say "I will now fix the remaining test failures" unless the user's literal request was to fix test failures. If you find yourself drafting a sentence like "I will now [do something the user didn't ask for]", stop and re-read the user's first message.
+
+Test-running, test-fixing, and test-writing guidance appears further down in this prompt. Apply it ONLY when the user's request involves tests. For non-test tasks (cleanup, refactor, explanation, investigation, ops), skip the test sections entirely — they don't apply.
+
+This rule exists because the model that runs this prompt has been observed to pattern-match the prompt's many test-related sections and pivot to "let me fix the tests" on tasks that have nothing to do with tests. That's a bug, not a feature. The user asked for something specific. Do that.
+
+---
+
 You are a coding agent working inside a sandboxed workspace. You can write code, edit files, run commands, and run tests. You can also just answer questions about code — analytical / advisory mode is fine when that's what the user is actually asking for.
 
 ## When the user's message starts with `## Skill: <name>`

diff --git a/docs/Engineering/Big-Smooth-Direct-Mode.md b/docs/Engineering/Big-Smooth-Direct-Mode.md
@@ -0,0 +1,78 @@
+# Big Smooth — Direct (host) vs Sandboxed (safehouse VM) mode
+
+Big Smooth runs in one of two modes. Choose at `th up` time.
+
+| | `th up` (sandboxed, default) | `th up direct` |
+|---|---|---|
+| **Boot time** | ~30s — boots a safehouse microVM + the in-VM cast | **~0.3s** — daemon starts directly on the host |
+| **Isolation** | Strong — agent runs inside a microVM, safehouse mediates filesystem/network | None — agent is a host subprocess; tools execute against the host filesystem |
+| **When to use** | Untrusted code, agent dispatches you don't fully control, CI runners that need defense in depth | Pre-trusted environments — dedicated devbox, CI runner you own, bench harnesses |
+| **Idle timeout default** | 24 h (was 30 min — pearl `th-1b9b3e`) | 24 h |
+| **Native runner needed?** | No — runner is baked into the safehouse OCI image | **Yes** — build with `cargo build --release -p smooai-smooth-operator-runner` and either auto-discovery picks it up from `~/.cargo/shared-target/release/smooth-operator-runner`, or you set `SMOOTH_OPERATOR_RUNNER_NATIVE=/abs/path/to/runner` before `th up direct` |
+
+## Why this matters for parity with pi + opencode
+
+Pi (`@earendil-works/pi-coding-agent`) and OpenCode (`opencode`) both boot in ~3s
+and have no daemon model. Smooth's sandboxed default looked like a "30s boot,
+sometimes crashes" agent against them. Direct mode is a near-100× boot speedup
+that brings smooth into the same launch-time class as pi + opencode for
+pre-trusted use cases (dev machines, bench harnesses).
+
+## Smoke test
+
+```bash
+# Build the native runner once per checkout.
+cargo build --release -p smooai-smooth-operator-runner
+
+# Start in direct mode.
+th down
+SMOOTH_OPERATOR_RUNNER_NATIVE=~/.cargo/shared-target/release/smooth-operator-runner \
+  th up direct
+
+# Confirm: th status should report healthy in under a second.
+th status
+```
+
+The runner-bin auto-discovery has a paper-cut tracked under pearl `th-e74aa6`:
+when the env var is unset the error message names the build command but doesn't
+mention that auto-discovery from `~/.cargo/shared-target/release/` will work if
+you've built it. Either approach gets you there.
+
+## Bench harness usage
+
+`smooth-bench` doesn't care which mode Big Smooth is in — both expose the same
+HTTP API at `localhost:4400`. The `SmoothDriver` in
+`crates/smooth-bench/src/agent_driver.rs` just spawns `th code` against the
+running daemon. So:
+
+```bash
+# Sandboxed mode (default — slow boot, more isolation)
+th up
+cargo run -p smooai-smooth-bench -- score-cleanup --driver=smooth …
+
+# Direct mode (fast boot, host trust)
+th down
+SMOOTH_OPERATOR_RUNNER_NATIVE=~/.cargo/shared-target/release/smooth-operator-runner th up direct
+cargo run -p smooai-smooth-bench -- score-cleanup --driver=smooth …
+```
+
+Each run is logged with `dispatch="direct"` or `dispatch="sandboxed"` in the
+daemon log (`~/.smooth/log/th.log`), so post-hoc you can tell which mode each
+result JSON came from.
+
+## Recent bench numbers
+
+`deepseek-v4-flash` via `llm.smoo.ai`, strict coach, 4 cleanup fixtures
+(`cleanup-impossible-task`, `cleanup-pycache-debris`, `cleanup-disk-bloat`,
+`cleanup-node-modules-orphans`):
+
+| backend | aggregate | boot time | notes |
+|---|---|---|---|
+| mock | 1.000 | n/a | bash baseline |
+| **pi** | **1.000** | ~3s | new reference high-water mark |
+| opencode | ≥0.93 | ~3s | reliable on tested fixtures |
+| **smooth-direct** | **0.850** | **~0.3s** | beats sandboxed; boots faster than pi (~0.3s vs ~3s) |
+| smooth-sandboxed | 0.789 | ~30s | run-to-run variance still present |
+
+Pearls related: `th-0fc29f` (boot time, this doc closes it),
+`th-1b9b3e` (idle timeout, closed), `th-6e361d` (pycache variance, open).