diff --git a/crates/smooth-bigsmooth/src/server.rs b/crates/smooth-bigsmooth/src/server.rs index 3eacc009..da9dc97e 100644 --- a/crates/smooth-bigsmooth/src/server.rs +++ b/crates/smooth-bigsmooth/src/server.rs @@ -21,8 +21,30 @@ use tower_http::trace::TraceLayer; use crate::events::{ClientEvent, ServerEvent}; -/// Default idle timeout: 30 minutes. -const DEFAULT_IDLE_TIMEOUT_SECS: u64 = 30 * 60; +/// Default idle timeout: 24 hours. +/// +/// Was 30 minutes. Bumped under pearl `th-1b9b3e` after bench evidence +/// showed Big Smooth was silently shutting itself down mid-session — +/// pi + opencode (the bench's reference backends) have no daemon and +/// therefore no auto-shutdown, so smooth's 30-min cliff was a +/// competitive-parity loss masquerading as a "crashes unprompted" +/// symptom. +/// +/// 24h keeps a safety net for forgotten-running dev sessions but +/// doesn't fire during a single work session. Override at boot via +/// `SMOOTH_BIGSMOOTH_IDLE_TIMEOUT_SECS=<seconds>` (set to `0` to +/// disable entirely; only honored when set in the daemon process's +/// own environment, which in sandboxed mode is the safehouse VM — +/// see project memory on env propagation). +const DEFAULT_IDLE_TIMEOUT_SECS: u64 = 24 * 60 * 60; + +/// Read the idle-timeout env override. `None` = use default. `Some(0)` +/// = disabled (timeout never fires). +fn idle_timeout_from_env() -> Option<Duration> { + let raw = std::env::var("SMOOTH_BIGSMOOTH_IDLE_TIMEOUT_SECS").ok()?; + let secs: u64 = raw.parse().ok()?; + Some(Duration::from_secs(secs)) +} /// Default broadcast channel capacity. 
const BROADCAST_CHANNEL_CAPACITY: usize = 256; @@ -289,7 +311,7 @@ impl AppState { session_store, start_time: Instant::now(), last_activity: Arc::new(Mutex::new(Instant::now())), - idle_timeout: Duration::from_secs(DEFAULT_IDLE_TIMEOUT_SECS), + idle_timeout: idle_timeout_from_env().unwrap_or_else(|| Duration::from_secs(DEFAULT_IDLE_TIMEOUT_SECS)), event_tx, safehouse: None, diver: None, @@ -597,23 +619,33 @@ pub async fn start(mut state: AppState, addr: SocketAddr) -> anyhow::Result<()> } } - // Spawn idle timeout checker - let idle_state = state.clone(); - tokio::spawn(async move { - loop { - tokio::time::sleep(Duration::from_secs(60)).await; - let elapsed = { - let Ok(last) = idle_state.last_activity.lock() else { - continue; + // Spawn idle timeout checker (pearl th-1b9b3e). Skip entirely when + // the timeout is zero — bench harness + long-running dev sessions + // set `SMOOTH_BIGSMOOTH_IDLE_TIMEOUT_SECS=0` to opt out of the + // idle auto-shutdown (24 h by default; was 30 min). Pi + OpenCode (the bench's reference + // backends) have no daemon timeout because they have no daemon — + // smooth's daemon model means any pause longer than the old 30-min limit auto-killed the + // process before this knob existed. 
+ if state.idle_timeout.is_zero() { + tracing::info!("Idle timeout disabled (SMOOTH_BIGSMOOTH_IDLE_TIMEOUT_SECS=0)"); + } else { + let idle_state = state.clone(); + tokio::spawn(async move { + loop { + tokio::time::sleep(Duration::from_secs(60)).await; + let elapsed = { + let Ok(last) = idle_state.last_activity.lock() else { + continue; + }; + last.elapsed() }; - last.elapsed() - }; - if elapsed > idle_state.idle_timeout { - tracing::info!("Idle timeout reached ({:.0}s), shutting down", idle_state.idle_timeout.as_secs_f64()); - std::process::exit(0); + if elapsed > idle_state.idle_timeout { + tracing::info!("Idle timeout reached ({:.0}s), shutting down", idle_state.idle_timeout.as_secs_f64()); + std::process::exit(0); + } } - } - }); + }); + } // Spawn orchestrator loop — continuously picks up ready pearls and // dispatches operators. Skipped in direct mode: the orchestrator diff --git a/docs/Engineering/Big-Smooth-Direct-Mode.md b/docs/Engineering/Big-Smooth-Direct-Mode.md new file mode 100644 index 00000000..d4fbd16c --- /dev/null +++ b/docs/Engineering/Big-Smooth-Direct-Mode.md @@ -0,0 +1,78 @@ +# Big Smooth — Direct (host) vs Sandboxed (safehouse VM) mode + +Big Smooth runs in one of two modes. Choose at `th up` time. 
+ +| | `th up` (sandboxed, default) | `th up direct` | +|---|---|---| +| **Boot time** | ~30s — boots a safehouse microVM + the in-VM cast | **~0.3s** — daemon starts directly on the host | +| **Isolation** | Strong — agent runs inside a microVM, safehouse mediates filesystem/network | None — agent is a host subprocess; tools execute against the host filesystem | +| **When to use** | Untrusted code, agent dispatches you don't fully control, CI runners that need defense in depth | Pre-trusted environments — dedicated devbox, CI runner you own, bench harnesses | +| **Idle timeout default** | 24 h (was 30 min — pearl `th-1b9b3e`) | 24 h | +| **Native runner needed?** | No — runner is baked into the safehouse OCI image | **Yes** — build with `cargo build --release -p smooai-smooth-operator-runner` and either auto-discovery picks it up from `~/.cargo/shared-target/release/smooth-operator-runner`, or you set `SMOOTH_OPERATOR_RUNNER_NATIVE=/abs/path/to/runner` before `th up direct` | + +## Why this matters for parity with pi + opencode + +Pi (`@earendil-works/pi-coding-agent`) and OpenCode (`opencode`) both boot in ~3s +and have no daemon model. Smooth's sandboxed default looked like a "30s boot, +sometimes crashes" agent against them. Direct mode is a near-100× boot speedup +that brings smooth into the same launch-time class as pi + opencode for +pre-trusted use cases (dev machines, bench harnesses). + +## Smoke test + +```bash +# Build the native runner once per checkout. +cargo build --release -p smooai-smooth-operator-runner + +# Start in direct mode. +th down +SMOOTH_OPERATOR_RUNNER_NATIVE=~/.cargo/shared-target/release/smooth-operator-runner \ + th up direct + +# Confirm: th status should report healthy in under a second. 
+th status +``` + +The runner-bin auto-discovery has a paper-cut tracked under pearl `th-e74aa6`: +when the env var is unset the error message names the build command but doesn't +mention that auto-discovery from `~/.cargo/shared-target/release/` will work if +you've built it. Either approach gets you there. + +## Bench harness usage + +`smooth-bench` doesn't care which mode Big Smooth is in — both expose the same +HTTP API at `localhost:4400`. The `SmoothDriver` in +`crates/smooth-bench/src/agent_driver.rs` just spawns `th code` against the +running daemon. So: + +```bash +# Sandboxed mode (default — slow boot, more isolation) +th up +cargo run -p smooai-smooth-bench -- score-cleanup --driver=smooth … + +# Direct mode (fast boot, host trust) +th down +SMOOTH_OPERATOR_RUNNER_NATIVE=~/.cargo/shared-target/release/smooth-operator-runner th up direct +cargo run -p smooai-smooth-bench -- score-cleanup --driver=smooth … +``` + +Result JSON includes `dispatch="direct"` or `dispatch="sandboxed"` in the daemon +log (`~/.smooth/log/th.log`) so post-hoc you can tell which mode each result +came from. + +## Recent bench numbers + +`deepseek-v4-flash` via `llm.smoo.ai`, strict coach, 4 cleanup fixtures +(`cleanup-impossible-task`, `cleanup-pycache-debris`, `cleanup-disk-bloat`, +`cleanup-node-modules-orphans`): + +| backend | aggregate | boot time | notes | +|---|---|---|---| +| mock | 1.000 | n/a | bash baseline | +| **pi** | **1.000** | ~3s | new reference high-water | +| opencode | ≥0.93 | ~3s | reliable on tested fixtures | +| **smooth-direct** | **0.850** | **~0.3s** | beats sandboxed; boots ~10× faster than pi | +| smooth-sandboxed | 0.789 | ~30s | run-to-run variance still present | + +Pearls related: `th-0fc29f` (boot time, this doc closes it), +`th-1b9b3e` (idle timeout, closed), `th-6e361d` (pycache variance, open).