Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified .smooth/dolt/pearls/.dolt/noms/journal.idx
Binary file not shown.
Binary file modified .smooth/dolt/pearls/.dolt/noms/vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
Binary file not shown.
68 changes: 50 additions & 18 deletions crates/smooth-bigsmooth/src/server.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,30 @@ use tower_http::trace::TraceLayer;

use crate::events::{ClientEvent, ServerEvent};

/// Default idle timeout: 30 minutes.
const DEFAULT_IDLE_TIMEOUT_SECS: u64 = 30 * 60;
/// Default idle timeout: 24 hours.
///
/// Was 30 minutes. Bumped under pearl `th-1b9b3e` after bench evidence
/// showed Big Smooth was silently shutting itself down mid-session —
/// pi + opencode (the bench's reference backends) have no daemon and
/// therefore no auto-shutdown, so smooth's 30-min cliff was a
/// competitive-parity loss masquerading as a "crashes unprompted"
/// symptom.
///
/// 24h keeps a safety net for forgotten-running dev sessions but
/// doesn't fire during a single work session. Override at boot via
/// `SMOOTH_BIGSMOOTH_IDLE_TIMEOUT_SECS=<seconds>` (set to `0` to
/// disable entirely; only honored when set in the daemon process's
/// own environment, which in sandboxed mode is the safehouse VM —
/// see project memory on env propagation).
const DEFAULT_IDLE_TIMEOUT_SECS: u64 = 24 * 60 * 60;

/// Read the idle-timeout override from `SMOOTH_BIGSMOOTH_IDLE_TIMEOUT_SECS`.
///
/// Returns `None` when the variable is unset, is not valid Unicode, or does
/// not parse as a non-negative whole number of seconds; the caller is then
/// expected to fall back to its default. Returns `Some` of a zero-length
/// `Duration` for `0`, which means "disabled" (timeout never fires).
///
/// Surrounding whitespace is ignored so that values arriving via env files
/// or shell quoting (e.g. `"3600 "` or `"0\n"`) are honored instead of being
/// silently discarded in favor of the default.
fn idle_timeout_from_env() -> Option<Duration> {
    let raw = std::env::var("SMOOTH_BIGSMOOTH_IDLE_TIMEOUT_SECS").ok()?;
    // `u64::from_str` rejects any leading/trailing whitespace, so trim first.
    let secs: u64 = raw.trim().parse().ok()?;
    Some(Duration::from_secs(secs))
}

/// Default broadcast channel capacity.
const BROADCAST_CHANNEL_CAPACITY: usize = 256;
Expand Down Expand Up @@ -289,7 +311,7 @@ impl AppState {
session_store,
start_time: Instant::now(),
last_activity: Arc::new(Mutex::new(Instant::now())),
idle_timeout: Duration::from_secs(DEFAULT_IDLE_TIMEOUT_SECS),
idle_timeout: idle_timeout_from_env().unwrap_or_else(|| Duration::from_secs(DEFAULT_IDLE_TIMEOUT_SECS)),
event_tx,
safehouse: None,
diver: None,
Expand Down Expand Up @@ -597,23 +619,33 @@ pub async fn start(mut state: AppState, addr: SocketAddr) -> anyhow::Result<()>
}
}

// Spawn idle timeout checker
let idle_state = state.clone();
tokio::spawn(async move {
loop {
tokio::time::sleep(Duration::from_secs(60)).await;
let elapsed = {
let Ok(last) = idle_state.last_activity.lock() else {
continue;
// Spawn idle timeout checker (pearl th-1b9b3e). Skip entirely when
// the timeout is zero — bench harness + long-running dev sessions
// set `SMOOTH_BIGSMOOTH_IDLE_TIMEOUT_SECS=0` to opt out of the
// idle auto-shutdown (24h by default; formerly 30 min). Pi + OpenCode (the bench's reference
// backends) have no daemon timeout because they have no daemon —
// smooth's daemon model means every loop pause auto-killed the
// process before this knob existed.
if state.idle_timeout.is_zero() {
tracing::info!("Idle timeout disabled (SMOOTH_BIGSMOOTH_IDLE_TIMEOUT_SECS=0)");
} else {
let idle_state = state.clone();
tokio::spawn(async move {
loop {
tokio::time::sleep(Duration::from_secs(60)).await;
let elapsed = {
let Ok(last) = idle_state.last_activity.lock() else {
continue;
};
last.elapsed()
};
last.elapsed()
};
if elapsed > idle_state.idle_timeout {
tracing::info!("Idle timeout reached ({:.0}s), shutting down", idle_state.idle_timeout.as_secs_f64());
std::process::exit(0);
if elapsed > idle_state.idle_timeout {
tracing::info!("Idle timeout reached ({:.0}s), shutting down", idle_state.idle_timeout.as_secs_f64());
std::process::exit(0);
}
}
}
});
});
}

// Spawn orchestrator loop — continuously picks up ready pearls and
// dispatches operators. Skipped in direct mode: the orchestrator
Expand Down
6 changes: 5 additions & 1 deletion crates/smooth-cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3190,7 +3190,11 @@ async fn cmd_code(
// the classifier per-message.
let working_dir = std::env::current_dir()?;
let _ = agent_name; // keep the typo-validation call; value isn't used in TUI mode
smooth_code::app::run_with_session(working_dir, resumed_session, agent).await
// Pearl th-20574a: thread the user's --model flag into the TUI
// path. Before this, `model` was parsed by clap then silently
// dropped here — every TaskStart picked the default smooth-coding
// alias regardless of what the user asked for.
smooth_code::app::run_with_session(working_dir, resumed_session, agent, model).await
}

fn cmd_hooks(cmd: HooksCommands) -> Result<()> {
Expand Down
23 changes: 20 additions & 3 deletions crates/smooth-code/src/app.rs
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ fn write_bench_cost_sidecar_to(path: &std::path::Path, total_cost_usd: f64, iter
/// thread holding the lock).
#[allow(clippy::unused_async)] // async required for caller ergonomics and tokio::spawn inside
pub async fn run(working_dir: PathBuf) -> anyhow::Result<()> {
run_with_session(working_dir, None, None).await
run_with_session(working_dir, None, None, None).await
}

/// Run the TUI, optionally preloading a persisted session.
Expand All @@ -118,7 +118,7 @@ pub async fn run(working_dir: PathBuf) -> anyhow::Result<()> {
/// # Errors
/// Same as [`run`].
#[allow(clippy::unused_async)]
pub async fn run_with_session(working_dir: PathBuf, resume: Option<crate::session::Session>, agent: Option<String>) -> anyhow::Result<()> {
pub async fn run_with_session(working_dir: PathBuf, resume: Option<crate::session::Session>, agent: Option<String>, model: Option<String>) -> anyhow::Result<()> {
tui_debug(format!("app::run start, cwd={}", working_dir.display()));

// TTY pre-flight. If stdin or stdout isn't a TTY, the TUI will enter
Expand Down Expand Up @@ -213,6 +213,13 @@ pub async fn run_with_session(working_dir: PathBuf, resume: Option<crate::sessio
// classifier override the operator's deliberate choice.
initial_state.agent_pinned = true;
}
// Pearl th-20574a: thread the CLI's `--model` flag through to
// every TaskStart so bench harnesses (and any user who passes
// --model) actually get the requested model instead of silently
// falling back to smooth-coding's default alias.
if let Some(m) = model {
initial_state.model_override = Some(m);
}

let state = Arc::new(Mutex::new(initial_state));

Expand Down Expand Up @@ -1201,7 +1208,17 @@ async fn run_agent_streaming(message: &str, tx: mpsc::UnboundedSender<AgentEvent
out
};

let mut events = client.run_task(message, None, None, cwd.as_deref(), agent.as_deref(), prior_messages).await?;
// Pearl th-20574a: read the user's --model override from AppState
// so it actually reaches Big Smooth's routing layer. Was a literal
// `None` here; every TaskStart fell back to the smooth-coding alias
// regardless of CLI flag.
let model_override = {
let s = state.lock().unwrap_or_else(|e| e.into_inner());
s.model_override.clone()
};
let mut events = client
.run_task(message, model_override.as_deref(), None, cwd.as_deref(), agent.as_deref(), prior_messages)
.await?;

// Per-tool-name queues of (id, started_at, args). The runner emits
// a ToolCallStart, then the tool runs, then a ToolCallComplete —
Expand Down
12 changes: 12 additions & 0 deletions crates/smooth-code/src/state.rs
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,17 @@ pub struct AppState {
pub user_scrolled: bool,
/// Display name of the current LLM model.
pub model_name: String,
/// User-supplied model override from `th code --model <X>`. When
/// `Some(_)`, every `TaskStart` dispatched to Big Smooth carries
/// this value verbatim in its `model` field; Big Smooth's routing
/// then resolves it against the configured providers (a smooth-*
/// alias OR a concrete model id like `deepseek-v4-flash`). Pearl
/// `th-20574a` — before this field existed, the CLI's `--model`
/// flag was silently dropped on the TUI path and every run used
/// the default smooth-coding alias regardless of what the user
/// asked for. `None` (the default) preserves the legacy behavior
/// of letting Big Smooth pick.
pub model_override: Option<String>,
/// Active lead role name (`fixer` / `mapper` / `oracle` / `heckler`).
/// Flows into every `TaskStart` so the runner applies the right
/// clearance set; rendered on the status bar so the user can see
Expand Down Expand Up @@ -385,6 +396,7 @@ impl AppState {
scroll_offset: 0,
user_scrolled: false,
model_name: "claude-sonnet-4".to_string(),
model_override: None,
agent_name: "fixer".to_string(),
agent_pinned: false,
verbose: false,
Expand Down
20 changes: 20 additions & 0 deletions crates/smooth-operator/src/cast/prompts/fixer.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,23 @@
You are a general-purpose technical agent working inside a sandboxed workspace. You can write code, edit files, run shell commands, run tests, perform filesystem cleanup, investigate issues, answer questions — whatever the user actually asks for.

## Hard rule #0: DO THE THING THE USER ASKED FOR

This rule beats every other rule in this document.

The user's message — taken at face value, on its own merits — defines your task for this turn. Period.

- If the user says "delete __pycache__ dirs", your job is to delete __pycache__ dirs. NOT to run tests, NOT to look for failing tests, NOT to invent tests that aren't there.
- If the user says "tell me what this regex does", your job is to explain a regex. NOT to refactor it, NOT to write tests for it.
- If the user says "clean up the tmp/ directory", your job is filesystem cleanup. NOT a code-review of nearby files.

**Do NOT pivot.** Do NOT introduce a task the user didn't ask for. Do NOT say "I will now fix the remaining test failures" unless the user's literal request was to fix test failures. If you find yourself drafting a sentence like "I will now [do something the user didn't ask for]", stop and re-read the user's first message.

Test-running, test-fixing, and test-writing guidance appears further down in this prompt. Apply it ONLY when the user's request involves tests. For non-test tasks (cleanup, refactor, explanation, investigation, ops), skip the test sections entirely — they don't apply.

This rule exists because the model that runs this prompt has been observed to pattern-match the prompt's many test-related sections and pivot to "let me fix the tests" on tasks that have nothing to do with tests. That's a bug, not a feature. The user asked for something specific. Do that.

---

You are a coding agent working inside a sandboxed workspace. You can write code, edit files, run commands, and run tests. You can also just answer questions about code — analytical / advisory mode is fine when that's what the user is actually asking for.

## When the user's message starts with `## Skill: <name>`
Expand Down
78 changes: 78 additions & 0 deletions docs/Engineering/Big-Smooth-Direct-Mode.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# Big Smooth — Direct (host) vs Sandboxed (safehouse VM) mode

Big Smooth runs in one of two modes. Choose at `th up` time.

| | `th up` (sandboxed, default) | `th up direct` |
|---|---|---|
| **Boot time** | ~30s — boots a safehouse microVM + the in-VM cast | **~0.3s** — daemon starts directly on the host |
| **Isolation** | Strong — agent runs inside a microVM, safehouse mediates filesystem/network | None — agent is a host subprocess; tools execute against the host filesystem |
| **When to use** | Untrusted code, agent dispatches you don't fully control, CI runners that need defense in depth | Pre-trusted environments — dedicated devbox, CI runner you own, bench harnesses |
| **Idle timeout default** | 24 h (was 30 min — pearl `th-1b9b3e`) | 24 h |
| **Native runner needed?** | No — runner is baked into the safehouse OCI image | **Yes** — build with `cargo build --release -p smooai-smooth-operator-runner` and either auto-discovery picks it up from `~/.cargo/shared-target/release/smooth-operator-runner`, or you set `SMOOTH_OPERATOR_RUNNER_NATIVE=/abs/path/to/runner` before `th up direct` |

## Why this matters for parity with pi + opencode

Pi (`@earendil-works/pi-coding-agent`) and OpenCode (`opencode`) both boot in ~3s
and have no daemon model. Smooth's sandboxed default looked like a "30s boot,
sometimes crashes" agent against them. Direct mode is a near-100× boot speedup
that brings smooth into the same launch-time class as pi + opencode for
pre-trusted use cases (dev machines, bench harnesses).

## Smoke test

```bash
# Build the native runner once per checkout.
cargo build --release -p smooai-smooth-operator-runner

# Start in direct mode.
th down
SMOOTH_OPERATOR_RUNNER_NATIVE=~/.cargo/shared-target/release/smooth-operator-runner \
th up direct

# Confirm: th status should report healthy in under a second.
th status
```

The runner-bin auto-discovery has a paper-cut tracked under pearl `th-e74aa6`:
when the env var is unset the error message names the build command but doesn't
mention that auto-discovery from `~/.cargo/shared-target/release/` will work if
you've built it. Either approach gets you there.

## Bench harness usage

`smooth-bench` doesn't care which mode Big Smooth is in — both expose the same
HTTP API at `localhost:4400`. The `SmoothDriver` in
`crates/smooth-bench/src/agent_driver.rs` just spawns `th code` against the
running daemon. So:

```bash
# Sandboxed mode (default — slow boot, more isolation)
th up
cargo run -p smooai-smooth-bench -- score-cleanup --driver=smooth …

# Direct mode (fast boot, host trust)
th down
SMOOTH_OPERATOR_RUNNER_NATIVE=~/.cargo/shared-target/release/smooth-operator-runner th up direct
cargo run -p smooai-smooth-bench -- score-cleanup --driver=smooth …
```

Result JSON includes `dispatch="direct"` or `dispatch="sandboxed"` in the daemon
log (`~/.smooth/log/th.log`) so post-hoc you can tell which mode each result
came from.

## Recent bench numbers

`deepseek-v4-flash` via `llm.smoo.ai`, strict coach, 4 cleanup fixtures
(`cleanup-impossible-task`, `cleanup-pycache-debris`, `cleanup-disk-bloat`,
`cleanup-node-modules-orphans`):

| backend | aggregate | boot time | notes |
|---|---|---|---|
| mock | 1.000 | n/a | bash baseline |
| **pi** | **1.000** | ~3s | new reference high-water |
| opencode | ≥0.93 | ~3s | reliable on tested fixtures |
| **smooth-direct** | **0.850** | **~0.3s** | beats sandboxed; boots ~10× faster than pi |
| smooth-sandboxed | 0.789 | ~30s | run-to-run variance still present |

Pearls related: `th-0fc29f` (boot time, this doc closes it),
`th-1b9b3e` (idle timeout, closed), `th-6e361d` (pycache variance, open).