diff --git a/.changeset/cutover-published-engine-014-rehome-harness.md b/.changeset/cutover-published-engine-014-rehome-harness.md new file mode 100644 index 00000000..7f90703c --- /dev/null +++ b/.changeset/cutover-published-engine-014-rehome-harness.md @@ -0,0 +1,42 @@ +--- +'@smooai/smooth': patch +--- + +Cut smooth over to the published `smooai-smooth-operator-core` v0.14.0 (crates.io); re-home the th-code harness into smooth's own crates + +This is the final PR of the engine-decouple program (SMOODEV-1790, PR 4/4). The +engine `smooai-smooth-operator-core` is now published on crates.io at `0.14.0` — +a clean, GENERIC agent engine with the `th code` coding harness REMOVED. +Previously smooth depended on the engine via a git rev (`bb9a256`) that still +carried the harness, which is why it kept building. + +- **Engine dep switched to crates.io 0.14.0.** Root `Cargo.toml`: + `smooth-operator = { git = …, rev = "bb9a256…" }` → + `smooth-operator = { version = "0.14.0", package = "smooai-smooth-operator-core" }`. + The dep KEY stays `smooth-operator` so the `use smooth_operator::…` imports for + the generic engine API are unchanged. `Cargo.lock` now resolves the engine from + `registry+https://github.com/rust-lang/crates.io-index` (checksum-pinned), not a + git source — the git-rev bridge is gone. + +- **New `smooth-cast` crate** re-homes the bits the engine dropped, built on the + engine's generic public API (`Agent`/`ProviderRegistry`/`ToolRegistry`/generic + `Cast`/`OperatorRole`/`Clearance`): + - `coding_workflow` — the `th code` single-agent outer loop + (`run_coding_workflow`, `task_text_has_cleanup_intent`, …). + - `skills` — skill discovery (`discover`, `SkillScope`, `SkillSource`, `Skill`) + plus the built-in `create-skill` skill. + - `cast` — the four coding-harness cast roles the generic engine no longer ships + (`fixer`, `oracle`, `chief`, `intent_classifier`), and a `cast::builtin()` that + returns them on top of the engine's generic built-in roles. All moved tests came + with the code. + +- **Consumers repointed** to `smooth-cast`: `smooth-operative` (coding_workflow + + `fixer` role resolution), `smooth-code` (skills + `chief`/`intent_classifier` + routing), `smooth-cli` (skills + `--agent` role resolution), `smooth-bigsmooth` + (skills + session auto-naming). Every site that did `Cast::builtin().get("fixer"| + "oracle"|"chief"|"intent_classifier")` now uses `smooth_cast::cast::builtin()`. + +- The Big-Smooth reporter hooks the engine also dropped stay deleted — verified + zero smooth consumers (`with_reporter`/`BigSmoothReporter`/`ReporterEvent`/ + `report_to_bigsmooth`/the `bigsmooth` engine feature). smooth's own + `smooth-bigsmooth` gRPC crate is unrelated and untouched. diff --git a/Cargo.lock b/Cargo.lock index 51556d77..8b0d8cff 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6123,6 +6123,7 @@ dependencies = [ "serde_json", "smooai-smooth-archivist", "smooai-smooth-bootstrap-bill", + "smooai-smooth-cast", "smooai-smooth-code", "smooai-smooth-diver", "smooai-smooth-goalie", @@ -6168,6 +6169,21 @@ dependencies = [ "uuid", ] +[[package]] +name = "smooai-smooth-cast" +version = "0.13.7" +dependencies = [ + "anyhow", + "dirs-next", + "serde", + "serde_json", + "serde_yml", + "smooai-smooth-operator-core", + "tempfile", + "tokio", + "tracing", +] + [[package]] name = "smooai-smooth-cli" version = "0.13.7" @@ -6194,6 +6210,7 @@ dependencies = [ "smooai-smooth-bench", "smooai-smooth-bigsmooth", "smooai-smooth-bootstrap-bill", + "smooai-smooth-cast", "smooai-smooth-code", "smooai-smooth-diver", "smooai-smooth-operator-core", @@ -6232,6 +6249,7 @@ dependencies = [ "serde_json", "similar", "smooai-smooth-bigsmooth", + "smooai-smooth-cast", "smooai-smooth-narc", "smooai-smooth-operator-core", "smooai-smooth-pearls", @@ -6364,6 +6382,7 @@ dependencies = [ "serde", "serde_json", "similar", + "smooai-smooth-cast", "smooai-smooth-goalie", "smooai-smooth-narc", "smooai-smooth-operator-core", @@ -6381,8 +6400,9 @@ dependencies = [ [[package]] name = "smooai-smooth-operator-core" -version = "0.13.7" -source = "git+https://github.com/SmooAI/smooth-operator-core.git?rev=bb9a2565f0187fbd860240868c5775bd1205764d#bb9a2565f0187fbd860240868c5775bd1205764d" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60cac82deb3b84139783af4c3494717d184e1f9c5372945462191053c37dea39" dependencies = [ "anyhow", "async-trait", diff --git a/Cargo.toml b/Cargo.toml index e3a5f564..6ca5d467 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -178,18 +178,25 @@ prost-types = "0.13" # relevant to the bind-mount silent drop tracked in th-dd0cef). microsandbox = "0.4" -# Published agent engine (github.com/SmooAI/smooth-operator-core). +# Published agent engine (crates.io: smooai-smooth-operator-core). # SMOODEV-1787 (PR 1/4, dual-engine collapse): smooth consumes the public # `smooai-smooth-operator-core` engine instead of an in-tree copy. The in-tree # `crates/smooth-operator` copy was deleted. The dep KEY stays `smooth-operator` # and the lib is package-aliased back to `smooth_operator`, so the ~12 consumers' -# `use smooth_operator::…` imports keep working unchanged. +# `use smooth_operator::…` imports for the GENERIC engine API keep working +# unchanged. # -# Rev-pinned git dep (NOT a sibling path dep): a `path = "../smooth-operator-core/…"` -# form only resolves on a dev laptop and breaks every CI `cargo metadata` run — -# the exact failure SMOODEV-1464 hit with client-shared (see that dep below). -# Bump the rev when the engine changes. -smooth-operator = { git = "https://github.com/SmooAI/smooth-operator-core.git", rev = "bb9a2565f0187fbd860240868c5775bd1205764d", package = "smooai-smooth-operator-core" } +# SMOODEV-1790 (PR 4/4, final cutover): switched from the rev-pinned git dep on +# the OLD engine (rev bb9a256, which still carried the th-code harness) to the +# published crates.io release `0.14.0` — a clean GENERIC engine with the harness +# REMOVED. The harness bits the engine dropped (the coding workflow, skill +# discovery, and the fixer/oracle/chief/intent_classifier cast roles) now live +# in the smooth-owned `smooth-cast` crate below, built on the engine's generic +# public API. +smooth-operator = { version = "0.14.0", package = "smooai-smooth-operator-core" } +# smooth-owned coding-harness extensions to the generic engine (re-homed from +# the engine when it went generic at 0.14.0). See crates/smooth-cast. +smooth-cast = { version = "0.13.7", path = "crates/smooth-cast", package = "smooai-smooth-cast" } smooth-bigsmooth = { version = "0.13.7", path = "crates/smooth-bigsmooth", package = "smooai-smooth-bigsmooth" } smooth-policy = { path = "crates/smooth-policy", version = "0.13.7", package = "smooai-smooth-policy" } smooth-web = { version = "0.13.7", path = "crates/smooth-web", package = "smooai-smooth-web" } diff --git a/crates/smooth-bigsmooth/Cargo.toml b/crates/smooth-bigsmooth/Cargo.toml index 689b7dc2..f36b1a60 100644 --- a/crates/smooth-bigsmooth/Cargo.toml +++ b/crates/smooth-bigsmooth/Cargo.toml @@ -24,6 +24,8 @@ direct-sandbox = ["smooth-bootstrap-bill/server"] smooth-pearls.workspace = true smooth-bootstrap-bill = { workspace = true, default-features = false } smooth-operator.workspace = true +# skills discovery + the smooth cast roles (re-homed from the engine at 0.14.0) +smooth-cast.workspace = true smooth-code = { path = "../smooth-code", package = "smooai-smooth-code" } smooth-policy.workspace = true smooth-web = { path = "../smooth-web", package = "smooai-smooth-web" } diff --git a/crates/smooth-bigsmooth/src/chat_tools.rs b/crates/smooth-bigsmooth/src/chat_tools.rs index d4961439..64d84488 100644 --- a/crates/smooth-bigsmooth/src/chat_tools.rs +++ b/crates/smooth-bigsmooth/src/chat_tools.rs @@ -289,7 +289,7 @@ impl Tool for TeammateSpawnTool { "extra_prompt": { "type": "string", "description": "Optional extra instruction appended after the context_brief. Use this for fine-grained constraints (e.g. 'use the Rust 2021 edition', 'don't touch the migrations directory')." }, "budget_usd": { "type": "number", "description": "Optional cost cap in USD for this dispatch." }, "working_dir": { "type": "string", "description": "Working directory for the teammate's sandbox. Pass the most specific absolute path that scopes the work — e.g. for 'clone repo X to ~/dev/foo/X' pass `~/dev/foo`. Never pass a directory as broad as `~` or `/`; the runner can stall enumerating that much filesystem." }, - "role": { "type": "string", "description": "Optional cast role to spawn under (e.g. `fixer`, `mapper`, `oracle`, `heckler` — see smooth-operator/src/cast). Affects permissions, prompt, and routing slot." }, + "role": { "type": "string", "description": "Optional cast role to spawn under (e.g. `fixer`, `mapper`, `oracle`, `heckler` — resolved via smooth_cast::cast::builtin()). Affects permissions, prompt, and routing slot." }, "model": { "type": "string", "description": "DO NOT SET unless you have a specific reason. Default = role's slot (smooth-coding for `fixer`) which is the best balance of speed and tool-call reliability. Avoid `smooth-fast-gemini` — it can't reliably emit native tool calls and will wedge the runner. `smooth-reasoning` is for genuinely hard problems only." } } }), diff --git a/crates/smooth-bigsmooth/src/policy.rs b/crates/smooth-bigsmooth/src/policy.rs index b906272e..4ef66e08 100644 --- a/crates/smooth-bigsmooth/src/policy.rs +++ b/crates/smooth-bigsmooth/src/policy.rs @@ -603,8 +603,9 @@ fn registered_tool_names() -> Vec { } /// Read-only subset — what reasoning roles (oracle, mapper, heckler) get. -/// Must stay in sync with `read_only_tools()` in -/// `crates/smooth-operator/src/cast/mod.rs`. +/// Must stay in sync with `read_only_tools()` in the engine's +/// `cast/mod.rs` (mapper/heckler) and the smooth re-homed copy in +/// `crates/smooth-cast/src/cast.rs` (oracle). fn read_only_tool_names() -> Vec { vec![ "read_file".into(), diff --git a/crates/smooth-bigsmooth/src/server.rs b/crates/smooth-bigsmooth/src/server.rs index 4ae4c855..fc256419 100644 --- a/crates/smooth-bigsmooth/src/server.rs +++ b/crates/smooth-bigsmooth/src/server.rs @@ -3550,7 +3550,7 @@ fn extract_skill_allowed_hosts(message: &str, workspace: &str) -> Vec { return Vec::new(); } let workspace_path = std::path::PathBuf::from(workspace); - let skills = smooth_operator::skills::discover(&workspace_path); + let skills = smooth_cast::skills::discover(&workspace_path); let Some(skill) = skills.into_iter().find(|s| s.name == name) else { tracing::warn!(skill_name = name, "skill named in message header but not found in discovery — no pre-grant"); return Vec::new(); @@ -4570,7 +4570,7 @@ async fn post_chat_message_stream_handler( async fn auto_name_session(user_prompt: &str) -> Option { let providers_path = dirs_next::home_dir()?.join(".smooth/providers.json"); let registry = ProviderRegistry::load_from_file(&providers_path).ok()?; - let cast = smooth_operator::cast::Cast::builtin(); + let cast = smooth_cast::cast::builtin(); let agent = cast.get("tagger")?; let config = registry.llm_config_for(agent.slot).ok()?; let llm = smooth_operator::llm::LlmClient::new(config); diff --git a/crates/smooth-cast/Cargo.toml b/crates/smooth-cast/Cargo.toml new file mode 100644 index 00000000..b9a4e3e0 --- /dev/null +++ b/crates/smooth-cast/Cargo.toml @@ -0,0 +1,31 @@ +[package] +name = "smooai-smooth-cast" +version = "0.13.7" +edition.workspace = true +license.workspace = true +repository.workspace = true +description = "Smooth coding-harness extensions to the smooth-operator engine — the th-code coding workflow, skill discovery, and the harness-specific cast roles (fixer/oracle/chief/intent_classifier) that the published generic engine no longer ships." + +[lib] +name = "smooth_cast" +path = "src/lib.rs" + +[dependencies] +# The published generic engine. smooth-cast re-homes the coding-harness +# bits the engine dropped at 0.14.0 and builds the custom cast roles on +# the engine's generic Cast/OperatorRole/Clearance public API. +smooth-operator.workspace = true + +anyhow.workspace = true +serde = { workspace = true } +serde_json.workspace = true +serde_yml.workspace = true +dirs-next.workspace = true +tokio.workspace = true +tracing.workspace = true + +[dev-dependencies] +tempfile.workspace = true + +[lints] +workspace = true diff --git a/crates/smooth-cast/builtin-skills/create-skill/SKILL.md b/crates/smooth-cast/builtin-skills/create-skill/SKILL.md new file mode 100644 index 00000000..a2ebac25 --- /dev/null +++ b/crates/smooth-cast/builtin-skills/create-skill/SKILL.md @@ -0,0 +1,109 @@ +--- +name: create-skill +description: Author a new skill (SKILL.md) for Smooth. Asks clarifying questions, drafts the frontmatter + body, writes the file to the user's chosen location, and offers a test invocation. +triggers: + - make a skill + - create a skill + - add a skill + - save this as a skill + - new skill + - author a skill +scope: host +allowed_tools: + - read_file + - write_file + - edit_file + - list_files + - bash +--- + +# create-skill + +The user wants to add a reusable recipe to their Smooth setup. Your job: turn a description of what the recipe should do into a well-formed `SKILL.md` file at the right path. + +## Process + +### 1. Clarify (if needed) + +If the user's request is vague — e.g. "make a skill for git stuff" — ask ONE question to narrow it: + +- "What should this skill do specifically? Concrete steps help." + +Skip clarifying if the request is concrete enough on its own ("make a skill that adds a movie to my smoo-hub watchlist using the api at smoo-hub:8787" — that's actionable). + +### 2. Decide scope: project or user + +Ask if you don't know: + +- **Project scope** (`/.smooth/skills//SKILL.md`) — the skill is tied to this codebase. Other workspaces don't see it. Commit it to the repo so teammates get it too. +- **User scope** (`~/.smooth/skills//SKILL.md`) — the skill applies to every Smooth dispatch you ever do, in any workspace. Personal. + +Default to user scope if the user just says "save it" without specifying. + +### 3. Pick a name + +Lowercase, hyphenated, descriptive. `add-show`, `format-rust`, `sync-to-s3`. The directory name and the `name:` frontmatter must match. + +### 4. Determine the scope: sandbox or host + +- `sandbox` (default) — the skill runs inside the microVM. Use when the skill only touches `/workspace`, runs build/test commands, edits source code, or needs nothing outside the sandbox. +- `host` — the skill bypasses the microVM and runs in Big Smooth's process directly. Use ONLY for genuine host-needing cases: `scp` to a local-network host, `sips` / macOS-specific tools, AWS SSO browser flows, Photos.app integration. + +**Network alone is NEVER a reason for `host`.** Network access from the sandbox is handled by `allowed_hosts` below. + +### 5. Determine `allowed_hosts` + +If the skill needs to reach a host the default Wonk policy doesn't allow (`llm.smoo.ai` is the only default), list those hosts here. Examples: + +- `smoo-hub` — LAN/tailscale-only personal server +- `api.tvmaze.com` — public API +- `*.azureedge.net` — wildcard for a CDN family + +Be specific. Don't list `*` or "all"; users won't accept that grant. + +### 6. Determine `allowed_tools` + +Optional. If left empty, the skill inherits the agent's full toolset. Use to RESTRICT (not expand) — e.g. a read-only summarize skill might say `allowed_tools: [read_file, list_files, grep]`. + +### 7. Write the SKILL.md body + +The body is what the agent reads when the skill is invoked. Make it: + +- **Short.** 30–80 lines for most skills. A long skill that the model has to wade through is worse than no skill. +- **Step-by-step.** Numbered list of what to do, in order. The model will follow it literally. +- **Concrete on commands.** Show the exact `curl`, `bash`, or tool invocation. Not "make an API call" but `curl -X POST http://smoo-hub:8787/api/shows -H 'Content-Type: application/json' -d '{...}'`. +- **Explicit on inputs.** Name what the user will provide (title, status, etc.) and what defaults you'll assume when they're missing. + +Optional sections worth including: + +- `## Inputs` — what the user typically provides +- `## Outputs` — what the user will see / what gets created +- `## Failure modes` — what to do when X is missing, Y returns 404, etc. + +### 8. Write the file + +For project scope: +```bash +mkdir -p .smooth/skills/ +# write SKILL.md +``` + +For user scope: +```bash +mkdir -p ~/.smooth/skills/ +# write SKILL.md +``` + +Then run `th skills list` to confirm the skill is discovered. + +### 9. (Optional) Test it + +If the user wants, offer to invoke the skill once with a sample input. Just suggest the invocation phrasing — don't auto-invoke unless they ask. + +## Output + +When done, reply with ONE sentence: + +> "Created `` at ``. Run `th skills show ` to inspect or invoke by saying something matching: ``." + +That's it. No essay. The diff is the artifact; the sentence confirms it landed. diff --git a/crates/smooth-cast/src/cast.rs b/crates/smooth-cast/src/cast.rs new file mode 100644 index 00000000..2388141d --- /dev/null +++ b/crates/smooth-cast/src/cast.rs @@ -0,0 +1,179 @@ +//! # Smooth cast roles — the coding-harness roles the generic engine dropped +//! +//! The published `smooai-smooth-operator-core` engine (0.14.0) ships a +//! *generic* [`Cast`](smooth_operator::cast::Cast) populated with the +//! generic roles (`tagger`, `presser`, `recapper`, `mapper`, `heckler`, +//! `scout`, `runner`). It deliberately dropped the four coding-harness +//! roles that only the `th code` workflow used: +//! +//! - **`fixer`** — the default `th` coding experience: full tool access, +//! `Coding`-slot routing. [`crate::coding_workflow`] looks up its prompt +//! + slot by name. +//! - **`oracle`** — pure read-only reasoning (no bash, no mutation). +//! - **`chief`** — the Chief-of-Staff router that emits `DISPATCH: `. +//! - **`intent_classifier`** — the chat TUI's `WORK`/`QUESTION` router. +//! +//! This module rebuilds those four roles on the engine's public +//! [`OperatorRole`]/[`Clearance`]/[`RoleKind`] API and exposes +//! [`builtin()`], a drop-in replacement for `Cast::builtin()` that returns +//! the generic engine roles PLUS these four. Smooth call sites that used to +//! call `smooth_operator::Cast::builtin()` and then `.get("fixer")` (etc.) +//! now call [`smooth_cast::cast::builtin()`](builtin) instead. + +use smooth_operator::cast::{Cast, Clearance, OperatorRole, RoleKind}; +use smooth_operator::providers::Activity; + +/// System prompt for the `fixer` role. Public because +/// [`crate::coding_workflow`] documents that it resolves the coding system +/// prompt from this role by name (mirrors the old engine's +/// `cast::FIXER_PROMPT`). +pub const FIXER_PROMPT: &str = include_str!("prompts/fixer.txt"); +const ORACLE_PROMPT: &str = include_str!("prompts/oracle.txt"); +const CHIEF_PROMPT: &str = include_str!("prompts/chief.txt"); +const INTENT_CLASSIFIER_PROMPT: &str = include_str!("prompts/intent_classifier.txt"); + +/// Read-only tool set used by reasoning roles (`oracle`). Anything not in +/// this list is denied. Mirrors the engine's private `read_only_tools()` +/// helper — kept here because the harness `oracle` role needs the same +/// allowlist and the engine no longer exposes it. +fn read_only_tools() -> Vec { + vec![ + "read_file".into(), + "list_files".into(), + "grep".into(), + "glob".into(), + "lsp".into(), + "project_inspect".into(), + // Memory is metadata, not source code — even read-only + // reasoning roles can persist what they learn about the + // workspace to .smooth/MEMORY.md so a later session + // doesn't have to re-discover everything. + "read_memory".into(), + "write_memory".into(), + ] +} + +/// The four coding-harness [`OperatorRole`]s the generic engine dropped. +fn smooth_roles() -> Vec { + vec![ + // `intent_classifier` is the chat TUI's auto-router: given a + // single user message, emit literal "WORK" or "QUESTION" so + // the dispatcher knows whether to run under fixer (coding + // workflow) or oracle (read-only Q&A). Routes through the + // Fast slot so it adds milliseconds, not seconds. + OperatorRole { + name: "intent_classifier".into(), + kind: RoleKind::Shadow, + slot: Activity::Fast, + model_override: None, + prompt: INTENT_CLASSIFIER_PROMPT.trim().to_string(), + permissions: Clearance::deny_all(), + steps: None, + hidden: true, + }, + // `chief` is the Chief of Staff router. Reads the user message + // and emits `DISPATCH: ` naming one of the lead/sidekick + // roles. Routes through the Fast slot so adding it costs + // milliseconds, not seconds. Falls back to the heuristic + // ladder when chief is unavailable (no providers, gateway + // down) so dispatch never hangs. + OperatorRole { + name: "chief".into(), + kind: RoleKind::Shadow, + slot: Activity::Fast, + model_override: None, + prompt: CHIEF_PROMPT.trim().to_string(), + permissions: Clearance::deny_all(), + steps: None, + hidden: true, + }, + // `fixer` is the default `th` experience: full tool access, + // Coding-slot routing. Its prompt is the coding system prompt + // that `coding_workflow` resolves by name. + OperatorRole { + name: "fixer".into(), + kind: RoleKind::Lead, + slot: Activity::Coding, + model_override: None, + prompt: FIXER_PROMPT.trim().to_string(), + permissions: Clearance::default(), + steps: None, + hidden: false, + }, + // `oracle` is pure reasoning — no bash, no mutation. + OperatorRole { + name: "oracle".into(), + kind: RoleKind::Lead, + slot: Activity::Reasoning, + model_override: None, + prompt: ORACLE_PROMPT.trim().to_string(), + permissions: Clearance { + allow_tools: read_only_tools(), + deny_tools: vec![ + "edit_file".into(), + "write_file".into(), + "apply_patch".into(), + "bash".into(), + "bg_run".into(), + "http_fetch".into(), + ], + }, + steps: None, + hidden: false, + }, + ] +} + +/// Build a [`Cast`] populated with the engine's generic built-in roles +/// (`tagger`, `presser`, `recapper`, `mapper`, `heckler`, `scout`, +/// `runner`) PLUS the four smooth coding-harness roles (`fixer`, `oracle`, +/// `chief`, `intent_classifier`). +/// +/// Drop-in replacement for `smooth_operator::Cast::builtin()` for smooth +/// call sites that depend on the coding-harness roles being present. +pub fn builtin() -> Cast { + let mut cast = Cast::builtin(); + for role in smooth_roles() { + cast.register(role); + } + cast +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn builtin_registers_the_four_harness_roles() { + let cast = builtin(); + for name in ["fixer", "oracle", "chief", "intent_classifier"] { + assert!(cast.get(name).is_some(), "role '{name}' must be registered"); + } + } + + #[test] + fn builtin_keeps_the_generic_engine_roles() { + let cast = builtin(); + for name in ["tagger", "presser", "recapper", "mapper", "heckler", "scout", "runner"] { + assert!(cast.get(name).is_some(), "generic engine role '{name}' must survive"); + } + } + + #[test] + fn fixer_is_a_coding_lead_with_bash() { + let cast = builtin(); + let fixer = cast.get("fixer").expect("fixer registered"); + assert_eq!(fixer.kind, RoleKind::Lead); + assert!(matches!(fixer.slot, Activity::Coding)); + assert!(fixer.permissions.allows("bash"), "fixer must allow bash"); + } + + #[test] + fn oracle_is_read_only() { + let cast = builtin(); + let oracle = cast.get("oracle").expect("oracle registered"); + assert!(!oracle.permissions.allows("bash"), "oracle must deny bash"); + assert!(!oracle.permissions.allows("edit_file"), "oracle must deny edit_file"); + assert!(oracle.permissions.allows("read_file"), "oracle must allow read_file"); + } +} diff --git a/crates/smooth-cast/src/coding_workflow.rs b/crates/smooth-cast/src/coding_workflow.rs new file mode 100644 index 00000000..e7abda10 --- /dev/null +++ b/crates/smooth-cast/src/coding_workflow.rs @@ -0,0 +1,1958 @@ +//! Coding workflow — single-agent outer loop. +//! +//! The agent handles its own iteration (LLM → tool → LLM → …) +//! via `Agent::run_with_channel`. We sit around that and do three +//! things: +//! +//! 1. Snapshot the workspace when the failing-test count drops +//! — so a later turn can't regress past the best-seen state. +//! 2. On not-green, feed the test output back into the next +//! turn's prompt so the agent has surgical failure context. +//! 3. Stop when we're green, within a few failures of green +//! (more iteration is more likely to regress than improve), +//! over budget, or past the outer-iteration cap. +//! +//! We used to decompose into 7 phases (ASSESS / PLAN / EXECUTE / +//! VERIFY / REVIEW / TEST / FINALIZE). That added a lot of prompt +//! surface area and failure modes — the phase decomposition kept +//! silent-short-circuiting at one detector or another and eating +//! runs that should have kept going. A single-agent loop is +//! smaller, easier to reason about, and matches the shape of +//! tools like OpenCode that are maintained against coding +//! benchmarks. We kept the parts that demonstrably help — the +//! self-validation requirement in the system prompt, the +//! best-state snapshot, the compile-error short-circuit — and +//! dropped the per-phase dispatch. +//! +//! This module does NOT own the sandbox, the security hooks, or +//! the tool registry — the caller assembles those and hands them +//! in. + +use std::path::{Path, PathBuf}; +use std::sync::Arc; + +use anyhow::Context; +use tokio::sync::mpsc::UnboundedSender; + +use tokio::sync::mpsc::UnboundedReceiver; + +use crate::cast::builtin as cast_builtin; +use smooth_operator::agent::{Agent, AgentConfig, AgentEvent, InjectedMessage}; +use smooth_operator::cost::CostBudget; +use smooth_operator::providers::ProviderRegistry; +use smooth_operator::tool::ToolRegistry; + +/// Input to `run_coding_workflow`. +pub struct CodingWorkflowConfig { + /// Stable id for the operator running this workflow — echoed + /// into every AgentEvent. + pub operator_id: String, + /// The task prompt the user gave. + pub task_prompt: String, + /// Provider registry — used to resolve the Coding slot. + pub registry: Arc, + /// Tool registry the agent will use. + pub tools: ToolRegistry, + /// Optional global budget cap across the whole workflow. + pub budget_usd: Option, + /// Max outer-loop iterations. Each iteration is one full + /// `Agent::run_with_channel` call; the agent itself iterates + /// internally via tool calls. 5 is usually plenty — if the + /// agent can't converge in 5 full turns with failure context, + /// another turn is unlikely to help. + pub max_outer_iterations: u32, + /// Skip any post-implementation test-augmentation phase. + /// Kept in the config for API stability, currently ignored — + /// the single-agent loop doesn't have a separate TEST phase. + pub skip_test_phase: bool, + /// Event sink — every AgentEvent from the agent flows here. + pub tx: UnboundedSender, + /// Workspace root (bind-mounted at /workspace inside the + /// sandbox). Used to snapshot the best-seen state and restore + /// it on regression. `None` skips snapshotting. + pub workspace_root: Option, + /// Optional injection channel for mailbox messages — passed to every + /// inner Agent so steering/chat/answers from the lead reach a running + /// teammate without needing to restart the workflow. `None` keeps + /// the agent isolated (current behaviour for non-pearl-attached runs). + pub chat_rx: Option>>>, + /// Pearl th-e182bc: when the runner's caller detected cleanup + /// intent in the prior conversation (the README that started + /// the task), this carries that hint through to the workflow. + /// `build_user_prompt` uses it to apply the cleanup preamble + /// on CONTINUATION turns where the current `task_prompt` is a + /// bare confirmation ("yes, proceed") and would otherwise miss + /// the cleanup-intent detection. Pure additive: defaults false, + /// no behavior change for non-runner callers. + pub cleanup_intent_hint: bool, +} + +/// Run the workflow end-to-end. Returns the accumulated cost. +pub async fn run_coding_workflow(cfg: CodingWorkflowConfig) -> anyhow::Result { + // Pull the fixer role definition from the cast so the prompt + // lives in one place (`cast/prompts/fixer.txt`) and the slot + // comes from the role's `slot` field instead of being hard-coded + // here. The `fixer` role is always present in the smooth cast + // (`crate::cast::builtin()`) — the generic engine's `Cast::builtin()` + // dropped it at 0.14.0, so smooth re-homes it. If it ever isn't, + // something is badly wrong and we want a loud failure, not a silent + // fallback. + let cast = cast_builtin(); + let fixer_role = cast + .get("fixer") + .context("missing 'fixer' role in smooth cast — did smooth_cast::cast::builtin change?")?; + let code_prompt = fixer_role.prompt.clone(); + let code_slot = fixer_role.slot; + + let llm_config = cfg.registry.llm_config_for(code_slot).context("resolving coding slot → LLM config")?; + let coding_slot = cfg.registry.routing.slot_for(code_slot); + let alias = coding_slot.model.clone(); + + let mut total_cost_usd = 0.0_f64; + let mut total_prompt_tokens = 0u64; + let mut total_completion_tokens = 0u64; + let mut total_cached_tokens = 0u64; + let mut last_verify_output: Option = None; + let mut best_failed_count: Option = None; + let mut snapshot_taken = false; + // Pearl th-bench-loop iter 2: track NoEvidence retries. The + // agent's first turn often skips the test run entirely (saw + // 0 bash invocations in real bench runs). One retry with a + // forcing prompt that demands an explicit test invocation + // catches most of those before we give up. + let mut no_evidence_retries: u32 = 0; + const MAX_NO_EVIDENCE_RETRIES: u32 = 1; + + let iter_cap = cfg.max_outer_iterations.max(1); + let mut iteration = 0u32; + let mut succeeded = false; + + for _ in 0..iter_cap { + iteration += 1; + + let _ = cfg.tx.send(AgentEvent::PhaseStart { + phase: "CODING".into(), + alias: alias.clone(), + upstream: None, + iteration, + }); + + let user_prompt = build_user_prompt_with_hint(&cfg.task_prompt, iteration, last_verify_output.as_deref(), cfg.cleanup_intent_hint); + + // Inner iteration cap. Agent can take a lot of tool-call turns + // internally; default is 80 but `SMOOTH_WORKFLOW_AGENT_MAX_ITERATIONS` + // lets benchmark/diagnostic runs shorten the feedback loop. + let agent_max_iter: u32 = std::env::var("SMOOTH_WORKFLOW_AGENT_MAX_ITERATIONS") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(80); + let mut agent_config = + AgentConfig::new(format!("{}/coding-{}", cfg.operator_id, iteration), code_prompt.clone(), llm_config.clone()).with_max_iterations(agent_max_iter); + if let Some(rx) = cfg.chat_rx.clone() { + agent_config = agent_config.with_chat_rx(rx); + } + if let Some(cap) = cfg.budget_usd { + let remaining = (cap - total_cost_usd).max(0.0); + agent_config = agent_config.with_budget(CostBudget { + max_cost_usd: Some(remaining), + max_tokens: None, + }); + } + + let agent = Agent::new(agent_config, cfg.tools.clone()); + let mut conversation = agent.run_with_channel(user_prompt, cfg.tx.clone()).await?; + + let (turn_cost, turn_prompt_tokens, turn_completion_tokens, turn_cached_tokens) = { + let tracker = agent.cost_tracker.lock().expect("cost_tracker lock"); + ( + tracker.total_cost_usd, + tracker.total_prompt_tokens, + tracker.total_completion_tokens, + tracker.total_cached_tokens, + ) + }; + total_cost_usd += turn_cost; + total_prompt_tokens += turn_prompt_tokens; + total_completion_tokens += turn_completion_tokens; + total_cached_tokens += turn_cached_tokens; + + // Pull the agent's final assistant message — used for + // failure-context feedback into the next turn's prompt. + let transcript = summarize_conversation(&conversation); + last_verify_output = Some(transcript.clone()); + + // Pearl th-7cf405 / th-ed7bfa: trust evidence, not claims. + // The assistant's prose can fabricate "31 passed, 0 failed" + // without ever running a test; only believe a tool-result + // message produced by `bash` / `test_run`. + let evidence = verify_with_evidence(&conversation); + + // Pearl th-bf62c0 / th-bench-loop iter 9: if the conversation + // contains a compile-error tool output AND we still have + // iterations to spend, force one more turn with the compile-fix + // preamble REGARDLESS of the evidence verdict. The + // `detect_compile_error` short-circuit in `build_user_prompt` + // only fires when the workflow loops back; with `EvidencedPass` + // or unhandled `NoEvidence` paths the loop can exit on iter 1 + // even though the agent shipped uncompilable code. Catch that + // here before any break. + if iteration < iter_cap { + if let Some(_err) = detect_compile_error(&transcript) { + tracing::info!(iteration, "coding workflow: compile error in transcript — forcing one more iteration"); + last_verify_output = Some(transcript.clone()); + continue; + } + // Also scan the actual tool-result messages for compile + // errors. The transcript above is just the final assistant + // prose; the cargo/go/javac output lives in the tool-result + // messages and is what we actually want to feed back. + if let Some(err_chunk) = first_compile_error_in_tools(&conversation) { + tracing::info!(iteration, "coding workflow: compile error in tool output — forcing one more iteration"); + last_verify_output = Some(err_chunk); + continue; + } + } + + match evidence { + VerifyEvidence::EvidencedPass => { + succeeded = true; + tracing::info!(iteration, "coding workflow: tool evidence shows green, stopping"); + break; + } + VerifyEvidence::EvidencedFail(_) => { + // Stay in the loop and feed failure context forward. + } + VerifyEvidence::NoEvidence => { + // No bash / test_run ever ran this turn. Three + // possibilities: + // 1. The task didn't require code at all — pure + // THINK mode ("how would you do X"). No edits, + // no tests, just an answer. + // 2. The agent edited files but skipped tests + // (the dominant benchmark-dispatch failure + // mode, pearl th-bench-loop iter 2). + // 3. The task required code but the model gave + // up before doing either. + // + // Retry-with-forcing-prompt only helps case (2). + // For case (1) the forcing prompt is a non-sequitur + // ("you edited but never ran tests") and surfaces + // as a confusing redaction notice to the user. So + // check: if the agent didn't edit ANYTHING this + // turn either, treat it as THINK mode and exit + // cleanly without the retry. + // + // Pearl th-fixer-think-mode (user 2026-05-10): + // "fixer always hallucinates tests, he should be a + // thinker too" — this is the workflow half of that + // fix; the prompt half lives in fixer.txt. + let made_edits = conversation_made_edits(&conversation); + let did_destructive_bash = conversation_did_destructive_bash(&conversation); + let cleanup_intent = is_cleanup_intent(&cfg.task_prompt); + if !made_edits && !did_destructive_bash { + // Pearl `th-e93cba`: if the user asked for cleanup + // / ops (delete X, prune Y, remove debris), skip + // the "this is a code task, write code" reprompt + // entirely. That reprompt was designed for code + // benchmarks (aider-polyglot etc.) and on cleanup + // tasks it triggered the agent to fabricate tests + // and pivot to test-fix narrative even when the + // user clearly asked for filesystem operations. + if cleanup_intent { + tracing::info!( + iteration, + "coding workflow: cleanup intent detected in user prompt, no agent actions yet — exiting cleanly without 'this is a code task' reprompt" + ); + break; + } + // Pearl th-fc8a51: on the FIRST iteration with no + // edits AND no test runs, retry once with a strong + // forcing prompt before falling back to THINK mode. + // The original "exit immediately as THINK" path was + // designed for chat questions, but for dispatched + // code tasks an agent that just read the + // INSTRUCTIONS.md and returned without coding is a + // give-up, not a thinker. cpp/bank-account hit this + // on bench sweep b32wx055q: 23s, $0.0001, 0 edits, + // FAIL — when the same task with the same model + // SOLVED 17/17 on a focused rerun. + if iteration == 1 && no_evidence_retries < MAX_NO_EVIDENCE_RETRIES { + no_evidence_retries += 1; + tracing::info!( + iteration, + retry = no_evidence_retries, + "coding workflow: no edits + no tests on iter 1 — forcing one retry before THINK-mode exit" + ); + last_verify_output = Some( + "Your previous turn made no edits to any source file. This is a code task — you need to actually implement the solution. Read the source files (the stub plus the test file), then use edit_file or bash to write the implementation, then run the project's test command via `bash`. Do not return until you've at least attempted both.".to_string(), + ); + continue; + } + tracing::info!( + iteration, + "coding workflow: no test-run evidence AND no edits — treating as THINK mode, exiting cleanly" + ); + break; + } + // Pearl `th-e93cba`: when the agent did destructive + // ops via `bash` (rm -rf, find -delete, etc.) but + // DIDN'T also edit source files, this was a cleanup + // task — `rm -rf __pycache__` doesn't need test + // verification. Exit cleanly instead of reprompting + // with "you didn't run tests", which made the agent + // fabricate test files and pivot to test-fix narrative + // on cleanup-pycache-debris and similar fixtures. + if did_destructive_bash && !made_edits { + tracing::info!( + iteration, + "coding workflow: destructive bash ops without source edits — cleanup task, exiting cleanly without test-forcing reprompt" + ); + break; + } + // Pearl `th-e93cba`: skip the "run the test suite" + // reprompt on cleanup-intent tasks too. Even when the + // agent makes incidental `edit_file` calls during a + // cleanup (e.g., updating a .gitignore), the workflow + // shouldn't force test runs that don't apply. + if cleanup_intent { + tracing::info!( + iteration, + "coding workflow: cleanup intent detected — exiting cleanly without 'run the test suite' reprompt" + ); + break; + } + if no_evidence_retries < MAX_NO_EVIDENCE_RETRIES { + no_evidence_retries += 1; + tracing::info!( + iteration, + retry = no_evidence_retries, + "coding workflow: no test-run evidence — re-prompting with forcing directive" + ); + last_verify_output = Some( + "Your previous turn edited the code but never ran the test suite. Before doing anything else this turn, run the project's test command via `bash` (cargo test / pytest / pnpm test / etc.) and report the actual output. The implementation is unverified until you do.".to_string(), + ); + continue; + } + tracing::info!(iteration, "coding workflow: no test-run evidence after retry, exiting"); + if detect_verify_pass(&transcript) { + // Pearl iter-10/11: the assistant claimed pass + // without evidence. Three actions: + // + // 1. tracing::warn for log retention. + // 2. [cast-summary] stderr line — surfaced + // by the runner stderr forward when + // /verbose is on. + // 3. APPEND a TokenDelta to the live event + // stream so the user sees the correction + // INLINE in the streamed chat. The + // streaming tokens already shipped — we + // can't unsay them — but we can append a + // correction the user sees alongside. + // 4. Mutate `conversation.messages` so saved + // sessions don't preserve the lie either. + tracing::warn!(iteration, "coding workflow: assistant claimed pass with NO tool evidence — likely hallucinated"); + eprintln!("[cast-summary] WARNING: assistant claimed test pass without evidence — no `bash` / `test_run` tool actually ran this turn."); + let correction = "\n\n---\n\n⚠️ **Correction:** the agent's `## Test Results` claim above is unverified — no `bash` / `test_run` tool actually ran this turn. The change above may be correct on its own merits but was not validated by the test suite. Run the tests yourself before trusting the result.\n"; + let _ = cfg.tx.send(AgentEvent::TokenDelta { content: correction.into() }); + redact_hallucinated_test_claims(&mut conversation); + } + break; + } + } + + // Snapshot the workspace when this turn was the best so + // far. If the agent never reports a count, we still snap + // the first turn so a later regression has something to + // restore to. + let current_failed = extract_failed_count(&transcript); + let improved = match (current_failed, best_failed_count) { + (Some(now), Some(best)) => now < best, + (Some(_), None) => true, + (None, _) if !snapshot_taken => true, // first turn, unknown count + _ => false, + }; + if improved { + if let Some(ref ws) = cfg.workspace_root { + match snapshot_workspace(ws, &best_snapshot_dir(ws)) { + Ok(()) => { + snapshot_taken = true; + if let Some(now) = current_failed { + best_failed_count = Some(now); + } + tracing::info!(iteration, failed = current_failed, "coding workflow: snapshotted best-seen workspace"); + } + Err(e) => tracing::warn!(error = %e, "coding workflow: snapshot failed"), + } + } + } + + // Close-to-green stop. When we've seen a turn at ≤3 failures + // and this turn didn't improve on it, another cycle is more + // likely to regress than close the gap. + if let Some(best) = best_failed_count { + if best <= CLOSE_TO_GREEN_THRESHOLD && !improved { + tracing::info!(iteration, best_failed = best, "coding workflow: close to green, stopping before regression"); + break; + } + } + + // Budget check: next turn would blow the cap. + if let Some(cap) = cfg.budget_usd { + if cap > 0.0 && total_cost_usd > 0.0 { + let per_iter = total_cost_usd / f64::from(iteration); + if total_cost_usd + per_iter >= cap { + tracing::info!(spent = total_cost_usd, cap, "coding workflow: budget exhausted"); + break; + } + } + } + } + + // Restore the best-seen workspace if a later turn regressed. + if !succeeded { + if let (Some(ref ws), Some(best), true) = (&cfg.workspace_root, best_failed_count, snapshot_taken) { + let final_failed = extract_failed_count(last_verify_output.as_deref().unwrap_or("")); + let regressed = final_failed.is_some_and(|n| n > best); + let snap = best_snapshot_dir(ws); + if regressed && snap.is_dir() { + match restore_workspace(&snap, ws) { + Ok(()) => tracing::info!(best_failed = best, "coding workflow: restored workspace to best-seen state"), + Err(e) => tracing::warn!(error = %e, "coding workflow: restore failed"), + } + } + } + } + + // Remove the snapshot so it doesn't leak into the scorer's + // view of the workspace or a follow-up run on the same dir. + if let Some(ref ws) = cfg.workspace_root { + let snap = best_snapshot_dir(ws); + if snap.is_dir() { + let _ = std::fs::remove_dir_all(&snap); + } + } + + let _ = cfg.tx.send(AgentEvent::Completed { + agent_id: cfg.operator_id.clone(), + iterations: iteration, + cost_usd: total_cost_usd, + prompt_tokens: total_prompt_tokens, + completion_tokens: total_completion_tokens, + cached_tokens: total_cached_tokens, + }); + + Ok(total_cost_usd) +} + +/// Stop escalating when we're this close to green — more +/// iteration is more likely to regress than close the gap. +const CLOSE_TO_GREEN_THRESHOLD: u32 = 3; + +// The coding system prompt lives in `crates/smooth-cast/src/prompts/fixer.txt` +// and is loaded by the smooth cast (`crate::cast::builtin()`) via `include_str!`. +// The workflow resolves it at the top of `run_coding_workflow` so adding a +// new prompt-aware role there gives all call sites the same text. + +/// Build the user-message prompt for a given outer iteration. +/// +/// Pearl iter-7 finding: the iteration-1 prompt used to ALWAYS append +/// "Implement the solution, run the test suite, and iterate until +/// green." That framing actively pushed the model toward green-field +/// implementation regardless of what the user actually asked. "Make +/// App.tsx better" became "Make App.tsx better // Implement the +/// solution // iterate until green" → agent rewrote the whole file, +/// added main.tsx, overwrote tsconfig.json. Same shape on "delete the +/// src directory" → agent deleted, then re-implemented. +/// +/// Now the iteration-1 prompt is the user's task verbatim. The fixer +/// system prompt already covers the "run the test suite before final +/// summary" discipline; we don't need to re-state it per turn at the +/// cost of confusing the model on non-test-driven tasks. +// Only the test suite calls the 3-arg convenience wrapper; the runtime path +// always goes through `build_user_prompt_with_hint` directly. +#[cfg(test)] +fn build_user_prompt(task: &str, iteration: u32, prior_output: Option<&str>) -> String { + build_user_prompt_with_hint(task, iteration, prior_output, false) +} + +#[allow(clippy::fn_params_excessive_bools)] // 1 bool + 1 u32 + 2 strs is fine +fn build_user_prompt_with_hint(task: &str, iteration: u32, prior_output: Option<&str>, cleanup_intent_hint: bool) -> String { + if iteration == 1 { + // Pearl th-e182bc: continuation-turn confirmation on a task + // the runner's caller flagged as cleanup-intent. Re-applies + // the (known-good) cleanup preamble so the agent doesn't + // pivot to test-fix or fabricate a wholly new task on + // turn 2. Cross-fixture confabulation root cause + // (e.g. `find -size +150k -delete` misfired on a + // node-modules orphan task) is the SAME failure mode + // [`is_cleanup_intent`] addresses on the planning turn. + if cleanup_intent_hint && is_confirmation_reply(task) { + return format!( + "[bench/workflow note: this is a FILESYSTEM CLEANUP task, not a code-fix or test-fix task. Do NOT write source files. Do NOT create test files. Do NOT run tests. The fixer system prompt's test-related guidance does NOT apply here.\n\nIgnore any source files (`*.py`, `*.rs`, `*.ts`, `main.*`, `lib.*`, etc.) you see in the workspace unless the user's request below explicitly mentions them — they are PROBABLY scope-discipline traps (files you must NOT delete), not invitations to start coding or running tests. Treat the user's request text as the sole source of truth for what to do.\n\nThe user is confirming a plan you enumerated in a PRIOR assistant turn — find that plan in the conversation history and execute it via `bash`. Pearl `th-e182bc`.]\n\n{task}" + ); + } + // Pearl `th-e93cba` round 2: when the user's prompt looks like + // a filesystem cleanup task, prepend an explicit context-setter. + // Without it, the model — even with the workflow-level + // intent-detection gate — would pattern-match on fixer.txt's + // heavy test-related guidance and fabricate a test-fix + // narrative ("I added a test file src/pkg/test_util.py and + // the tests passed") on a cleanup ask. The bare prompt isn't + // strong enough counter-pressure; this directly tells the + // model what kind of task this is and which fixer guidance + // doesn't apply. + if is_cleanup_intent(task) { + return format!( + "[bench/workflow note: this is a FILESYSTEM CLEANUP task, not a code-fix or test-fix task. Do NOT write source files. Do NOT create test files. Do NOT run tests. The fixer system prompt's test-related guidance does NOT apply here.\n\nIgnore any source files (`*.py`, `*.rs`, `*.ts`, `main.*`, `lib.*`, etc.) you see in the workspace unless the user's request below explicitly mentions them — they are PROBABLY scope-discipline traps (files you must NOT delete), not invitations to start coding or running tests. Treat the user's request text as the sole source of truth for what to do.\n\nJust discover the targets named in the user's request, enumerate them in your text response, ask for confirmation, then delete them via `bash` once approved. Pearl `th-81cd84`.]\n\n{task}" + ); + } + return task.to_string(); + } + let prior = prior_output.unwrap_or("(no prior output)"); + // Pearl th-bench-loop iter 2: the NoEvidence retry path + // injects a synthetic "you didn't run tests" message into + // prior_output. When we see that exact preamble, frame the + // next turn as a verification-only nudge instead of the + // standard fix-the-failures preamble — there were no + // failures captured because no test ever ran. + if prior.starts_with("Your previous turn edited the code but never ran the test suite.") { + return format!("{prior}\n\n## Task (reminder)\n\n{task}"); + } + let compile_err = detect_compile_error(prior); + let preamble = if let Some(err) = compile_err { + format!( + "Your previous attempt shipped code that does not compile / parse. Before doing anything else, fix the syntax. The usual cause is a duplicated class body or extra content appended after the module's export. \n\n## Compile error\n\n{err}\n\n" + ) + } else { + format!( + "Your previous attempt left some tests failing. The output from your last test run is below. Keep every test that's currently passing passing — most test regressions come from rewriting code that was working. Make a targeted patch that closes the specific failures.\n\n## Previous test output (truncated)\n\n{}\n\n", + prior.chars().take(3000).collect::() + ) + }; + format!("{preamble}## Task (reminder)\n\n{task}\n\nFix the remaining failures and re-run the tests. Finish with a `## Test Results` line.") +} + +// --------------------------------------------------------------------------- +// Helpers: test-result parsing, compile-error detection, snapshots. +// These are the same helpers the old multi-phase workflow used; +// they carry their own unit tests below and don't care whether +// the surrounding loop is one phase or seven. +// --------------------------------------------------------------------------- + +fn summarize_conversation(conv: &smooth_operator::conversation::Conversation) -> String { + conv.messages + .iter() + .rev() + .find(|m| matches!(m.role, smooth_operator::conversation::Role::Assistant)) + .map(|m| m.content.clone()) + .unwrap_or_default() +} + +/// What the evidence in the conversation says about this turn — +/// not what the assistant *claims*. Pearl th-7cf405 / th-ed7bfa: +/// the workflow used to trust the assistant's `## Test Results: 31 +/// passed, 0 failed` line verbatim, which made hallucinated +/// success indistinguishable from real success. We now require an +/// actual `bash` / `test_run` tool-result message in the +/// conversation whose output contains a recognizable test summary. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum VerifyEvidence { + /// A tool actually ran and reported a green test suite. + EvidencedPass, + /// A tool actually ran and reported failures. `Some(n)` if a + /// failure count was parseable; `None` if the output looked + /// red but didn't include a count we could extract. + EvidencedFail(Option), + /// No bash / test_run tool call ever happened in this turn. + /// The assistant either did nothing (silently passed text + /// back) or hallucinated a result it never observed. Both are + /// "no work was actually done" — caller decides whether to + /// retry or exit gracefully. + NoEvidence, +} + +/// Strip fabricated "X passed, Y failed" / "ALL TESTS PASS" +/// claims from the last assistant message and replace with an +/// honest annotation. Pearl iter-10: emitting a stderr WARNING +/// alone wasn't enough — the lie still appeared verbatim in the +/// chat, so users could miss the warning and trust the false +/// claim. This rewrites the message itself. +/// +/// Heuristic: look for the conventional `## Test Results` / +/// `Test Results` block at the end of the assistant prose and +/// replace its body. Also strip standalone count lines like +/// "31 passed, 0 failed" / "test result: ok. 5 passed; 0 failed". +pub fn redact_hallucinated_test_claims(conv: &mut smooth_operator::conversation::Conversation) { + // Find the last assistant message — that's where the user- + // visible final answer sits. + let Some(msg) = conv + .messages + .iter_mut() + .rev() + .find(|m| matches!(m.role, smooth_operator::conversation::Role::Assistant)) + else { + return; + }; + msg.content = redact_fabricated_test_results(&msg.content); +} + +/// String-only version of the redactor — pulled out for tests. +/// Pure function so the unit suite can pin every shape we know +/// the model produces. +#[must_use] +pub fn redact_fabricated_test_results(content: &str) -> String { + const NOTICE: &str = "⚠️ Test Results: NOT RUN — the agent did not actually execute the test suite this turn. The change above may be correct but is unverified. Run the tests yourself before trusting it."; + + // Strip "X passed, Y failed" / "X passed; Y failed" lines and + // replace the "## Test Results" block at the tail. Patterns: + // - "## Test Results\n\n31 passed, 0 failed" + // - "## Test Results\n\nALL TESTS PASS" + // - "Test Results: 31 passed, 0 failed" + // - bare "31 passed, 0 failed" line at end of content + let lines: Vec<&str> = content.lines().collect(); + let mut out: Vec = Vec::with_capacity(lines.len() + 2); + let mut in_test_results_block = false; + let mut redacted_block = false; + for line in &lines { + let trimmed = line.trim(); + // Heading variants. + let is_heading = + trimmed.eq_ignore_ascii_case("## test results") || trimmed.eq_ignore_ascii_case("test results") || trimmed.eq_ignore_ascii_case("# test results"); + if is_heading && !redacted_block { + in_test_results_block = true; + out.push(NOTICE.to_string()); + redacted_block = true; + continue; + } + if in_test_results_block { + // Continue swallowing lines until a new heading + // (`## ...`) starts a different section. + if trimmed.starts_with("## ") || trimmed.starts_with("# ") { + in_test_results_block = false; + out.push((*line).to_string()); + continue; + } + // Drop content inside the block. + continue; + } + // Bare "X passed, Y failed" / "X passed; Y failed" / "ALL TESTS PASS" lines. + let upper = trimmed.to_ascii_uppercase(); + let looks_like_count = (trimmed.contains("passed, ") || trimmed.contains("passed; ") || trimmed.contains("PASSED, ") || trimmed.contains("PASSED; ")) + && (trimmed.contains("failed") || trimmed.contains("FAILED")); + let looks_like_marker = upper.contains("ALL TESTS PASS") || upper == "TEST RESULT: OK"; + if looks_like_count || looks_like_marker { + // Replace with a one-line redaction marker. Append + // the full notice once if we haven't already (e.g. + // when there's no "## Test Results" heading). + if !redacted_block { + out.push(NOTICE.to_string()); + redacted_block = true; + } + continue; + } + out.push((*line).to_string()); + } + let result = out.join("\n"); + // Edge case: content didn't have a heading or count line + // pattern but still looked green to detect_verify_pass (rare; + // happens when the model uses idiomatic phrasing like "all + // tests pass" embedded in prose). In that case, append the + // notice at the end so the reader at least sees the warning. + if !redacted_block && detect_verify_pass(content) { + return format!("{result}\n\n{NOTICE}"); + } + result +} + +/// Inspect the conversation for tool-result evidence of test +/// outcomes. Walks tool-role messages in order and returns the +/// LAST shaped result — later tool runs win, since the agent +/// often runs the suite multiple times in one turn. +pub fn verify_with_evidence(conv: &smooth_operator::conversation::Conversation) -> VerifyEvidence { + let mut last_outcome = VerifyEvidence::NoEvidence; + for msg in &conv.messages { + if !matches!(msg.role, smooth_operator::conversation::Role::Tool) { + continue; + } + // Only test-shaped tools produce evidence we believe. + // `bash` is the catch-all (the agent runs `pnpm test` / + // `cargo test` / `pytest` through it). `test_run` is the + // workflow's structured test tool when present. Other + // tool outputs (read_file, list_files, grep) don't count + // even if the user happened to grep for "PASS" somewhere. + let name = msg.tool_name.as_deref().unwrap_or(""); + if name != "bash" && name != "test_run" && name != "shell" { + continue; + } + // Pearl th-bench-loop iter 13: "all tests skipped, 0 ran" + // is NOT a pass. Exercism JS uses `xtest()` and Java uses + // `@Disabled`; both default to skip and require the student + // to flip annotations as they implement. The agent ships + // implementations that look correct, the test runner returns + // 0 ran/0 failed (exit code 0), and the workflow used to + // call that a pass. It's not — nothing actually ran. + // + // Detect BEFORE detect_verify_pass since the skip-only + // output may also coincidentally match "0 failed" patterns. + if looks_all_skipped(&msg.content) { + last_outcome = VerifyEvidence::EvidencedFail(None); + continue; + } + if detect_verify_pass(&msg.content) { + last_outcome = VerifyEvidence::EvidencedPass; + continue; + } + // Look for explicit failure shapes. We reuse + // `nonzero_failure_count` so all the same patterns the + // pass-detection guards against count as fail signals. + let upper = msg.content.to_uppercase(); + let looks_red = + upper.contains("TEST RESULT: FAILED") || upper.contains("TESTS FAILED") || upper.contains("TESTS FAIL") || nonzero_failure_count(&upper); + if looks_red { + last_outcome = VerifyEvidence::EvidencedFail(extract_failed_count(&msg.content)); + } + // Otherwise leave last_outcome as-is — this tool call + // wasn't a test, or didn't produce a recognizable summary. + } + last_outcome +} + +/// True when test output indicates EVERY test was skipped — common +/// when an exercism framework defaults to `@Disabled` / `xtest()` / +/// `test.skip` and the student hasn't flipped them yet. Treat as +/// failure-no-evidence (pearl th-bench-loop iter 13): 0 tests +/// actually ran, the implementation is unverified. +/// +/// Heuristics (all case-insensitive on uppercase input): +/// - Jest: "Tests: N skipped, 0 passed, N total" +/// - Gradle/JUnit: "BUILD SUCCESSFUL" + "N tests completed, N skipped" +/// OR all "SKIPPED" markers with no "PASSED" / "FAILED" lines +/// - pytest: "N skipped" alongside "0 passed" +/// - go test: "ok ... [no tests to run]" (Go has no skip annotation +/// by default, but the no-tests case is the same problem) +pub fn looks_all_skipped(transcript: &str) -> bool { + let upper = transcript.to_uppercase(); + + // Gradle/JUnit: count of SKIPPED markers as inline test + // outcomes. Check FIRST because gradle lines don't have a + // numeric prefix the pytest-shape path would expect. + let skipped_lines = upper.lines().filter(|l| l.trim_end().ends_with("SKIPPED")).count(); + let pass_lines = upper.lines().filter(|l| l.trim_end().ends_with("PASSED")).count(); + let fail_lines = upper.lines().filter(|l| l.trim_end().ends_with("FAILED")).count(); + if skipped_lines >= 3 && pass_lines == 0 && fail_lines == 0 { + return true; + } + // Dominant-skip: per-line gradle/jest output where SKIPPED + // outnumbers PASSED 3-to-1 and no failures fired. Pearl + // th-bench-loop iter 15: js/forth produced "48 skipped, 1 + // passed" — the pure all-skipped check missed it because + // there was a single PASSED. Same root cause as iter 5 + // js/binary (9 skipped, 1 passed): exercism flips one + // baseline test as a sentinel, leaves the rest skipped. + if skipped_lines >= 3 * (pass_lines + 1) && fail_lines == 0 && pass_lines < skipped_lines { + return true; + } + + // Jest / pytest shape: explicit "N skipped, 0 passed". + if (upper.contains("0 PASSED") || upper.contains(" 0 PASSED,") || upper.contains(", 0 PASSED")) && upper.contains("SKIPPED") { + return true; + } + // Jest summary line: "N skipped, K passed, M total" where + // N >> K. Catches the summary-line variant we see in iter + // 15 ("Tests: 48 skipped, 1 passed, 49 total"). + if let Some((skip, pass)) = parse_jest_skip_pass(&upper) { + if skip >= 3 * (pass + 1) && pass < skip { + return true; + } + } + + // Go: "no tests to run" + ok status. + if upper.contains("[NO TESTS TO RUN]") { + return true; + } + + // Pytest: "N skipped" with no "passed" count at all. Last + // because the digit-prefix check is strict — wouldn't catch + // gradle's per-line shape, only pytest's summary count. + if upper.contains(" SKIPPED") && !upper.contains(" PASSED") && !upper.contains(" FAILED") { + return has_count_before(&upper, "SKIPPED"); + } + + false +} + +/// Parse a jest-style summary line `Tests: 48 skipped, 1 passed, +/// 49 total` into `(skipped, passed)`. Returns `None` when neither +/// count is present. +fn parse_jest_skip_pass(upper: &str) -> Option<(u32, u32)> { + let line = upper.lines().find(|l| l.contains("TESTS:") && l.contains("SKIPPED"))?; + let skip = scan_count(&line.to_lowercase(), "skipped")?; + let pass = scan_count(&line.to_lowercase(), "passed").unwrap_or(0); + Some((skip, pass)) +} + +/// True when `needle` is preceded by a digit (possibly with +/// whitespace) somewhere in `haystack`. Used by `looks_all_skipped` +/// to distinguish a count line (`10 SKIPPED`) from a comment +/// ("# this section is skipped"). +fn has_count_before(haystack: &str, needle: &str) -> bool { + let mut search = haystack; + while let Some(idx) = search.find(needle) { + let before = &search[..idx]; + let digits: String = before + .chars() + .rev() + .skip_while(|c| c.is_whitespace()) + .take_while(|c| c.is_ascii_digit()) + .collect::(); + if let Ok(n) = digits.chars().rev().collect::().parse::() { + if n > 0 { + return true; + } + } + search = &search[idx + needle.len()..]; + } + false +} + +/// True when the transcript reports the test suite is green. +/// Explicit prefix (`ALL TESTS PASS`) wins; runner-summary +/// fallbacks are narrow to avoid false positives on prose or +/// on Rust `Ok(..)` values that appear in failure diffs. +pub fn detect_verify_pass(transcript: &str) -> bool { + let upper = transcript.to_uppercase(); + if upper.contains("ALL TESTS PASS") { + return true; + } + if upper.contains("TESTS FAILED") || upper.contains("TESTS FAIL") { + return false; + } + if nonzero_failure_count(&upper) || upper.contains("TEST RESULT: FAILED") { + return false; + } + if upper.contains("TEST RESULT: OK") // cargo test + || upper.contains(" PASSED, 0 FAILED") // pytest / go / jest + || upper.contains("0 FAILED, 0 ERRORS") // go test verbose + || (upper.contains("TESTS:") && upper.contains(" PASSED") && upper.contains("0 FAILED")) + { + return true; + } + // pytest -q (quiet mode): output is just dots/letters then a + // terminal line like "15 passed in 0.05s" — no "failed" word + // at all. Earlier guards already rejected anything with a + // non-zero failure count, so seeing "N passed in