heurema · t3chn · May 25, 2026 · May 25, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -60,6 +60,9 @@ jobs:
       - name: Run hardening check
         run: make hardening-check
 
+      - name: Run local agent runner smoke
+        run: make run-smoke
+
       - name: Run leak check
         run: make leak-check
 

diff --git a/Makefile b/Makefile
@@ -1,7 +1,7 @@
 PYTHON ?= python3
 PYTHONPATH ?= src
 
-.PHONY: validate list smoke compare-smoke if01-smoke data01-smoke doc01-smoke sup01-smoke api01-smoke lifecycle-check mutation-smoke hardening-check leak-check test
+.PHONY: validate list smoke compare-smoke if01-smoke data01-smoke doc01-smoke sup01-smoke api01-smoke lifecycle-check mutation-smoke hardening-check run-smoke leak-check test
 
 validate:
 	PYTHONPATH=$(PYTHONPATH) $(PYTHON) -m agent_bench_lab.cli validate
@@ -67,6 +67,11 @@ mutation-smoke:
 hardening-check:
 	$(PYTHON) scripts/check_hardening_gates.py
 
+# Smoke-test the local agent runner: run the mock agent command against case_001 of
+# IF-01 and DATA-01, writing each run under runs/manual/mock/, then compare that
+# directory against itself and write the report to reports/generated/compare_run_smoke.md.
+run-smoke:
+	PYTHONPATH=$(PYTHONPATH) $(PYTHON) -m agent_bench_lab.cli run --task IF-01 --case case_001 --agent-cmd "$(PYTHON) scripts/mock_agent_write_artifacts.py" --out runs/manual/mock/IF-01_case_001
+	PYTHONPATH=$(PYTHONPATH) $(PYTHON) -m agent_bench_lab.cli run --task DATA-01 --case case_001 --agent-cmd "$(PYTHON) scripts/mock_agent_write_artifacts.py" --out runs/manual/mock/DATA-01_case_001
+	PYTHONPATH=$(PYTHONPATH) $(PYTHON) -m agent_bench_lab.cli compare --baseline runs/manual/mock --candidate runs/manual/mock --out reports/generated/compare_run_smoke.md
+
 leak-check:
 	$(PYTHON) scripts/public_leak_check.py .
 

diff --git a/README.md b/README.md
@@ -95,6 +95,7 @@ This repository is a **v0 public starter**. It contains:
 - minimal Python CLI scaffolding;
 - sample public fixtures;
 - sample scorers plus hardened IF-01, DATA-01, DOC-01, SUP-01, and API-01 artifact/state-based scorers;
+- a local command-based runner for external agent setups;
 - documentation for benchmark design, metrics, anti-overfitting, lifecycle status, hardening gates, and research radar process.
 
 It intentionally does **not** contain private holdout tasks, production secrets, personal data, or benchmark answers for real evaluation runs.
@@ -176,6 +177,24 @@ make leak-check
 
 The examples directory intentionally starts mostly empty. Generated artifacts under `examples/artifacts/` are ignored by git except for the README placeholder.
 
+## Run an external agent setup
+
+Use `agent-bench run` to hand an agent-visible task packet to any local command and score the artifacts it writes:
+
+```bash
+agent-bench run \
+  --task IF-01 \
+  --case case_001 \
+  --agent-cmd "python3 scripts/mock_agent_write_artifacts.py" \
+  --out runs/manual/mock/IF-01_case_001
+```
+
+The command receives several `AGENT_BENCH_*` environment variables, including `AGENT_BENCH_TASK_PACKET` and `AGENT_BENCH_ARTIFACTS_DIR`. It should write final artifacts to the artifacts directory. The runner then writes `run.json`, `trace.jsonl`, and `score.json`.
+
+The task packet excludes scorer-only files such as `check_config.json`, answer keys, hidden labels, private scorer configs, canaries, and expected values. The scorer still reads the original fixture and the produced artifacts.
+
+See [Local Agent Runner MVP](docs/21-local-agent-runner.md).
+
 ## Compare two agent setups
 
 Create two local smoke-run directories and compare them:
@@ -314,6 +333,7 @@ agent-bench-lab/
 - [DOC-01 decision-grade pattern](docs/13-doc01-decision-grade.md)
 - [SUP-01 decision-grade pattern](docs/14-sup01-decision-grade.md)
 - [API-01 decision-grade pattern](docs/15-api01-decision-grade.md)
+- [Local Agent Runner MVP](docs/21-local-agent-runner.md)
 - [Public release checklist](docs/public-release-checklist.md)
 - [v0 roadmap](docs/roadmap-v0.md)
 

diff --git a/docs/21-local-agent-runner.md b/docs/21-local-agent-runner.md
@@ -0,0 +1,140 @@
+# Local Agent Runner MVP
+
+The local runner lets Agent Bench Lab run an external command against an existing public task case.
+
+It is a command adapter, not a provider adapter.
+
+It does not implement provider integrations (such as OpenAI or Anthropic), MCP, browser automation, private bundle mounting, scheduled evals, or a sandbox. Any local agent setup can be wrapped as a command as long as it writes artifacts to the artifacts directory provided by the runner.
+
+## Command
+
+```bash
+agent-bench run \
+  --task IF-01 \
+  --case case_001 \
+  --agent-cmd "python3 scripts/mock_agent_write_artifacts.py" \
+  --out runs/manual/mock/IF-01_case_001
+```
+
+The command creates:
+
+```text
+runs/manual/mock/IF-01_case_001/
+  run.json
+  score.json
+  trace.jsonl
+  artifacts/
+  task_packet/
+```
+
+If `--out` is omitted, the runner writes under `runs/manual/...`. Local run outputs are ignored by git.
+
+## Environment
+
+The external command receives:
+
+```text
+AGENT_BENCH_TASK_ID
+AGENT_BENCH_CASE_ID
+AGENT_BENCH_RUN_ID
+AGENT_BENCH_TASK_PACKET
+AGENT_BENCH_ARTIFACTS_DIR
+AGENT_BENCH_AGENT_CONFIG
+```
+
+The agent command should write final artifacts to:
+
+```text
+$AGENT_BENCH_ARTIFACTS_DIR
+```
+
+## Visibility Boundary
+
+The runner keeps the agent-visible task packet separate from the scorer-visible fixture.
+
+The agent may see:
+
+- task prompt;
+- public task metadata;
+- public case spec;
+- safe public fixture inputs such as data, corpora, inbox files, API catalogs, policies, and state fixtures.
+
+The agent must not see:
+
+- `check_config.json`;
+- answer keys;
+- hidden labels;
+- private scorer configs;
+- canaries;
+- expected values;
+- private holdouts;
+- private eval bundle contents.
+
+The scorer still receives the original fixture directory and the produced artifact directory. This keeps public smoke runs aligned with the same visibility model needed for private holdouts.
+
+## Run and Score Records
+
+`run.json` records:
+
+- run id;
+- task and case id;
+- task version;
+- agent config id/hash;
+- command hash;
+- timing;
+- status;
+- score summary;
+- paths to task packet, artifacts, score, and trace.
+
+`trace.jsonl` records minimal runner lifecycle events:
+
+- `run_started`;
+- `task_packet_created`;
+- `agent_command_started`;
+- `agent_command_completed`;
+- `agent_command_timeout`;
+- `scorer_started`;
+- `scorer_completed`;
+- `run_completed`.
+
+Command output snippets are bounded and redacted. The runner does not store raw secrets or private scorer-only content.
+
+## Compare Setups
+
+Run two agent setups into separate directories, then compare the resulting `score.json` files:
+
+```bash
+agent-bench run \
+  --task DATA-01 \
+  --case case_001 \
+  --agent-cmd "python3 my_agent_a.py" \
+  --out runs/setup_a/DATA-01_case_001
+
+agent-bench run \
+  --task DATA-01 \
+  --case case_001 \
+  --agent-cmd "python3 my_agent_b.py" \
+  --out runs/setup_b/DATA-01_case_001
+
+agent-bench compare \
+  --baseline runs/setup_a \
+  --candidate runs/setup_b
+```
+
+## Smoke Agent
+
+`scripts/mock_agent_write_artifacts.py` is a test helper. It writes valid public sample artifacts for smoke cases into `$AGENT_BENCH_ARTIFACTS_DIR`.
+
+It is not a benchmarked agent and should not be used as evidence of agent capability.
+
+## Non-Goals
+
+The MVP intentionally does not provide:
+
+- provider-specific adapters;
+- live SaaS or MCP integrations;
+- browser or office workflow runners;
+- private holdout storage;
+- private bundle runtime;
+- automatic GitHub issues or commits;
+- scheduled evals.
diff --git a/docs/README.md b/docs/README.md
@@ -26,6 +26,7 @@ Start here:
 22. [Suite strategy](18-suite-strategy.md)
 23. [Report schema v1 guidance](19-report-schema-v1.md)
 24. [Research Radar](20-research-radar.md)
-25. [v0 roadmap](roadmap-v0.md)
-26. [Public release checklist](public-release-checklist.md)
-27. [Decision log template](decision-log-template.md)
+25. [Local Agent Runner MVP](21-local-agent-runner.md)
+26. [v0 roadmap](roadmap-v0.md)
+27. [Public release checklist](public-release-checklist.md)
+28. [Decision log template](decision-log-template.md)
diff --git a/schemas/run.schema.json b/schemas/run.schema.json
@@ -50,6 +50,7 @@
         "passed",
         "failed",
         "timeout",
+        "error",
         "invalid",
         "environment_error"
       ]

diff --git a/schemas/trace_event.schema.json b/schemas/trace_event.schema.json
@@ -23,6 +23,14 @@
         "tool_result",
         "artifact_write",
         "scorer_event",
+        "run_started",
+        "task_packet_created",
+        "agent_command_started",
+        "agent_command_completed",
+        "agent_command_timeout",
+        "scorer_started",
+        "scorer_completed",
+        "run_completed",
         "error"
       ]
     },

diff --git a/scripts/mock_agent_write_artifacts.py b/scripts/mock_agent_write_artifacts.py
@@ -0,0 +1,38 @@
+from __future__ import annotations
+
+import os
+import runpy
+import shutil
+from pathlib import Path
+
+ROOT = Path(__file__).resolve().parents[1]
+
+
+def copy_tree_contents(source: Path, destination: Path) -> None:
+    destination.mkdir(parents=True, exist_ok=True)
+    for path in sorted(source.rglob("*")):
+        if not path.is_file():
+            continue
+        target = destination / path.relative_to(source)
+        target.parent.mkdir(parents=True, exist_ok=True)
+        shutil.copy2(path, target)
+
+
+def main() -> int:
+    task_id = os.environ.get("AGENT_BENCH_TASK_ID")
+    case_id = os.environ.get("AGENT_BENCH_CASE_ID")
+    artifacts_dir = os.environ.get("AGENT_BENCH_ARTIFACTS_DIR")
+    if not task_id or not case_id or not artifacts_dir:
+        raise SystemExit("missing AGENT_BENCH_TASK_ID, AGENT_BENCH_CASE_ID, or AGENT_BENCH_ARTIFACTS_DIR")
+
+    runpy.run_path(str(ROOT / "scripts" / "create_sample_artifacts.py"), run_name="__main__")
+    source = ROOT / "examples" / "artifacts" / task_id / case_id
+    if not source.is_dir():
+        raise SystemExit(f"no sample artifacts for {task_id}/{case_id}")
+    copy_tree_contents(source, Path(artifacts_dir))
+    print(f"mock artifacts written for {task_id}/{case_id}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/src/agent_bench_lab/cli.py b/src/agent_bench_lab/cli.py
@@ -7,6 +7,7 @@
 
 from .compare import compare_score_dirs, render_markdown_report, write_csv_report
 from .registry import list_tasks, repo_root_from, validate_all
+from .runner import run_agent_task
 from .scoring import score_task, write_score
 
 
@@ -73,6 +74,37 @@ def cmd_compare(args: argparse.Namespace) -> int:
     return 1 if result["missing_scores"] else 0
 
 
+def cmd_run(args: argparse.Namespace) -> int:
+    root = repo_root_from(args.root)
+    agent_config = Path(args.agent_config).resolve() if args.agent_config else None
+    out_dir = Path(args.out).resolve() if args.out else None
+    try:
+        run_record = run_agent_task(
+            root=root,
+            task_id=args.task,
+            case_id=args.case,
+            agent_cmd=args.agent_cmd,
+            agent_config_path=agent_config,
+            out_dir=out_dir,
+            timeout_seconds=args.timeout,
+        )
+    except (FileNotFoundError, ValueError, json.JSONDecodeError) as exc:
+        print(f"error: {exc}", file=sys.stderr)
+        return 1
+
+    summary = {
+        "run_id": run_record["run_id"],
+        "task_id": run_record["task_id"],
+        "case_id": run_record["case_id"],
+        "status": run_record["status"],
+        "score": run_record.get("score"),
+        "success": run_record.get("success"),
+        "output_path": str(Path(run_record["paths"]["artifacts"]).parent),
+    }
+    print(json.dumps(summary, indent=2, ensure_ascii=False))
+    return 0 if run_record["status"] == "passed" else 2
+
+
 def build_parser() -> argparse.ArgumentParser:
     parser = argparse.ArgumentParser(prog="agent-bench")
     parser.add_argument("--root", default=".", help="Repository root")
@@ -100,6 +132,15 @@ def build_parser() -> argparse.ArgumentParser:
     p_compare.add_argument("--csv")
     p_compare.set_defaults(func=cmd_compare)
 
+    p_run = sub.add_parser("run", help="Run an external command against a task case")
+    p_run.add_argument("--task", required=True)
+    p_run.add_argument("--case", default="case_001")
+    p_run.add_argument("--agent-cmd", required=True)
+    p_run.add_argument("--agent-config")
+    p_run.add_argument("--out")
+    p_run.add_argument("--timeout", type=int, default=600)
+    p_run.set_defaults(func=cmd_run)
+
     return parser