From efa00d532ef91af8b161013248259794df37020c Mon Sep 17 00:00:00 2001
From: "Vitaly D." <netmin@pm.me>
Date: Mon, 25 May 2026 19:57:37 +0300
Subject: [PATCH] feat(runner): add local command runner MVP

Why:
- Agent Bench Lab needs a product-neutral way to run real external agent setups against existing task families, not only score prebuilt sample artifacts.
- The runner must preserve the benchmark visibility boundary by separating agent-visible task packets from scorer-visible fixtures.

What changed:
- Add `agent-bench run` and a local command runner that creates `task_packet/`, `artifacts/`, `run.json`, `trace.jsonl`, and `score.json`.
- Exclude scorer-only/private-looking files from task packets, including `check_config.json`, while scoring against the original fixture and produced artifacts.
- Add a mock smoke agent, runner tests, `make run-smoke`, CI wiring, schema event/status updates, and docs.

Testing:
- `make validate`
- `make test`
- `make smoke`
- `make compare-smoke`
- `make if01-smoke`
- `make data01-smoke`
- `make doc01-smoke`
- `make sup01-smoke`
- `make api01-smoke`
- `make lifecycle-check`
- `make mutation-smoke`
- `make hardening-check`
- `make run-smoke`
- `make leak-check`
- `python3 -m ruff check .`
- `git diff --check`
- tracked-file audit for private/generated/sensitive paths

Risk:
- moderate - command execution is intentionally local and generic; future provider/browser/MCP adapters should not bypass the task-packet visibility boundary.

Related: #22
---
 .github/workflows/ci.yml              |   3 +
 Makefile                              |   7 +-
 README.md                             |  20 ++
 docs/21-local-agent-runner.md         | 140 ++++++++++
 docs/README.md                        |   7 +-
 schemas/run.schema.json               |   1 +
 schemas/trace_event.schema.json       |   8 +
 scripts/mock_agent_write_artifacts.py |  38 +++
 src/agent_bench_lab/cli.py            |  41 +++
 src/agent_bench_lab/runner.py         | 376 ++++++++++++++++++++++++++
 tests/test_runner.py                  | 166 ++++++++++++
 11 files changed, 803 insertions(+), 4 deletions(-)
 create mode 100644 docs/21-local-agent-runner.md
 create mode 100644 scripts/mock_agent_write_artifacts.py
 create mode 100644 src/agent_bench_lab/runner.py
 create mode 100644 tests/test_runner.py

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index b2c9064..4618f71 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -60,6 +60,9 @@ jobs:
       - name: Run hardening check
         run: make hardening-check
 
+      - name: Run local agent runner smoke
+        run: make run-smoke
+
       - name: Run leak check
         run: make leak-check
 
diff --git a/Makefile b/Makefile
index 80dcc7a..d7cc3ed 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 PYTHON ?= python3
 PYTHONPATH ?= src
 
-.PHONY: validate list smoke compare-smoke if01-smoke data01-smoke doc01-smoke sup01-smoke api01-smoke lifecycle-check mutation-smoke hardening-check leak-check test
+.PHONY: validate list smoke compare-smoke if01-smoke data01-smoke doc01-smoke sup01-smoke api01-smoke lifecycle-check mutation-smoke hardening-check run-smoke leak-check test
 
 validate:
 	PYTHONPATH=$(PYTHONPATH) $(PYTHON) -m agent_bench_lab.cli validate
@@ -67,6 +67,11 @@ mutation-smoke:
 hardening-check:
 	$(PYTHON) scripts/check_hardening_gates.py
 
+run-smoke:
+	PYTHONPATH=$(PYTHONPATH) $(PYTHON) -m agent_bench_lab.cli run --task IF-01 --case case_001 --agent-cmd "$(PYTHON) scripts/mock_agent_write_artifacts.py" --out runs/manual/mock/IF-01_case_001
+	PYTHONPATH=$(PYTHONPATH) $(PYTHON) -m agent_bench_lab.cli run --task DATA-01 --case case_001 --agent-cmd "$(PYTHON) scripts/mock_agent_write_artifacts.py" --out runs/manual/mock/DATA-01_case_001
+	PYTHONPATH=$(PYTHONPATH) $(PYTHON) -m agent_bench_lab.cli compare --baseline runs/manual/mock --candidate runs/manual/mock --out reports/generated/compare_run_smoke.md
+
 leak-check:
 	$(PYTHON) scripts/public_leak_check.py .
 
diff --git a/README.md b/README.md
index 6999e51..e3ebe33 100644
--- a/README.md
+++ b/README.md
@@ -95,6 +95,7 @@ This repository is a **v0 public starter**. It contains:
 - minimal Python CLI scaffolding;
 - sample public fixtures;
 - sample scorers plus hardened IF-01, DATA-01, DOC-01, SUP-01, and API-01 artifact/state-based scorers;
+- a local command-based runner for external agent setups;
 - documentation for benchmark design, metrics, anti-overfitting, lifecycle status, hardening gates, and research radar process.
 
 It intentionally does **not** contain private holdout tasks, production secrets, personal data, or benchmark answers for real evaluation runs.
@@ -176,6 +177,24 @@ make leak-check
 
 The examples directory intentionally starts mostly empty. Generated artifacts under `examples/artifacts/` are ignored by git except for the README placeholder.
 
+## Run an external agent setup
+
+Use `agent-bench run` to hand an agent-visible task packet to any local command and score the artifacts it writes:
+
+```bash
+agent-bench run \
+  --task IF-01 \
+  --case case_001 \
+  --agent-cmd "python3 scripts/mock_agent_write_artifacts.py" \
+  --out runs/manual/mock/IF-01_case_001
+```
+
+The command runs through the shell from the repository root and receives `AGENT_BENCH_TASK_PACKET` and `AGENT_BENCH_ARTIFACTS_DIR`, among other `AGENT_BENCH_*` variables. It should write final artifacts to the artifacts directory. The runner then writes `run.json`, `trace.jsonl`, and `score.json`.
+
+The task packet excludes scorer-only files such as `check_config.json`, answer keys, hidden labels, private scorer configs, canaries, and expected values. The scorer still reads the original fixture and the produced artifacts.
+
+See [Local Agent Runner MVP](docs/21-local-agent-runner.md).
+
 ## Compare two agent setups
 
 Create two local smoke-run directories and compare them:
@@ -314,6 +333,7 @@ agent-bench-lab/
 - [DOC-01 decision-grade pattern](docs/13-doc01-decision-grade.md)
 - [SUP-01 decision-grade pattern](docs/14-sup01-decision-grade.md)
 - [API-01 decision-grade pattern](docs/15-api01-decision-grade.md)
+- [Local Agent Runner MVP](docs/21-local-agent-runner.md)
 - [Public release checklist](docs/public-release-checklist.md)
 - [v0 roadmap](docs/roadmap-v0.md)
 
diff --git a/docs/21-local-agent-runner.md b/docs/21-local-agent-runner.md
new file mode 100644
index 0000000..11ff366
--- /dev/null
+++ b/docs/21-local-agent-runner.md
@@ -0,0 +1,140 @@
+# Local Agent Runner MVP
+
+The local runner lets Agent Bench Lab run an external command against an existing public task case.
+
+It is a command adapter, not a provider adapter.
+
+It does not implement OpenAI, Anthropic, MCP, browser automation, private bundle mounting, scheduled evals, or a sandbox. Any local agent setup can be wrapped as a command as long as it writes artifacts to the artifacts directory provided by the runner.
+
+## Command
+
+```bash
+agent-bench run \
+  --task IF-01 \
+  --case case_001 \
+  --agent-cmd "python3 scripts/mock_agent_write_artifacts.py" \
+  --out runs/manual/mock/IF-01_case_001
+```
+
+The command creates:
+
+```text
+runs/manual/mock/IF-01_case_001/
+  run.json
+  score.json
+  trace.jsonl
+  artifacts/
+  task_packet/
+```
+
+If `--out` is omitted, the runner writes to `runs/manual/<agent_config_id>/<task>_<case>_<run_id>/`. Local run outputs are ignored by git.
+
+## Environment
+
+The external command receives:
+
+```text
+AGENT_BENCH_TASK_ID
+AGENT_BENCH_CASE_ID
+AGENT_BENCH_RUN_ID
+AGENT_BENCH_TASK_PACKET
+AGENT_BENCH_ARTIFACTS_DIR
+AGENT_BENCH_AGENT_CONFIG
+```
+
+The agent command should write final artifacts to:
+
+```text
+$AGENT_BENCH_ARTIFACTS_DIR
+```
+
+## Visibility Boundary
+
+The runner keeps the agent-visible task packet separate from the scorer-visible fixture.
+
+The agent may see:
+
+- task prompt;
+- public task metadata;
+- public case spec;
+- safe public fixture inputs such as data, corpora, inbox files, API catalogs, policies, and state fixtures.
+
+The agent must not see:
+
+- `check_config.json`;
+- answer keys;
+- hidden labels;
+- private scorer configs;
+- canaries;
+- expected values;
+- private holdouts;
+- private eval bundle contents.
+
+The scorer still receives the original fixture directory and the produced artifact directory. This keeps public smoke runs aligned with the same visibility model needed for private holdouts.
+
+## Run and Score Records
+
+`run.json` records:
+
+- run id;
+- task and case id;
+- task version;
+- agent config id/hash;
+- command hash;
+- timing;
+- status;
+- score summary;
+- paths to task packet, artifacts, score, and trace.
+
+`trace.jsonl` records minimal runner lifecycle events:
+
+- `run_started`;
+- `task_packet_created`;
+- `agent_command_started`;
+- `agent_command_completed`;
+- `agent_command_timeout`;
+- `scorer_started`;
+- `scorer_completed`;
+- `run_completed`.
+
+Command output snippets are bounded and redacted. The runner does not store raw secrets or private scorer-only content.
+
+## Compare Setups
+
+Run two agent setups into separate directories, then compare the resulting `score.json` files:
+
+```bash
+agent-bench run \
+  --task DATA-01 \
+  --case case_001 \
+  --agent-cmd "python3 my_agent_a.py" \
+  --out runs/setup_a/DATA-01_case_001
+
+agent-bench run \
+  --task DATA-01 \
+  --case case_001 \
+  --agent-cmd "python3 my_agent_b.py" \
+  --out runs/setup_b/DATA-01_case_001
+
+agent-bench compare \
+  --baseline runs/setup_a \
+  --candidate runs/setup_b
+```
+
+## Smoke Agent
+
+`scripts/mock_agent_write_artifacts.py` is a test helper. It writes valid public sample artifacts for smoke cases into `$AGENT_BENCH_ARTIFACTS_DIR`.
+
+It is not a benchmarked agent and should not be used as evidence of agent capability.
+
+## Non-Goals
+
+The MVP intentionally does not provide:
+
+- provider-specific adapters;
+- live SaaS or MCP integrations;
+- browser or office workflow runners;
+- private holdout storage;
+- private bundle runtime;
+- automatic GitHub issues or commits;
+- scheduled evals.
diff --git a/docs/README.md b/docs/README.md
index 473b8de..8d2e64e 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -26,6 +26,7 @@ Start here:
 22. [Suite strategy](18-suite-strategy.md)
 23. [Report schema v1 guidance](19-report-schema-v1.md)
 24. [Research Radar](20-research-radar.md)
-25. [v0 roadmap](roadmap-v0.md)
-26. [Public release checklist](public-release-checklist.md)
-27. [Decision log template](decision-log-template.md)
+25. [Local Agent Runner MVP](21-local-agent-runner.md)
+26. [v0 roadmap](roadmap-v0.md)
+27. [Public release checklist](public-release-checklist.md)
+28. [Decision log template](decision-log-template.md)
diff --git a/schemas/run.schema.json b/schemas/run.schema.json
index bf481ac..a77ccf5 100644
--- a/schemas/run.schema.json
+++ b/schemas/run.schema.json
@@ -50,6 +50,7 @@
         "passed",
         "failed",
         "timeout",
+        "error",
         "invalid",
         "environment_error"
       ]
diff --git a/schemas/trace_event.schema.json b/schemas/trace_event.schema.json
index 8d03448..41e4208 100644
--- a/schemas/trace_event.schema.json
+++ b/schemas/trace_event.schema.json
@@ -23,6 +23,14 @@
         "tool_result",
         "artifact_write",
         "scorer_event",
+        "run_started",
+        "task_packet_created",
+        "agent_command_started",
+        "agent_command_completed",
+        "agent_command_timeout",
+        "scorer_started",
+        "scorer_completed",
+        "run_completed",
         "error"
       ]
     },
diff --git a/scripts/mock_agent_write_artifacts.py b/scripts/mock_agent_write_artifacts.py
new file mode 100644
index 0000000..610e02a
--- /dev/null
+++ b/scripts/mock_agent_write_artifacts.py
@@ -0,0 +1,38 @@
+from __future__ import annotations
+
+import os
+import runpy
+import shutil
+from pathlib import Path
+
+ROOT = Path(__file__).resolve().parents[1]
+
+
+def copy_tree_contents(source: Path, destination: Path) -> None:
+    """Copy every regular file under `source` into `destination`, keeping relative paths."""
+    destination.mkdir(parents=True, exist_ok=True)
+    regular_files = (entry for entry in sorted(source.rglob("*")) if entry.is_file())
+    for entry in regular_files:
+        copied = destination / entry.relative_to(source)
+        copied.parent.mkdir(parents=True, exist_ok=True)
+        shutil.copy2(entry, copied)
+
+
+def main() -> int:
+    """Run `create_sample_artifacts.py`, then copy this case's sample files to the artifacts dir."""
+    env = os.environ
+    task_id, case_id = env.get("AGENT_BENCH_TASK_ID"), env.get("AGENT_BENCH_CASE_ID")
+    artifacts_dir = env.get("AGENT_BENCH_ARTIFACTS_DIR")
+    if not (task_id and case_id and artifacts_dir):
+        raise SystemExit("missing AGENT_BENCH_TASK_ID, AGENT_BENCH_CASE_ID, or AGENT_BENCH_ARTIFACTS_DIR")
+    runpy.run_path(str(ROOT / "scripts" / "create_sample_artifacts.py"), run_name="__main__")
+    sample_dir = ROOT / "examples" / "artifacts" / task_id / case_id
+    if not sample_dir.is_dir():
+        raise SystemExit(f"no sample artifacts for {task_id}/{case_id}")
+    copy_tree_contents(sample_dir, Path(artifacts_dir))
+    print(f"mock artifacts written for {task_id}/{case_id}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/src/agent_bench_lab/cli.py b/src/agent_bench_lab/cli.py
index 7051cca..ed3e274 100644
--- a/src/agent_bench_lab/cli.py
+++ b/src/agent_bench_lab/cli.py
@@ -7,6 +7,7 @@
 
 from .compare import compare_score_dirs, render_markdown_report, write_csv_report
 from .registry import list_tasks, repo_root_from, validate_all
+from .runner import run_agent_task
 from .scoring import score_task, write_score
 
 
@@ -73,6 +74,37 @@ def cmd_compare(args: argparse.Namespace) -> int:
     return 1 if result["missing_scores"] else 0
 
 
+def cmd_run(args: argparse.Namespace) -> int:
+    """Handle `agent-bench run` by delegating to `run_agent_task`.
+
+    Prints a JSON summary of the run record to stdout. Returns 0 when the run
+    status is "passed" and 2 for any other status. Returns 1, with the message
+    on stderr, when the runner raises FileNotFoundError or ValueError.
+    """
+    repo_root = repo_root_from(args.root)
+    config_path = Path(args.agent_config).resolve() if args.agent_config else None
+    target_dir = Path(args.out).resolve() if args.out else None
+    try:
+        record = run_agent_task(
+            root=repo_root,
+            task_id=args.task,
+            case_id=args.case,
+            agent_cmd=args.agent_cmd,
+            agent_config_path=config_path,
+            out_dir=target_dir,
+            timeout_seconds=args.timeout,
+        )
+    except (FileNotFoundError, ValueError, json.JSONDecodeError) as exc:
+        print(f"error: {exc}", file=sys.stderr)
+        return 1
+    summary = {key: record[key] for key in ("run_id", "task_id", "case_id", "status")}
+    summary["score"] = record.get("score")
+    summary["success"] = record.get("success")
+    summary["output_path"] = str(Path(record["paths"]["artifacts"]).parent)
+    print(json.dumps(summary, indent=2, ensure_ascii=False))
+    return 0 if record["status"] == "passed" else 2
+
+
 def build_parser() -> argparse.ArgumentParser:
     parser = argparse.ArgumentParser(prog="agent-bench")
     parser.add_argument("--root", default=".", help="Repository root")
@@ -100,6 +132,15 @@ def build_parser() -> argparse.ArgumentParser:
     p_compare.add_argument("--csv")
     p_compare.set_defaults(func=cmd_compare)
 
+    p_run = sub.add_parser("run", help="Run an external command against a task case")
+    p_run.add_argument("--task", required=True)
+    p_run.add_argument("--case", default="case_001")
+    p_run.add_argument("--agent-cmd", required=True)
+    p_run.add_argument("--agent-config")
+    p_run.add_argument("--out")
+    p_run.add_argument("--timeout", type=int, default=600)
+    p_run.set_defaults(func=cmd_run)
+
     return parser
 
 
diff --git a/src/agent_bench_lab/runner.py b/src/agent_bench_lab/runner.py
new file mode 100644
index 0000000..563dfdd
--- /dev/null
+++ b/src/agent_bench_lab/runner.py
@@ -0,0 +1,376 @@
+from __future__ import annotations
+
+import json
+import os
+import shutil
+import subprocess
+from datetime import UTC, datetime
+from hashlib import sha256
+from pathlib import Path
+from typing import Any
+
+from .redaction import redact_text
+from .registry import load_task, repo_root_from
+from .run_records import load_agent_config, load_task_version
+from .scoring import score_task, write_score
+
+RUN_STATUSES = {"passed", "failed", "timeout", "error"}  # NOTE(review): not referenced in this module
+TRACE_SNIPPET_CHARS = 2000  # default character limit used by safe_snippet
+AGENT_VISIBLE_TASK_FILES = ("prompt.md", "task.json")  # task-level files copied into each packet
+SCORER_ONLY_FILENAMES = {  # compared against the lowercased file name
+    "check_config.json",
+}
+SCORER_ONLY_PATTERNS = (  # substrings searched for in the lowercased relative path
+    "answer_key",
+    "hidden_label",
+    "private",
+    "canary",
+    "scorer_config",
+    "expected",
+    "rubric_private",
+)
+
+
+def utc_now() -> str:
+    return datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ")  # second-precision UTC with "Z" suffix
+
+
+def command_hash(agent_cmd: str) -> str:
+    return sha256(bytes(agent_cmd, "utf-8")).hexdigest()  # hex SHA-256 of the UTF-8 command text
+
+
+def safe_slug(value: str) -> str:
+    return "".join(ch if ch.isalnum() or ch in "-_" else "_" for ch in value)  # keep alnum, "-", "_"
+
+
+def safe_snippet(text: str | None, limit: int = TRACE_SNIPPET_CHARS) -> str:
+    """Return `text` redacted and then cut to `limit` characters ("" for None or empty text)."""
+    # Redact first: truncating before redaction can cut a secret short of its pattern and leak it.
+    return redact_text(text)[:limit] if text else ""
+
+
+def is_agent_visible_path(path: Path) -> bool:
+    """Return False for scorer-only filenames or paths containing a denylisted substring."""
+    name, lowered = path.name.lower(), path.as_posix().lower()
+    denied = name in SCORER_ONLY_FILENAMES or any(mark in lowered for mark in SCORER_ONLY_PATTERNS)
+    return not denied
+
+
+def copy_public_file(source: Path, destination: Path) -> None:
+    os.makedirs(destination.parent, exist_ok=True)  # create missing parent directories first
+    shutil.copy2(source, destination)
+
+
+def write_json(path: Path, data: dict[str, Any]) -> None:
+    os.makedirs(path.parent, exist_ok=True)  # the target directory may not exist yet
+    path.write_text(f"{json.dumps(data, indent=2, ensure_ascii=False)}\n", encoding="utf-8")
+
+
+def append_trace_event(
+    trace_path: Path,
+    *,
+    run_id: str,
+    event_type: str,
+    actor: str,
+    metadata: dict[str, Any] | None = None,
+) -> None:
+    """Append one JSON line (ts, run_id, event_type, actor, metadata) to `trace_path`."""
+    trace_path.parent.mkdir(parents=True, exist_ok=True)
+    # Keys are inserted in a fixed order so every trace line has the same layout.
+    fields: dict[str, Any] = {"ts": utc_now(), "run_id": run_id, "event_type": event_type}
+    fields["actor"] = actor
+    fields["metadata"] = metadata if metadata else {}
+    with trace_path.open("a", encoding="utf-8") as stream:
+        # One JSON object per line; non-ASCII text is written unescaped.
+        line = json.dumps(fields, ensure_ascii=False)
+        stream.write(line + "\n")
+
+
+def create_task_packet(
+    *,
+    root: Path,
+    task_id: str,
+    case_id: str,
+    task_dir: Path,
+    fixture_dir: Path,
+    packet_dir: Path,
+) -> dict[str, Any]:
+    """Build the agent-visible packet in `packet_dir` and return its manifest.
+
+    An existing `packet_dir` is deleted first. Fixture files rejected by
+    `is_agent_visible_path` are counted but not copied; symlinks are skipped.
+    """
+    if packet_dir.exists():
+        shutil.rmtree(packet_dir)
+    packet_dir.mkdir(parents=True)
+    copied: list[str] = []
+    hidden = 0
+    # Task-level files come from a fixed allowlist; missing ones are ignored.
+    for task_file in (task_dir / name for name in AGENT_VISIBLE_TASK_FILES):
+        if task_file.exists():
+            copy_public_file(task_file, packet_dir / task_file.name)
+            copied.append(task_file.name)
+
+    for fixture_file in sorted(fixture_dir.rglob("*")):
+        if fixture_file.is_symlink() or not fixture_file.is_file():
+            continue
+        relative = fixture_file.relative_to(fixture_dir)
+        if is_agent_visible_path(relative):
+            copy_public_file(fixture_file, packet_dir / "fixture" / relative)
+            copied.append((Path("fixture") / relative).as_posix())
+        else:
+            hidden += 1
+
+    prompt_file = task_dir / "prompt.md"
+    spec_file = fixture_dir / "spec.md"
+    sections: list[str] = []
+    if prompt_file.exists():
+        sections.append(prompt_file.read_text(encoding="utf-8"))
+    if spec_file.exists() and is_agent_visible_path(Path("spec.md")):
+        sections += ["", "## Case Spec", "", spec_file.read_text(encoding="utf-8")]
+    prompt_text = "\n".join(sections).strip() + "\n"
+    (packet_dir / "task_prompt.md").write_text(prompt_text, encoding="utf-8")
+    copied.append("task_prompt.md")
+
+    readme_text = (
+        "# Agent Bench Task Packet\n\n"
+        "This directory contains only agent-visible task instructions and public fixture inputs.\n"
+        "Write final artifacts to the path in `AGENT_BENCH_ARTIFACTS_DIR`.\n\n"
+        "Scorer-only files such as `check_config.json`, hidden labels, answer keys, private scorer "
+        "configs, canaries, and expected values are intentionally excluded.\n"
+    )
+    (packet_dir / "README.md").write_text(readme_text, encoding="utf-8")
+    copied.append("README.md")
+
+    manifest = {
+        "task_id": task_id,
+        "case_id": case_id,
+        "source_fixture": fixture_dir.relative_to(root).as_posix(),
+        "included_files": sorted(set(copied)),
+        "excluded_file_count": hidden,
+        "excluded_reason": "scorer_or_private_visibility_boundary",
+        "visibility": "agent",
+        "scorer_only_files_excluded": True,
+    }
+    write_json(packet_dir / "manifest.json", manifest)
+    return manifest
+
+
+def default_out_dir(root: Path, agent_config_id: str, task_id: str, case_id: str, run_id: str) -> Path:
+    return root.joinpath("runs", "manual", safe_slug(agent_config_id), f"{task_id}_{case_id}_{run_id}")
+
+
+def build_run_id(agent_config_id: str, task_id: str, case_id: str) -> str:
+    stamp = f"{datetime.now(UTC):%Y%m%dT%H%M%SZ}"  # compact UTC timestamp, second precision
+    return safe_slug("_".join((agent_config_id, task_id, case_id, stamp)))
+
+
+def run_agent_task(
+    task_id: str,
+    case_id: str,
+    agent_cmd: str,
+    agent_config_path: Path | None,
+    out_dir: Path | None = None,
+    timeout_seconds: int = 600,
+    root: Path | None = None,
+) -> dict[str, Any]:
+    """Run `agent_cmd` for one task case, score its artifacts, and write the run records.
+
+    Writes `task_packet/`, `artifacts/`, `trace.jsonl`, `score.json` and `run.json` under
+    `out_dir` (or a default under `runs/manual/`) and returns the `run.json` record. The
+    artifacts directory is emptied first so leftovers from a reused `out_dir` are never scored.
+    Raises FileNotFoundError when the task directory or the public fixture does not exist.
+    """
+    root = repo_root_from(root)
+    task_dir = root / "tasks" / task_id
+    fixture_dir = root / "fixtures" / "public" / task_id / case_id
+    if not task_dir.is_dir():
+        raise FileNotFoundError(f"Unknown task: {task_id}")
+    if not fixture_dir.is_dir():
+        raise FileNotFoundError(f"Missing public fixture for {task_id}/{case_id}: {fixture_dir}")
+
+    task = load_task(task_dir)
+    agent_config_id, agent_config_hash = load_agent_config(agent_config_path)
+    run_id = build_run_id(agent_config_id, task_id, case_id)
+    resolved_out_dir = out_dir.resolve() if out_dir else default_out_dir(
+        root, agent_config_id, task_id, case_id, run_id
+    )
+    resolved_out_dir.mkdir(parents=True, exist_ok=True)
+
+    task_packet_dir = resolved_out_dir / "task_packet"
+    artifacts_dir = resolved_out_dir / "artifacts"
+    score_path = resolved_out_dir / "score.json"
+    run_path = resolved_out_dir / "run.json"
+    trace_path = resolved_out_dir / "trace.jsonl"
+    if artifacts_dir.exists():
+        shutil.rmtree(artifacts_dir)  # leftovers from a reused out_dir must never be scored
+    artifacts_dir.mkdir(parents=True, exist_ok=True)
+    trace_path.unlink(missing_ok=True)
+
+    started_at = utc_now()
+    append_trace_event(trace_path, run_id=run_id, event_type="run_started", actor="runner")
+    manifest = create_task_packet(
+        root=root,
+        task_id=task_id,
+        case_id=case_id,
+        task_dir=task_dir,
+        fixture_dir=fixture_dir,
+        packet_dir=task_packet_dir,
+    )
+    append_trace_event(
+        trace_path,
+        run_id=run_id,
+        event_type="task_packet_created",
+        actor="runner",
+        metadata={"included_files": len(manifest["included_files"])},
+    )
+
+    env = os.environ.copy()
+    env.update(
+        {
+            "AGENT_BENCH_TASK_ID": task_id,
+            "AGENT_BENCH_CASE_ID": case_id,
+            "AGENT_BENCH_RUN_ID": run_id,
+            "AGENT_BENCH_TASK_PACKET": str(task_packet_dir),
+            "AGENT_BENCH_ARTIFACTS_DIR": str(artifacts_dir),
+            "AGENT_BENCH_AGENT_CONFIG": str(agent_config_path or ""),
+        }
+    )
+
+    returncode: int | None = None
+    stdout_snippet = stderr_snippet = ""
+    status = "failed"
+    append_trace_event(
+        trace_path,
+        run_id=run_id,
+        event_type="agent_command_started",
+        actor="agent",
+        metadata={"agent_cmd_hash": command_hash(agent_cmd)},
+    )
+    try:
+        completed = subprocess.run(
+            agent_cmd,
+            shell=True,
+            cwd=root,
+            env=env,
+            text=True,
+            capture_output=True,
+            timeout=timeout_seconds,
+            check=False,
+        )
+        returncode = completed.returncode
+        stdout_snippet = safe_snippet(completed.stdout)
+        stderr_snippet = safe_snippet(completed.stderr)
+        append_trace_event(
+            trace_path,
+            run_id=run_id,
+            event_type="agent_command_completed",
+            actor="agent",
+            metadata={
+                "returncode": returncode,
+                "stdout_snippet": stdout_snippet,
+                "stderr_snippet": stderr_snippet,
+            },
+        )
+    except subprocess.TimeoutExpired as exc:
+        status = "timeout"
+        # Output captured before the timeout may be bytes, str or None; decode bytes first.
+        stdout_snippet, stderr_snippet = (
+            safe_snippet(raw.decode("utf-8", errors="replace") if isinstance(raw, bytes) else raw)
+            for raw in (exc.stdout, exc.stderr)
+        )
+        append_trace_event(
+            trace_path,
+            run_id=run_id,
+            event_type="agent_command_timeout",
+            actor="agent",
+            metadata={
+                "timeout_seconds": timeout_seconds,
+                "stdout_snippet": stdout_snippet,
+                "stderr_snippet": stderr_snippet,
+            },
+        )
+
+    append_trace_event(trace_path, run_id=run_id, event_type="scorer_started", actor="scorer")
+    try:
+        score = score_task(
+            root=root,
+            task_id=task_id,
+            case_id=case_id,
+            artifacts_dir=artifacts_dir,
+            agent_config_path=agent_config_path,
+            run_id=run_id,
+        )
+    except Exception as exc:  # noqa: BLE001 - runner must preserve local failure records.
+        score = {
+            "run_id": run_id,
+            "task_id": task_id,
+            "case_id": case_id,
+            "task_version": load_task_version(task_dir),
+            "scorer_version": "error",
+            "agent_config_id": agent_config_id,
+            "agent_config_hash": agent_config_hash,
+            "success": False,
+            "score": 0.0,
+            "pass_threshold": 0.8,
+            "components": {},
+            "policy_violations": [],
+            "errors": [redact_text(str(exc))],
+            "artifact_hashes": {},
+            "metadata": {
+                "latency_seconds": None,
+                "cost_usd": None,
+                "tool_calls": None,
+                "model_calls": None,
+                "notes": None,
+            },
+        }
+    write_score(score, score_path)
+    append_trace_event(
+        trace_path,
+        run_id=run_id,
+        event_type="scorer_completed",
+        actor="scorer",
+        metadata={"score": score.get("score"), "success": score.get("success")},
+    )
+
+    if status != "timeout":
+        status = "passed" if returncode == 0 and score.get("success") else "failed"
+    completed_at = utc_now()
+    run_record = {
+        "run_id": run_id,
+        "task_id": task_id,
+        "case_id": case_id,
+        "task_version": load_task_version(task_dir),
+        "task_name": task.get("name"),
+        "agent_config_id": agent_config_id,
+        "agent_config_hash": agent_config_hash,
+        "agent_cmd_hash": command_hash(agent_cmd),
+        "started_at": started_at,
+        "completed_at": completed_at,
+        "status": status,
+        "success": bool(score.get("success")) and status == "passed",
+        "score": score.get("score"),
+        "timeout_seconds": timeout_seconds,
+        "paths": {
+            "task_packet": str(task_packet_dir),
+            "artifacts": str(artifacts_dir),
+            "score": str(score_path),
+            "trace": str(trace_path),
+        },
+        "command": {
+            "returncode": returncode,
+            "agent_cmd_redacted": redact_text(agent_cmd),
+            "stdout_snippet": stdout_snippet,
+            "stderr_snippet": stderr_snippet,
+        },
+    }
+    write_json(run_path, run_record)
+    append_trace_event(
+        trace_path,
+        run_id=run_id,
+        event_type="run_completed",
+        actor="runner",
+        metadata={"status": status, "score": score.get("score"), "success": run_record["success"]},
+    )
+    return run_record
diff --git a/tests/test_runner.py b/tests/test_runner.py
new file mode 100644
index 0000000..48a9d2e
--- /dev/null
+++ b/tests/test_runner.py
@@ -0,0 +1,166 @@
+from __future__ import annotations
+
+import json
+import sys
+from pathlib import Path
+
+from agent_bench_lab.cli import main as cli_main
+from agent_bench_lab.runner import create_task_packet, run_agent_task
+
+
+def root_dir() -> Path:
+    return Path(__file__).resolve().parent.parent  # two levels up from this test module
+
+
+def test_task_packet_excludes_check_config(tmp_path):
+    """A real IF-01 packet keeps spec.md but neither copies nor lists check_config.json."""
+    record = run_agent_task(
+        root=root_dir(),
+        task_id="IF-01",
+        case_id="case_001",
+        agent_cmd=f"{sys.executable} -c \"pass\"",
+        agent_config_path=None,
+        out_dir=tmp_path / "packet_run",
+        timeout_seconds=30,
+    )
+    fixture_copy = Path(record["paths"]["task_packet"]) / "fixture"
+    manifest_text = (fixture_copy.parent / "manifest.json").read_text(encoding="utf-8")
+    assert (fixture_copy / "spec.md").exists()
+    assert not (fixture_copy / "check_config.json").exists()
+    assert "check_config.json" not in manifest_text
+
+
+def test_task_packet_excludes_denylisted_scorer_only_files(tmp_path):
+    task_dir = tmp_path / "tasks" / "T-01"
+    fixture_dir = tmp_path / "fixtures" / "public" / "T-01" / "case_001"
+    seeded = {  # two denylisted names placed among ordinary task and fixture inputs
+        task_dir / "prompt.md": "Prompt",
+        task_dir / "task.json": "{}",
+        fixture_dir / "spec.md": "Spec",
+        fixture_dir / "answer_key.json": "{}",
+        fixture_dir / "hidden_label.txt": "label",
+        fixture_dir / "data.csv": "id,value\n1,2\n",
+    }
+    for target, content in seeded.items():
+        target.parent.mkdir(parents=True, exist_ok=True)
+        target.write_text(content, encoding="utf-8")
+    manifest = create_task_packet(
+        root=tmp_path,
+        task_id="T-01",
+        case_id="case_001",
+        task_dir=task_dir,
+        fixture_dir=fixture_dir,
+        packet_dir=tmp_path / "packet",
+    )
+    assert (tmp_path / "packet" / "fixture" / "data.csv").exists()
+    assert not (tmp_path / "packet" / "fixture" / "answer_key.json").exists()
+    assert not (tmp_path / "packet" / "fixture" / "hidden_label.txt").exists()
+    assert manifest["excluded_file_count"] == 2
+
+
+def test_runner_writes_run_trace_and_score_with_mock_agent(tmp_path):
+    """The mock agent yields a passing run with all record files and the sample artifact."""
+    out_dir = tmp_path / "run"
+    record = run_agent_task(
+        root=root_dir(),
+        task_id="IF-01",
+        case_id="case_001",
+        agent_cmd=f"{sys.executable} scripts/mock_agent_write_artifacts.py",
+        agent_config_path=None,
+        out_dir=out_dir,
+        timeout_seconds=60,
+    )
+
+    assert record["status"] == "passed"
+    # Every record file plus the copied sample artifact must exist on disk.
+    for produced in ("run.json", "trace.jsonl", "score.json", "artifacts/artifact.md"):
+        assert (out_dir / produced).exists()
+    score = json.loads((out_dir / "score.json").read_text(encoding="utf-8"))
+    assert score["success"]
+
+
+def test_runner_handles_agent_command_timeout(tmp_path):
+    """A command that outlives the timeout yields status "timeout" but still leaves records."""
+    out_dir = tmp_path / "timeout"
+    record = run_agent_task(
+        root=root_dir(),
+        task_id="IF-01",
+        case_id="case_001",
+        agent_cmd=f"{sys.executable} -c \"import time; time.sleep(2)\"",
+        agent_config_path=None,
+        out_dir=out_dir,
+        timeout_seconds=1,
+    )
+    assert record["status"] == "timeout"
+    assert all((out_dir / name).exists() for name in ("run.json", "score.json"))
+    trace_text = (out_dir / "trace.jsonl").read_text(encoding="utf-8")
+    assert "agent_command_timeout" in trace_text
+
+
+def test_runner_handles_missing_artifacts_gracefully(tmp_path):
+    """An agent that writes nothing produces a failed run and a below-threshold score."""
+    out_dir = tmp_path / "missing_artifacts"
+    record = run_agent_task(
+        root=root_dir(),
+        task_id="IF-01",
+        case_id="case_001",
+        agent_cmd=f"{sys.executable} -c \"print('no artifacts')\"",
+        agent_config_path=None,
+        out_dir=out_dir,
+        timeout_seconds=30,
+    )
+    score_path = out_dir / "score.json"
+    score = json.loads(score_path.read_text(encoding="utf-8"))
+    assert record["status"] == "failed"
+    assert not score["success"] and score["score"] < score["pass_threshold"]
+
+
+def test_runner_redacts_unsafe_stdout_and_stderr(tmp_path):
+    """Canary and credential-looking output must not survive into the trace or run record."""
+    out_dir = tmp_path / "redacted"
+    command = (
+        f"{sys.executable} -c \"import sys; "
+        "print('correct answer was x expected=y CANARY_123'); "
+        "print('api_key=secret', file=sys.stderr)\""
+    )
+    record = run_agent_task(
+        root=root_dir(),
+        task_id="IF-01",
+        case_id="case_001",
+        agent_cmd=command,
+        agent_config_path=None,
+        out_dir=out_dir,
+        timeout_seconds=30,
+    )
+    trace_text = (out_dir / "trace.jsonl").read_text(encoding="utf-8")
+    run_text = json.dumps(record)
+
+    assert "[REDACTED]" in trace_text
+    for stored in (trace_text, run_text):
+        assert "CANARY_123" not in stored
+        assert "api_key" not in stored
+
+
+def test_cli_agent_bench_run_works_with_mock_agent(tmp_path, capsys):
+    """Drive `agent-bench run` through `cli_main` with the mock agent.
+
+    Expects exit code 0, a passing JSON summary on stdout, and run.json on disk.
+    """
+    out_dir = tmp_path / "cli_run"
+    # --root is given before the sub-command; the run options follow it.
+    options = {
+        "--task": "IF-01",
+        "--case": "case_001",
+        "--agent-cmd": f"{sys.executable} scripts/mock_agent_write_artifacts.py",
+        "--out": str(out_dir),
+    }
+    argv = ["--root", str(root_dir()), "run"]
+    for flag, value in options.items():
+        argv.extend((flag, value))
+    exit_code = cli_main(argv)
+    summary = json.loads(capsys.readouterr().out)
+
+    assert exit_code == 0
+    assert summary["status"] == "passed"
+    assert summary["success"] is True
+    assert (out_dir / "run.json").exists()