From efa00d532ef91af8b161013248259794df37020c Mon Sep 17 00:00:00 2001 From: "Vitaly D." Date: Mon, 25 May 2026 19:57:37 +0300 Subject: [PATCH] feat(runner): add local command runner MVP Why: - Agent Bench Lab needs a product-neutral way to run real external agent setups against existing task families, not only score prebuilt sample artifacts. - The runner must preserve the benchmark visibility boundary by separating agent-visible task packets from scorer-visible fixtures. What changed: - Add `agent-bench run` and a local command runner that creates `task_packet/`, `artifacts/`, `run.json`, `trace.jsonl`, and `score.json`. - Exclude scorer-only/private-looking files from task packets, including `check_config.json`, while scoring against the original fixture and produced artifacts. - Add a mock smoke agent, runner tests, `make run-smoke`, CI wiring, schema event/status updates, and docs. Testing: - `make validate` - `make test` - `make smoke` - `make compare-smoke` - `make if01-smoke` - `make data01-smoke` - `make doc01-smoke` - `make sup01-smoke` - `make api01-smoke` - `make lifecycle-check` - `make mutation-smoke` - `make hardening-check` - `make run-smoke` - `make leak-check` - `python3 -m ruff check .` - `git diff --check` - tracked-file audit for private/generated/sensitive paths Risk: - moderate - command execution is intentionally local and generic; future provider/browser/MCP adapters should not bypass the task-packet visibility boundary. Related: #22 --- .github/workflows/ci.yml | 3 + Makefile | 7 +- README.md | 20 ++ docs/21-local-agent-runner.md | 140 ++++++++++ docs/README.md | 7 +- schemas/run.schema.json | 1 + schemas/trace_event.schema.json | 8 + scripts/mock_agent_write_artifacts.py | 38 +++ src/agent_bench_lab/cli.py | 41 +++ src/agent_bench_lab/runner.py | 376 ++++++++++++++++++++++++++ tests/test_runner.py | 166 ++++++++++++ 11 files changed, 803 insertions(+), 4 deletions(-) create mode 100644 docs/21-local-agent-runner.md create mode 100644 scripts/mock_agent_write_artifacts.py create mode 100644 src/agent_bench_lab/runner.py create mode 100644 tests/test_runner.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b2c9064..4618f71 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -60,6 +60,9 @@ jobs: - name: Run hardening check run: make hardening-check + - name: Run local agent runner smoke + run: make run-smoke + - name: Run leak check run: make leak-check diff --git a/Makefile b/Makefile index 80dcc7a..d7cc3ed 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ PYTHON ?= python3 PYTHONPATH ?= src -.PHONY: validate list smoke compare-smoke if01-smoke data01-smoke doc01-smoke sup01-smoke api01-smoke lifecycle-check mutation-smoke hardening-check leak-check test +.PHONY: validate list smoke compare-smoke if01-smoke data01-smoke doc01-smoke sup01-smoke api01-smoke lifecycle-check mutation-smoke hardening-check run-smoke leak-check test validate: PYTHONPATH=$(PYTHONPATH) $(PYTHON) -m agent_bench_lab.cli validate @@ -67,6 +67,11 @@ mutation-smoke: hardening-check: $(PYTHON) scripts/check_hardening_gates.py +run-smoke: + PYTHONPATH=$(PYTHONPATH) $(PYTHON) -m agent_bench_lab.cli run --task IF-01 --case case_001 --agent-cmd "$(PYTHON) scripts/mock_agent_write_artifacts.py" --out runs/manual/mock/IF-01_case_001 + PYTHONPATH=$(PYTHONPATH) $(PYTHON) -m agent_bench_lab.cli run --task DATA-01 --case case_001 --agent-cmd "$(PYTHON) scripts/mock_agent_write_artifacts.py" --out runs/manual/mock/DATA-01_case_001 + PYTHONPATH=$(PYTHONPATH) $(PYTHON) -m agent_bench_lab.cli compare --baseline runs/manual/mock --candidate runs/manual/mock --out reports/generated/compare_run_smoke.md + leak-check: $(PYTHON) scripts/public_leak_check.py . diff --git a/README.md b/README.md index 6999e51..e3ebe33 100644 --- a/README.md +++ b/README.md @@ -95,6 +95,7 @@ This repository is a **v0 public starter**. It contains: - minimal Python CLI scaffolding; - sample public fixtures; - sample scorers plus hardened IF-01, DATA-01, DOC-01, SUP-01, and API-01 artifact/state-based scorers; +- a local command-based runner for external agent setups; - documentation for benchmark design, metrics, anti-overfitting, lifecycle status, hardening gates, and research radar process. It intentionally does **not** contain private holdout tasks, production secrets, personal data, or benchmark answers for real evaluation runs. @@ -176,6 +177,24 @@ make leak-check The examples directory intentionally starts mostly empty. Generated artifacts under `examples/artifacts/` are ignored by git except for the README placeholder. +## Run an external agent setup + +Use `agent-bench run` to hand an agent-visible task packet to any local command and score the artifacts it writes: + +```bash +agent-bench run \ + --task IF-01 \ + --case case_001 \ + --agent-cmd "python3 scripts/mock_agent_write_artifacts.py" \ + --out runs/manual/mock/IF-01_case_001 +``` + +The command receives `AGENT_BENCH_TASK_PACKET` and `AGENT_BENCH_ARTIFACTS_DIR`. It should write final artifacts to the artifacts directory. The runner then writes `run.json`, `trace.jsonl`, and `score.json`. + +The task packet excludes scorer-only files such as `check_config.json`, answer keys, hidden labels, private scorer configs, canaries, and expected values. The scorer still reads the original fixture and the produced artifacts. + +See [Local Agent Runner MVP](docs/21-local-agent-runner.md). + ## Compare two agent setups Create two local smoke-run directories and compare them: @@ -314,6 +333,7 @@ agent-bench-lab/ - [DOC-01 decision-grade pattern](docs/13-doc01-decision-grade.md) - [SUP-01 decision-grade pattern](docs/14-sup01-decision-grade.md) - [API-01 decision-grade pattern](docs/15-api01-decision-grade.md) +- [Local Agent Runner MVP](docs/21-local-agent-runner.md) - [Public release checklist](docs/public-release-checklist.md) - [v0 roadmap](docs/roadmap-v0.md) diff --git a/docs/21-local-agent-runner.md b/docs/21-local-agent-runner.md new file mode 100644 index 0000000..11ff366 --- /dev/null +++ b/docs/21-local-agent-runner.md @@ -0,0 +1,140 @@ +# Local Agent Runner MVP + +The local runner lets Agent Bench Lab run an external command against an existing public task case. + +It is a command adapter, not a provider adapter. + +It does not implement OpenAI, Anthropic, MCP, browser automation, private bundle mounting, scheduled evals, or a sandbox. Any local agent setup can be wrapped as a command as long as it writes artifacts to the artifacts directory provided by the runner. + +## Command + +```bash +agent-bench run \ + --task IF-01 \ + --case case_001 \ + --agent-cmd "python3 scripts/mock_agent_write_artifacts.py" \ + --out runs/manual/mock/IF-01_case_001 +``` + +The command creates: + +```text +runs/manual/mock/IF-01_case_001/ + run.json + score.json + trace.jsonl + artifacts/ + task_packet/ +``` + +If `--out` is omitted, the runner writes under `runs/manual/...`. Local run outputs are ignored by git. + +## Environment + +The external command receives: + +```text +AGENT_BENCH_TASK_ID +AGENT_BENCH_CASE_ID +AGENT_BENCH_RUN_ID +AGENT_BENCH_TASK_PACKET +AGENT_BENCH_ARTIFACTS_DIR +AGENT_BENCH_AGENT_CONFIG +``` + +The agent command should write final artifacts to: + +```text +$AGENT_BENCH_ARTIFACTS_DIR +``` + +## Visibility Boundary + +The runner keeps the agent-visible task packet separate from the scorer-visible fixture. + +The agent may see: + +- task prompt; +- public task metadata; +- public case spec; +- safe public fixture inputs such as data, corpora, inbox files, API catalogs, policies, and state fixtures. + +The agent must not see: + +- `check_config.json`; +- answer keys; +- hidden labels; +- private scorer configs; +- canaries; +- expected values; +- private holdouts; +- private eval bundle contents. + +The scorer still receives the original fixture directory and the produced artifact directory. This keeps public smoke runs aligned with the same visibility model needed for private holdouts. + +## Run and Score Records + +`run.json` records: + +- run id; +- task and case id; +- task version; +- agent config id/hash; +- command hash; +- timing; +- status; +- score summary; +- paths to task packet, artifacts, score, and trace. + +`trace.jsonl` records minimal runner lifecycle events: + +- `run_started`; +- `task_packet_created`; +- `agent_command_started`; +- `agent_command_completed`; +- `agent_command_timeout`; +- `scorer_started`; +- `scorer_completed`; +- `run_completed`. + +Command output snippets are bounded and redacted. The runner does not store raw secrets or private scorer-only content. + +## Compare Setups + +Run two agent setups into separate directories, then compare the resulting `score.json` files: + +```bash +agent-bench run \ + --task DATA-01 \ + --case case_001 \ + --agent-cmd "python3 my_agent_a.py" \ + --out runs/setup_a/DATA-01_case_001 + +agent-bench run \ + --task DATA-01 \ + --case case_001 \ + --agent-cmd "python3 my_agent_b.py" \ + --out runs/setup_b/DATA-01_case_001 + +agent-bench compare \ + --baseline runs/setup_a \ + --candidate runs/setup_b +``` + +## Smoke Agent + +`scripts/mock_agent_write_artifacts.py` is a test helper. It writes valid public sample artifacts for smoke cases into `$AGENT_BENCH_ARTIFACTS_DIR`. + +It is not a benchmarked agent and should not be used as evidence of agent capability. + +## Non-Goals + +The MVP intentionally does not provide: + +- provider-specific adapters; +- live SaaS or MCP integrations; +- browser or office workflow runners; +- private holdout storage; +- private bundle runtime; +- automatic GitHub issues or commits; +- scheduled evals. diff --git a/docs/README.md b/docs/README.md index 473b8de..8d2e64e 100644 --- a/docs/README.md +++ b/docs/README.md @@ -26,6 +26,7 @@ Start here: 22. [Suite strategy](18-suite-strategy.md) 23. [Report schema v1 guidance](19-report-schema-v1.md) 24. [Research Radar](20-research-radar.md) -25. [v0 roadmap](roadmap-v0.md) -26. [Public release checklist](public-release-checklist.md) -27. [Decision log template](decision-log-template.md) +25. [Local Agent Runner MVP](21-local-agent-runner.md) +26. [v0 roadmap](roadmap-v0.md) +27. [Public release checklist](public-release-checklist.md) +28. [Decision log template](decision-log-template.md) diff --git a/schemas/run.schema.json b/schemas/run.schema.json index bf481ac..a77ccf5 100644 --- a/schemas/run.schema.json +++ b/schemas/run.schema.json @@ -50,6 +50,7 @@ "passed", "failed", "timeout", + "error", "invalid", "environment_error" ] diff --git a/schemas/trace_event.schema.json b/schemas/trace_event.schema.json index 8d03448..41e4208 100644 --- a/schemas/trace_event.schema.json +++ b/schemas/trace_event.schema.json @@ -23,6 +23,14 @@ "tool_result", "artifact_write", "scorer_event", + "run_started", + "task_packet_created", + "agent_command_started", + "agent_command_completed", + "agent_command_timeout", + "scorer_started", + "scorer_completed", + "run_completed", "error" ] }, diff --git a/scripts/mock_agent_write_artifacts.py b/scripts/mock_agent_write_artifacts.py new file mode 100644 index 0000000..610e02a --- /dev/null +++ b/scripts/mock_agent_write_artifacts.py @@ -0,0 +1,38 @@ +from __future__ import annotations + +import os +import runpy +import shutil +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[1] + + +def copy_tree_contents(source: Path, destination: Path) -> None: + destination.mkdir(parents=True, exist_ok=True) + for path in sorted(source.rglob("*")): + if not path.is_file(): + continue + target = destination / path.relative_to(source) + target.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(path, target) + + +def main() -> int: + task_id = os.environ.get("AGENT_BENCH_TASK_ID") + case_id = os.environ.get("AGENT_BENCH_CASE_ID") + artifacts_dir = os.environ.get("AGENT_BENCH_ARTIFACTS_DIR") + if not task_id or not case_id or not artifacts_dir: + raise SystemExit("missing AGENT_BENCH_TASK_ID, AGENT_BENCH_CASE_ID, or AGENT_BENCH_ARTIFACTS_DIR") + + runpy.run_path(str(ROOT / "scripts" / "create_sample_artifacts.py"), run_name="__main__") + source = ROOT / "examples" / "artifacts" / task_id / case_id + if not source.is_dir(): + raise SystemExit(f"no sample artifacts for {task_id}/{case_id}") + copy_tree_contents(source, Path(artifacts_dir)) + print(f"mock artifacts written for {task_id}/{case_id}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/src/agent_bench_lab/cli.py b/src/agent_bench_lab/cli.py index 7051cca..ed3e274 100644 --- a/src/agent_bench_lab/cli.py +++ b/src/agent_bench_lab/cli.py @@ -7,6 +7,7 @@ from .compare import compare_score_dirs, render_markdown_report, write_csv_report from .registry import list_tasks, repo_root_from, validate_all +from .runner import run_agent_task from .scoring import score_task, write_score @@ -73,6 +74,37 @@ def cmd_compare(args: argparse.Namespace) -> int: return 1 if result["missing_scores"] else 0 +def cmd_run(args: argparse.Namespace) -> int: + root = repo_root_from(args.root) + agent_config = Path(args.agent_config).resolve() if args.agent_config else None + out_dir = Path(args.out).resolve() if args.out else None + try: + run_record = run_agent_task( + root=root, + task_id=args.task, + case_id=args.case, + agent_cmd=args.agent_cmd, + agent_config_path=agent_config, + out_dir=out_dir, + timeout_seconds=args.timeout, + ) + except (FileNotFoundError, ValueError, json.JSONDecodeError) as exc: + print(f"error: {exc}", file=sys.stderr) + return 1 + + summary = { + "run_id": run_record["run_id"], + "task_id": run_record["task_id"], + "case_id": run_record["case_id"], + "status": run_record["status"], + "score": run_record.get("score"), + "success": run_record.get("success"), + "output_path": str(Path(run_record["paths"]["artifacts"]).parent), + } + print(json.dumps(summary, indent=2, ensure_ascii=False)) + return 0 if run_record["status"] == "passed" else 2 + + def build_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser(prog="agent-bench") parser.add_argument("--root", default=".", help="Repository root") @@ -100,6 +132,15 @@ def build_parser() -> argparse.ArgumentParser: p_compare.add_argument("--csv") p_compare.set_defaults(func=cmd_compare) + p_run = sub.add_parser("run", help="Run an external command against a task case") + p_run.add_argument("--task", required=True) + p_run.add_argument("--case", default="case_001") + p_run.add_argument("--agent-cmd", required=True) + p_run.add_argument("--agent-config") + p_run.add_argument("--out") + p_run.add_argument("--timeout", type=int, default=600) + p_run.set_defaults(func=cmd_run) + return parser diff --git a/src/agent_bench_lab/runner.py b/src/agent_bench_lab/runner.py new file mode 100644 index 0000000..563dfdd --- /dev/null +++ b/src/agent_bench_lab/runner.py @@ -0,0 +1,376 @@ +from __future__ import annotations + +import json +import os +import shutil +import subprocess +from datetime import UTC, datetime +from hashlib import sha256 +from pathlib import Path +from typing import Any + +from .redaction import redact_text +from .registry import load_task, repo_root_from +from .run_records import load_agent_config, load_task_version +from .scoring import score_task, write_score + +RUN_STATUSES = {"passed", "failed", "timeout", "error"} +TRACE_SNIPPET_CHARS = 2000 +AGENT_VISIBLE_TASK_FILES = ("prompt.md", "task.json") +SCORER_ONLY_FILENAMES = { + "check_config.json", +} +SCORER_ONLY_PATTERNS = ( + "answer_key", + "hidden_label", + "private", + "canary", + "scorer_config", + "expected", + "rubric_private", +) + + +def utc_now() -> str: + return datetime.now(UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z") + + +def command_hash(agent_cmd: str) -> str: + return sha256(agent_cmd.encode("utf-8")).hexdigest() + + +def safe_slug(value: str) -> str: + return "".join(char if char.isalnum() or char in ("-", "_") else "_" for char in value) + + +def safe_snippet(text: str | None, limit: int = TRACE_SNIPPET_CHARS) -> str: + if not text: + return "" + return redact_text(text[:limit]) + + +def is_agent_visible_path(path: Path) -> bool: + if path.name.lower() in SCORER_ONLY_FILENAMES: + return False + lowered = path.as_posix().lower() + return not any(pattern in lowered for pattern in SCORER_ONLY_PATTERNS) + + +def copy_public_file(source: Path, destination: Path) -> None: + destination.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(source, destination) + + +def write_json(path: Path, data: dict[str, Any]) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(data, indent=2, ensure_ascii=False) + "\n", encoding="utf-8") + + +def append_trace_event( + trace_path: Path, + *, + run_id: str, + event_type: str, + actor: str, + metadata: dict[str, Any] | None = None, +) -> None: + trace_path.parent.mkdir(parents=True, exist_ok=True) + event = { + "ts": utc_now(), + "run_id": run_id, + "event_type": event_type, + "actor": actor, + "metadata": metadata or {}, + } + with trace_path.open("a", encoding="utf-8") as handle: + handle.write(json.dumps(event, ensure_ascii=False) + "\n") + + +def create_task_packet( + *, + root: Path, + task_id: str, + case_id: str, + task_dir: Path, + fixture_dir: Path, + packet_dir: Path, +) -> dict[str, Any]: + if packet_dir.exists(): + shutil.rmtree(packet_dir) + packet_dir.mkdir(parents=True) + + included: list[str] = [] + excluded_count = 0 + + for filename in AGENT_VISIBLE_TASK_FILES: + source = task_dir / filename + if not source.exists(): + continue + destination = packet_dir / filename + copy_public_file(source, destination) + included.append(destination.relative_to(packet_dir).as_posix()) + + for source in sorted(fixture_dir.rglob("*")): + if not source.is_file() or source.is_symlink(): + continue + rel_fixture_path = source.relative_to(fixture_dir) + if not is_agent_visible_path(rel_fixture_path): + excluded_count += 1 + continue + destination = packet_dir / "fixture" / rel_fixture_path + copy_public_file(source, destination) + included.append(destination.relative_to(packet_dir).as_posix()) + + prompt_source = task_dir / "prompt.md" + spec_source = fixture_dir / "spec.md" + prompt_parts = [] + if prompt_source.exists(): + prompt_parts.append(prompt_source.read_text(encoding="utf-8")) + if spec_source.exists() and is_agent_visible_path(Path("spec.md")): + prompt_parts.extend(["", "## Case Spec", "", spec_source.read_text(encoding="utf-8")]) + task_prompt = "\n".join(prompt_parts).strip() + "\n" + (packet_dir / "task_prompt.md").write_text(task_prompt, encoding="utf-8") + included.append("task_prompt.md") + + readme = ( + "# Agent Bench Task Packet\n\n" + "This directory contains only agent-visible task instructions and public fixture inputs.\n" + "Write final artifacts to the path in `AGENT_BENCH_ARTIFACTS_DIR`.\n\n" + "Scorer-only files such as `check_config.json`, hidden labels, answer keys, private scorer " + "configs, canaries, and expected values are intentionally excluded.\n" + ) + (packet_dir / "README.md").write_text(readme, encoding="utf-8") + included.append("README.md") + + manifest = { + "task_id": task_id, + "case_id": case_id, + "source_fixture": (fixture_dir.relative_to(root)).as_posix(), + "included_files": sorted(set(included)), + "excluded_file_count": excluded_count, + "excluded_reason": "scorer_or_private_visibility_boundary", + "visibility": "agent", + "scorer_only_files_excluded": True, + } + write_json(packet_dir / "manifest.json", manifest) + return manifest + + +def default_out_dir(root: Path, agent_config_id: str, task_id: str, case_id: str, run_id: str) -> Path: + return root / "runs" / "manual" / safe_slug(agent_config_id) / f"{task_id}_{case_id}_{run_id}" + + +def build_run_id(agent_config_id: str, task_id: str, case_id: str) -> str: + timestamp = datetime.now(UTC).strftime("%Y%m%dT%H%M%SZ") + return safe_slug(f"{agent_config_id}_{task_id}_{case_id}_{timestamp}") + + +def run_agent_task( + task_id: str, + case_id: str, + agent_cmd: str, + agent_config_path: Path | None, + out_dir: Path | None = None, + timeout_seconds: int = 600, + root: Path | None = None, +) -> dict[str, Any]: + root = repo_root_from(root) + task_dir = root / "tasks" / task_id + fixture_dir = root / "fixtures" / "public" / task_id / case_id + if not task_dir.is_dir(): + raise FileNotFoundError(f"Unknown task: {task_id}") + if not fixture_dir.is_dir(): + raise FileNotFoundError(f"Missing public fixture for {task_id}/{case_id}: {fixture_dir}") + + task = load_task(task_dir) + agent_config_id, agent_config_hash = load_agent_config(agent_config_path) + run_id = build_run_id(agent_config_id, task_id, case_id) + resolved_out_dir = out_dir.resolve() if out_dir else default_out_dir( + root, agent_config_id, task_id, case_id, run_id + ) + resolved_out_dir.mkdir(parents=True, exist_ok=True) + + task_packet_dir = resolved_out_dir / "task_packet" + artifacts_dir = resolved_out_dir / "artifacts" + score_path = resolved_out_dir / "score.json" + run_path = resolved_out_dir / "run.json" + trace_path = resolved_out_dir / "trace.jsonl" + artifacts_dir.mkdir(parents=True, exist_ok=True) + if trace_path.exists(): + trace_path.unlink() + + started_at = utc_now() + append_trace_event(trace_path, run_id=run_id, event_type="run_started", actor="runner") + manifest = create_task_packet( + root=root, + task_id=task_id, + case_id=case_id, + task_dir=task_dir, + fixture_dir=fixture_dir, + packet_dir=task_packet_dir, + ) + append_trace_event( + trace_path, + run_id=run_id, + event_type="task_packet_created", + actor="runner", + metadata={"included_files": len(manifest["included_files"])}, + ) + + env = os.environ.copy() + env.update( + { + "AGENT_BENCH_TASK_ID": task_id, + "AGENT_BENCH_CASE_ID": case_id, + "AGENT_BENCH_RUN_ID": run_id, + "AGENT_BENCH_TASK_PACKET": str(task_packet_dir), + "AGENT_BENCH_ARTIFACTS_DIR": str(artifacts_dir), + "AGENT_BENCH_AGENT_CONFIG": str(agent_config_path or ""), + } + ) + + returncode: int | None = None + stdout_snippet = "" + stderr_snippet = "" + status = "failed" + append_trace_event( + trace_path, + run_id=run_id, + event_type="agent_command_started", + actor="agent", + metadata={"agent_cmd_hash": command_hash(agent_cmd)}, + ) + try: + completed = subprocess.run( + agent_cmd, + shell=True, + cwd=root, + env=env, + text=True, + capture_output=True, + timeout=timeout_seconds, + check=False, + ) + returncode = completed.returncode + stdout_snippet = safe_snippet(completed.stdout) + stderr_snippet = safe_snippet(completed.stderr) + append_trace_event( + trace_path, + run_id=run_id, + event_type="agent_command_completed", + actor="agent", + metadata={ + "returncode": returncode, + "stdout_snippet": stdout_snippet, + "stderr_snippet": stderr_snippet, + }, + ) + except subprocess.TimeoutExpired as exc: + status = "timeout" + stdout = ( + exc.stdout.decode("utf-8", errors="replace") + if isinstance(exc.stdout, bytes) + else exc.stdout + ) + stderr = ( + exc.stderr.decode("utf-8", errors="replace") + if isinstance(exc.stderr, bytes) + else exc.stderr + ) + stdout_snippet = safe_snippet(stdout) + stderr_snippet = safe_snippet(stderr) + append_trace_event( + trace_path, + run_id=run_id, + event_type="agent_command_timeout", + actor="agent", + metadata={ + "timeout_seconds": timeout_seconds, + "stdout_snippet": stdout_snippet, + "stderr_snippet": stderr_snippet, + }, + ) + + append_trace_event(trace_path, run_id=run_id, event_type="scorer_started", actor="scorer") + try: + score = score_task( + root=root, + task_id=task_id, + case_id=case_id, + artifacts_dir=artifacts_dir, + agent_config_path=agent_config_path, + run_id=run_id, + ) + except Exception as exc: # noqa: BLE001 - runner must preserve local failure records. + score = { + "run_id": run_id, + "task_id": task_id, + "case_id": case_id, + "task_version": load_task_version(task_dir), + "scorer_version": "error", + "agent_config_id": agent_config_id, + "agent_config_hash": agent_config_hash, + "success": False, + "score": 0.0, + "pass_threshold": 0.8, + "components": {}, + "policy_violations": [], + "errors": [redact_text(str(exc))], + "artifact_hashes": {}, + "metadata": { + "latency_seconds": None, + "cost_usd": None, + "tool_calls": None, + "model_calls": None, + "notes": None, + }, + } + write_score(score, score_path) + append_trace_event( + trace_path, + run_id=run_id, + event_type="scorer_completed", + actor="scorer", + metadata={"score": score.get("score"), "success": score.get("success")}, + ) + + if status != "timeout": + status = "passed" if returncode == 0 and score.get("success") else "failed" + completed_at = utc_now() + run_record = { + "run_id": run_id, + "task_id": task_id, + "case_id": case_id, + "task_version": load_task_version(task_dir), + "task_name": task.get("name"), + "agent_config_id": agent_config_id, + "agent_config_hash": agent_config_hash, + "agent_cmd_hash": command_hash(agent_cmd), + "started_at": started_at, + "completed_at": completed_at, + "status": status, + "success": bool(score.get("success")) and status == "passed", + "score": score.get("score"), + "timeout_seconds": timeout_seconds, + "paths": { + "task_packet": str(task_packet_dir), + "artifacts": str(artifacts_dir), + "score": str(score_path), + "trace": str(trace_path), + }, + "command": { + "returncode": returncode, + "agent_cmd_redacted": redact_text(agent_cmd), + "stdout_snippet": stdout_snippet, + "stderr_snippet": stderr_snippet, + }, + } + write_json(run_path, run_record) + append_trace_event( + trace_path, + run_id=run_id, + event_type="run_completed", + actor="runner", + metadata={"status": status, "score": score.get("score"), "success": run_record["success"]}, + ) + return run_record diff --git a/tests/test_runner.py b/tests/test_runner.py new file mode 100644 index 0000000..48a9d2e --- /dev/null +++ b/tests/test_runner.py @@ -0,0 +1,166 @@ +from __future__ import annotations + +import json +import sys +from pathlib import Path + +from agent_bench_lab.cli import main as cli_main +from agent_bench_lab.runner import create_task_packet, run_agent_task + + +def root_dir() -> Path: + return Path(__file__).resolve().parents[1] + + +def test_task_packet_excludes_check_config(tmp_path): + out_dir = tmp_path / "packet_run" + run_record = run_agent_task( + root=root_dir(), + task_id="IF-01", + case_id="case_001", + agent_cmd=f"{sys.executable} -c \"pass\"", + agent_config_path=None, + out_dir=out_dir, + timeout_seconds=30, + ) + packet_dir = Path(run_record["paths"]["task_packet"]) + + assert (packet_dir / "fixture" / "spec.md").exists() + assert not (packet_dir / "fixture" / "check_config.json").exists() + assert "check_config.json" not in (packet_dir / "manifest.json").read_text(encoding="utf-8") + + +def test_task_packet_excludes_denylisted_scorer_only_files(tmp_path): + task_dir = tmp_path / "tasks" / "T-01" + fixture_dir = tmp_path / "fixtures" / "public" / "T-01" / "case_001" + packet_dir = tmp_path / "packet" + task_dir.mkdir(parents=True) + fixture_dir.mkdir(parents=True) + (task_dir / "prompt.md").write_text("Prompt", encoding="utf-8") + (task_dir / "task.json").write_text("{}", encoding="utf-8") + (fixture_dir / "spec.md").write_text("Spec", encoding="utf-8") + (fixture_dir / "answer_key.json").write_text("{}", encoding="utf-8") + (fixture_dir / "hidden_label.txt").write_text("label", encoding="utf-8") + (fixture_dir / "data.csv").write_text("id,value\n1,2\n", encoding="utf-8") + + manifest = create_task_packet( + root=tmp_path, + task_id="T-01", + case_id="case_001", + task_dir=task_dir, + fixture_dir=fixture_dir, + packet_dir=packet_dir, + ) + + assert (packet_dir / "fixture" / "data.csv").exists() + assert not (packet_dir / "fixture" / "answer_key.json").exists() + assert not (packet_dir / "fixture" / "hidden_label.txt").exists() + assert manifest["excluded_file_count"] == 2 + + +def test_runner_writes_run_trace_and_score_with_mock_agent(tmp_path): + out_dir = tmp_path / "run" + run_record = run_agent_task( + root=root_dir(), + task_id="IF-01", + case_id="case_001", + agent_cmd=f"{sys.executable} scripts/mock_agent_write_artifacts.py", + agent_config_path=None, + out_dir=out_dir, + timeout_seconds=60, + ) + + assert run_record["status"] == "passed" + assert (out_dir / "run.json").exists() + assert (out_dir / "trace.jsonl").exists() + assert (out_dir / "score.json").exists() + assert (out_dir / "artifacts" / "artifact.md").exists() + score = json.loads((out_dir / "score.json").read_text(encoding="utf-8")) + assert score["success"] + + +def test_runner_handles_agent_command_timeout(tmp_path): + out_dir = tmp_path / "timeout" + run_record = run_agent_task( + root=root_dir(), + task_id="IF-01", + case_id="case_001", + agent_cmd=f"{sys.executable} -c \"import time; time.sleep(2)\"", + agent_config_path=None, + out_dir=out_dir, + timeout_seconds=1, + ) + + assert run_record["status"] == "timeout" + assert (out_dir / "run.json").exists() + assert (out_dir / "score.json").exists() + assert "agent_command_timeout" in (out_dir / "trace.jsonl").read_text(encoding="utf-8") + + +def test_runner_handles_missing_artifacts_gracefully(tmp_path): + out_dir = tmp_path / "missing_artifacts" + run_record = run_agent_task( + root=root_dir(), + task_id="IF-01", + case_id="case_001", + agent_cmd=f"{sys.executable} -c \"print('no artifacts')\"", + agent_config_path=None, + out_dir=out_dir, + timeout_seconds=30, + ) + + assert run_record["status"] == "failed" + score = json.loads((out_dir / "score.json").read_text(encoding="utf-8")) + assert not score["success"] + assert score["score"] < score["pass_threshold"] + + +def test_runner_redacts_unsafe_stdout_and_stderr(tmp_path): + out_dir = tmp_path / "redacted" + command = ( + f"{sys.executable} -c \"import sys; " + "print('correct answer was x expected=y CANARY_123'); " + "print('api_key=secret', file=sys.stderr)\"" + ) + run_record = run_agent_task( + root=root_dir(), + task_id="IF-01", + case_id="case_001", + agent_cmd=command, + agent_config_path=None, + out_dir=out_dir, + timeout_seconds=30, + ) + trace_text = (out_dir / "trace.jsonl").read_text(encoding="utf-8") + run_text = json.dumps(run_record) + + assert "[REDACTED]" in trace_text + assert "CANARY_123" not in trace_text + assert "api_key" not in trace_text + assert "CANARY_123" not in run_text + assert "api_key" not in run_text + + +def test_cli_agent_bench_run_works_with_mock_agent(tmp_path, capsys): + out_dir = tmp_path / "cli_run" + exit_code = cli_main( + [ + "--root", + str(root_dir()), + "run", + "--task", + "IF-01", + "--case", + "case_001", + "--agent-cmd", + f"{sys.executable} scripts/mock_agent_write_artifacts.py", + "--out", + str(out_dir), + ] + ) + output = json.loads(capsys.readouterr().out) + + assert exit_code == 0 + assert output["status"] == "passed" + assert output["success"] is True + assert (out_dir / "run.json").exists()