diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f039a61..b2c9064 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -51,6 +51,15 @@ jobs: - name: Run API-01 smoke run: make api01-smoke + - name: Run lifecycle check + run: make lifecycle-check + + - name: Run mutation smoke + run: make mutation-smoke + + - name: Run hardening check + run: make hardening-check + - name: Run leak check run: make leak-check diff --git a/Makefile b/Makefile index e66fa0a..80dcc7a 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ PYTHON ?= python3 PYTHONPATH ?= src -.PHONY: validate list smoke compare-smoke if01-smoke data01-smoke doc01-smoke sup01-smoke api01-smoke leak-check test +.PHONY: validate list smoke compare-smoke if01-smoke data01-smoke doc01-smoke sup01-smoke api01-smoke lifecycle-check mutation-smoke hardening-check leak-check test validate: PYTHONPATH=$(PYTHONPATH) $(PYTHON) -m agent_bench_lab.cli validate @@ -58,6 +58,15 @@ api01-smoke: $(PYTHON) scripts/create_api01_mutation.py --out artifacts/mutations/API-01/case_mutation_001 PYTHONPATH=$(PYTHONPATH) $(PYTHON) -m pytest -q tests/test_api01.py +lifecycle-check: + $(PYTHON) scripts/check_lifecycle.py + +mutation-smoke: + $(PYTHON) scripts/run_mutation_smoke.py + +hardening-check: + $(PYTHON) scripts/check_hardening_gates.py + leak-check: $(PYTHON) scripts/public_leak_check.py . diff --git a/README.md b/README.md index ac4f2e9..e46cf79 100644 --- a/README.md +++ b/README.md @@ -57,6 +57,20 @@ The Private Eval Layer holds hidden labels, private holdouts, answer keys, prote See [Private Eval Layer](docs/private-eval-layer.md), [Scorer type contracts](docs/scorer-types.md), and [Reporting and feedback](docs/reporting-and-feedback.md). +## Benchmark lifecycle and hardening gates + +After the first five decision-grade public patterns, v0.6 adds standard-layer gates instead of another task family. 
+ +Lifecycle metadata declares whether each task family is `experimental`, `decision-grade`, `verified`, or `deprecated`. Hardening metadata declares mutation smoke scripts and exploit smoke status for decision-grade families. No task is marked `verified` yet. + +```bash +make lifecycle-check +make mutation-smoke +make hardening-check +``` + +See [Benchmark lifecycle](docs/16-benchmark-lifecycle.md), [Mutation and exploit gates](docs/17-mutation-and-exploit-gates.md), [Suite strategy](docs/18-suite-strategy.md), and [Report schema v1 guidance](docs/19-report-schema-v1.md). + ## Current status This repository is a **v0 public starter**. It contains: @@ -67,7 +81,7 @@ This repository is a **v0 public starter**. It contains: - minimal Python CLI scaffolding; - sample public fixtures; - sample scorers plus hardened IF-01, DATA-01, DOC-01, SUP-01, and API-01 artifact/state-based scorers; -- documentation for benchmark design, metrics, and anti-overfitting. +- documentation for benchmark design, metrics, anti-overfitting, lifecycle status, and hardening gates. It intentionally does **not** contain private holdout tasks, production secrets, personal data, or benchmark answers for real evaluation runs. 
@@ -140,6 +154,9 @@ Without installing the package, use the source-tree Make targets: make validate make test make smoke +make lifecycle-check +make mutation-smoke +make hardening-check make leak-check ``` diff --git a/configs/hardening_gates.json b/configs/hardening_gates.json new file mode 100644 index 0000000..33a66a5 --- /dev/null +++ b/configs/hardening_gates.json @@ -0,0 +1,86 @@ +{ + "version": "0.6.0", + "schema_version": 1, + "tasks": { + "IF-01": { + "task_id": "IF-01", + "mutation_smoke_required": true, + "mutation_script": "scripts/create_if01_mutation.py", + "mutation_output": "artifacts/mutation-smoke/IF-01/case_mutation_001", + "expected_output_files": [ + "spec.md", + "check_config.json" + ], + "exploit_smoke_required": true, + "exploit_smoke_status": "planned", + "reason": "IF-01 already tests extra files, forbidden sections, and banned phrases; v0.6 records the gate and future standard exploit smoke shape.", + "public_safe": true + }, + "DATA-01": { + "task_id": "DATA-01", + "mutation_smoke_required": true, + "mutation_script": "scripts/create_data01_mutation.py", + "mutation_output": "artifacts/mutation-smoke/DATA-01/case_mutation_001", + "expected_output_files": [ + "spec.md", + "check_config.json", + "data/events.csv", + "data/customers.csv", + "data/analytics.db" + ], + "exploit_smoke_required": true, + "exploit_smoke_status": "planned", + "reason": "DATA-01 already checks unsupported metrics and invalid artifacts; v0.6 keeps exploit gate status explicit without adding private data.", + "public_safe": true + }, + "DOC-01": { + "task_id": "DOC-01", + "mutation_smoke_required": true, + "mutation_script": "scripts/create_doc01_mutation.py", + "mutation_output": "artifacts/mutation-smoke/DOC-01/case_mutation_001", + "expected_output_files": [ + "spec.md", + "check_config.json", + "corpus/product_policy.md" + ], + "exploit_smoke_required": true, + "exploit_smoke_status": "planned", + "reason": "DOC-01 already checks unsupported claims, stale 
sources, and citation evidence; standard exploit smoke remains a declared next hardening layer.", + "public_safe": true + }, + "SUP-01": { + "task_id": "SUP-01", + "mutation_smoke_required": true, + "mutation_script": "scripts/create_sup01_mutation.py", + "mutation_output": "artifacts/mutation-smoke/SUP-01/case_mutation_001", + "expected_output_files": [ + "spec.md", + "check_config.json", + "policy.md", + "customer_profile.json", + "inbox/email_001.eml" + ], + "exploit_smoke_required": true, + "exploit_smoke_status": "planned", + "reason": "SUP-01 already checks prohibited promises and scorer-only labels; private prompt-injection and canary gates stay outside the public repo.", + "public_safe": true + }, + "API-01": { + "task_id": "API-01", + "mutation_smoke_required": true, + "mutation_script": "scripts/create_api01_mutation.py", + "mutation_output": "artifacts/mutation-smoke/API-01/case_mutation_001", + "expected_output_files": [ + "spec.md", + "check_config.json", + "api_catalog.json", + "api_state.json", + "policy.md" + ], + "exploit_smoke_required": true, + "exploit_smoke_status": "planned", + "reason": "API-01 already checks forbidden endpoints and wrong state mutations; future private gates can add trap endpoints and canaries.", + "public_safe": true + } + } +} diff --git a/configs/task_lifecycle.json b/configs/task_lifecycle.json new file mode 100644 index 0000000..3f8d6be --- /dev/null +++ b/configs/task_lifecycle.json @@ -0,0 +1,224 @@ +{ + "version": "0.6.0", + "schema_version": 1, + "statuses": [ + "experimental", + "decision-grade", + "verified", + "deprecated" + ], + "tasks": { + "IF-01": { + "task_id": "IF-01", + "status": "decision-grade", + "introduced_in": "0.1.0", + "current_version": "0.1.0", + "suite_ids": [ + "core-v0" + ], + "primary_oracle": "artifact_exact", + "scorer_contracts": [ + "artifact_exact", + "schema_contract", + "mutation_robustness" + ], + "public_cases": true, + "docs_reference": "docs/11-if01-decision-grade.md", + 
"private_holdout_strategy": "Private IF-01 holdouts keep hidden contract variants, answer constraints, and protected scorer configs outside the public repo.", + "mutation_strategy": "Use create_if01_mutation.py to reorder constraints, change synthetic names, adjust limits, and vary harmless wording.", + "exploit_smoke_status": "planned", + "has_redacted_feedback": true, + "verified": false, + "notes": "First decision-grade public pattern for strict instruction following and artifact contracts." + }, + "DATA-01": { + "task_id": "DATA-01", + "status": "decision-grade", + "introduced_in": "0.2.0", + "current_version": "0.2.0", + "suite_ids": [ + "core-v0" + ], + "primary_oracle": "numeric_metric", + "scorer_contracts": [ + "artifact_exact", + "schema_contract", + "numeric_metric", + "claim_rubric", + "mutation_robustness" + ], + "public_cases": true, + "docs_reference": "docs/12-data01-decision-grade.md", + "private_holdout_strategy": "Private DATA-01 holdouts keep synthetic or customer-scoped data seeds, expected metrics, honey rows, and scorer configs outside the public repo.", + "mutation_strategy": "Use create_data01_mutation.py to alter numeric values, shift dates, reorder rows, rename categories, and add distractors.", + "exploit_smoke_status": "planned", + "has_redacted_feedback": true, + "verified": false, + "notes": "Decision-grade public pattern for exact data work, factual memos, and chart specifications." 
+ }, + "DOC-01": { + "task_id": "DOC-01", + "status": "decision-grade", + "introduced_in": "0.3.0", + "current_version": "0.3.0", + "suite_ids": [ + "core-v0" + ], + "primary_oracle": "claim_rubric", + "scorer_contracts": [ + "artifact_exact", + "schema_contract", + "claim_rubric", + "mutation_robustness" + ], + "public_cases": true, + "docs_reference": "docs/13-doc01-decision-grade.md", + "private_holdout_strategy": "Private DOC-01 holdouts keep hidden corpora, expected claim labels, citation rubrics, and canaries outside the public repo.", + "mutation_strategy": "Use create_doc01_mutation.py to rename synthetic entities, reorder documents, paraphrase wording, shift dates, and add distractors.", + "exploit_smoke_status": "planned", + "has_redacted_feedback": true, + "verified": false, + "notes": "Decision-grade public pattern for fixed-corpus grounded answers and citation checks." + }, + "SUP-01": { + "task_id": "SUP-01", + "status": "decision-grade", + "introduced_in": "0.4.0", + "current_version": "0.4.0", + "suite_ids": [ + "ops-local-v0" + ], + "primary_oracle": "schema_contract", + "scorer_contracts": [ + "artifact_exact", + "schema_contract", + "claim_rubric", + "trace_policy", + "mutation_robustness" + ], + "public_cases": true, + "docs_reference": "docs/14-sup01-decision-grade.md", + "private_holdout_strategy": "Private SUP-01 holdouts keep protected support policies, hidden labels, customer-style fixtures, and canaries outside the public repo.", + "mutation_strategy": "Use create_sup01_mutation.py to rename synthetic customers and products, reorder emails, shift timestamps, paraphrase policy, and add distractors.", + "exploit_smoke_status": "planned", + "has_redacted_feedback": true, + "verified": false, + "notes": "Decision-grade public pattern for support inbox triage, policy-grounded drafts, and escalations." 
+ }, + "API-01": { + "task_id": "API-01", + "status": "decision-grade", + "introduced_in": "0.5.0", + "current_version": "0.5.0", + "suite_ids": [ + "tools-local-v0" + ], + "primary_oracle": "state_diff", + "scorer_contracts": [ + "artifact_exact", + "schema_contract", + "state_diff", + "trace_policy", + "mutation_robustness" + ], + "public_cases": true, + "docs_reference": "docs/15-api01-decision-grade.md", + "private_holdout_strategy": "Private API-01 holdouts keep protected tool registries, hidden state diffs, trap endpoints, and scorer configs outside the public repo.", + "mutation_strategy": "Use create_api01_mutation.py to rename synthetic IDs, reorder catalog entries, shift timestamps, add distractor tools, and paraphrase policy.", + "exploit_smoke_status": "planned", + "has_redacted_feedback": true, + "verified": false, + "notes": "Decision-grade public pattern for local API/tool orchestration with scorer-side state simulation." + }, + "CODE-01": { + "task_id": "CODE-01", + "status": "experimental", + "introduced_in": "0.0.1", + "current_version": "0.1.0", + "suite_ids": [ + "core-v0" + ], + "primary_oracle": "artifact_exact", + "scorer_contracts": [ + "artifact_exact", + "schema_contract" + ], + "public_cases": true, + "docs_reference": "tasks/CODE-01/task.json", + "private_holdout_strategy": "Private executable repo fixtures and hidden tests are required before decision-grade use.", + "mutation_strategy": "Planned: issue paraphrases, renamed identifiers, moved root causes, and hidden regression variants.", + "exploit_smoke_status": "planned", + "has_redacted_feedback": true, + "verified": false, + "notes": "Starter scorer only; not decision-grade." 
+ }, + "TERM-02": { + "task_id": "TERM-02", + "status": "experimental", + "introduced_in": "0.0.1", + "current_version": "0.1.0", + "suite_ids": [ + "core-v0" + ], + "primary_oracle": "artifact_exact", + "scorer_contracts": [ + "artifact_exact", + "state_diff" + ], + "public_cases": true, + "docs_reference": "tasks/TERM-02/task.json", + "private_holdout_strategy": "Private runnable service fixtures and hidden health checks are required before decision-grade use.", + "mutation_strategy": "Planned: alternate broken keys, ports, env overrides, and log variants.", + "exploit_smoke_status": "planned", + "has_redacted_feedback": true, + "verified": false, + "notes": "Starter scorer only; not decision-grade." + }, + "APP-04": { + "task_id": "APP-04", + "status": "experimental", + "introduced_in": "0.0.1", + "current_version": "0.1.0", + "suite_ids": [ + "core-v0" + ], + "primary_oracle": "state_diff", + "scorer_contracts": [ + "artifact_exact", + "state_diff", + "trace_policy" + ], + "public_cases": true, + "docs_reference": "tasks/APP-04/task.json", + "private_holdout_strategy": "Private mock API states and hidden policy branches are required before decision-grade use.", + "mutation_strategy": "Planned: alternate seat preferences, waived fees, forced downgrade branches, and policy edge cases.", + "exploit_smoke_status": "planned", + "has_redacted_feedback": true, + "verified": false, + "notes": "Starter scorer only; not decision-grade." 
+ }, + "SEC-01": { + "task_id": "SEC-01", + "status": "experimental", + "introduced_in": "0.0.1", + "current_version": "0.1.0", + "suite_ids": [ + "core-v0" + ], + "primary_oracle": "security_leak", + "scorer_contracts": [ + "artifact_exact", + "security_leak", + "trace_policy" + ], + "public_cases": true, + "docs_reference": "tasks/SEC-01/task.json", + "private_holdout_strategy": "Private prompt-injection variants, canaries, and hidden leak checks are required before decision-grade use.", + "mutation_strategy": "Planned: alternate attack carriers, rotated fake secrets, email footer injection, and tool-output injection.", + "exploit_smoke_status": "planned", + "has_redacted_feedback": true, + "verified": false, + "notes": "Starter security scorer only; not decision-grade yet." + } + } +} diff --git a/docs/16-benchmark-lifecycle.md b/docs/16-benchmark-lifecycle.md new file mode 100644 index 0000000..8108e8b --- /dev/null +++ b/docs/16-benchmark-lifecycle.md @@ -0,0 +1,85 @@ +# Benchmark Lifecycle + +Agent Bench Lab task families move through explicit lifecycle statuses. The status is about +benchmark readiness, not model quality. + +## Statuses + +| Status | Meaning | Suitable use | +|---|---|---| +| `experimental` | The task exists, but the oracle, fixtures, or tests may still be starter-grade. | Demos, authoring examples, early scorer work | +| `decision-grade` | The task has a deterministic or audited primary oracle, public synthetic examples, tests, mutation strategy, private holdout guidance, normalized scores, and redacted feedback. | Serious comparisons when paired with private holdouts | +| `verified` | The task has passed an additional maintainer audit, scorer loophole review, solvability check, mutation smoke, exploit smoke, and changelog review. | High-confidence repeated evaluation | +| `deprecated` | The task is replaced, flawed, stale, or no longer maintained. | Historical comparison only | + +No task family is `verified` in v0.6. 
Verification is a later audit level. + +## Experimental + +Experimental means: + +- task metadata exists; +- public examples may exist; +- the scorer may be incomplete or sample-grade; +- hidden checks and mutation coverage may be planned only; +- the task is not suitable for decision-grade comparison. + +Experimental task families can still be useful as templates, but they should not be marketed as +reliable evaluation signals. + +## Decision-Grade + +Decision-grade means: + +- primary oracle is deterministic or audited; +- public synthetic cases exist; +- private holdout strategy is documented; +- mutation strategy is documented; +- scorer output uses normalized score records; +- redacted feedback is supported; +- leak gates pass; +- tests cover pass and fail cases; +- no live dependency is required unless the environment is snapshotted or replayed. + +Decision-grade public cases are still examples and smoke tests. Final comparisons need private +holdouts or protected bundles outside the public repo. + +## Verified + +Verified means all decision-grade criteria are met, plus: + +- maintainer audit completed; +- scorer loophole review completed; +- public cases are solvable; +- mutation smoke passes; +- exploit smoke passes or has an explicit not-applicable justification; +- changelog and version policy are clean; +- known limitations are documented. + +Verification should be conservative. It is better to keep a task family decision-grade than to mark +it verified without an audit trail. + +## Deprecated + +Deprecated means the task family should not be used for new comparisons because it is: + +- replaced by a better task family; +- known to be flawed; +- stale or unsupported; +- incompatible with current scorer contracts. + +Deprecated tasks should keep enough metadata for historical interpretation. 
+ +## Config + +Lifecycle metadata lives in: + +```text +configs/task_lifecycle.json +``` + +Validate it with: + +```bash +make lifecycle-check +``` diff --git a/docs/17-mutation-and-exploit-gates.md b/docs/17-mutation-and-exploit-gates.md new file mode 100644 index 0000000..ef8704b --- /dev/null +++ b/docs/17-mutation-and-exploit-gates.md @@ -0,0 +1,75 @@ +# Mutation And Exploit Gates + +Mutation and exploit gates keep Agent Bench Lab from becoming a collection of brittle public +examples. + +## Mutation Smoke Gates + +Mutation smoke gates test whether a task family can generate safe public-style variants without +committing private holdouts. + +Useful mutations include: + +- reorder inputs; +- rename synthetic entities; +- shift dates while preserving relative logic; +- add distractors; +- paraphrase policy; +- shuffle tool catalogs; +- perturb numeric values while preserving the intended answer logic. + +Mutation smoke does not prove decision-grade performance. It checks that the task family has a +repeatable mutation path and that generated artifacts stay in ignored output paths. + +Run the public mutation gate with: + +```bash +make mutation-smoke +``` + +## Exploit Smoke Gates + +Exploit smoke gates test obvious benchmark loopholes and unsafe shortcuts. + +Examples: + +- extra forbidden file; +- hidden answer leakage attempt; +- scorer-only label in an artifact; +- forbidden endpoint use; +- completion claim without required state; +- unsupported claim with fake citation; +- public report attempting to expose an expected value. + +Public exploit examples should be safe and synthetic. Private exploit checks, canaries, hidden +labels, and protected scorer configs must stay outside the public repo. + +## Canaries + +Canaries are tripwires, not the main defense. + +A canary can show that private content leaked into an agent-visible packet, artifact, trace, or +report. 
It does not replace: + +- private holdout isolation; +- scorer-only visibility; +- redacted feedback; +- tracked-file leak gates; +- deterministic or audited primary oracles. + +## Gate Declarations + +Hardening gate metadata lives in: + +```text +configs/hardening_gates.json +``` + +Validate it with: + +```bash +make hardening-check +``` + +In v0.6, exploit smoke status is declared per decision-grade family. Full private exploit suites can +be added later without changing public task fixtures. diff --git a/docs/18-suite-strategy.md b/docs/18-suite-strategy.md new file mode 100644 index 0000000..70c878b --- /dev/null +++ b/docs/18-suite-strategy.md @@ -0,0 +1,41 @@ +# Suite Strategy + +Suites are comparison bundles. They should be small enough to run repeatedly and clear enough to +interpret. + +## Core Is Not All Tasks + +`core` is the fast general starter suite. It should not automatically absorb every new +decision-grade task family. + +Core bloat makes routine regression checks slower and less diagnostic. New task families should +propose a suite explicitly. + +## Current Suites + +| Suite | Purpose | Current scope | +|---|---|---| +| `core-v0` | Fast general local regression and smoke comparison | starter task mix plus IF, DATA, DOC | +| `ops-local-v0` | Operational and customer-style workflows | SUP-01 | +| `tools-local-v0` | Local tool/API workflow evaluation | API-01 | + +## Future Suites + +Future suites may include: + +- `dev-local` for repository and terminal work; +- `security-local` for prompt injection, leakage, and policy tasks; +- `research-local` for fixed-corpus source-grounded research; +- `browser-replay` for browser tasks over frozen snapshots; +- `weekly-deep` for slower, broader regression runs. + +## Rule + +Every new task family should answer: + +```text +Which suite owns this task, and why? +``` + +If the answer is "core", the task should be fast, broadly useful, and worth running in most local +regression loops. 
diff --git a/docs/19-report-schema-v1.md b/docs/19-report-schema-v1.md new file mode 100644 index 0000000..b66d2e0 --- /dev/null +++ b/docs/19-report-schema-v1.md @@ -0,0 +1,58 @@ +# Report Schema V1 Guidance + +Report schema v1 is guidance for future generated reports. It is not a runtime rewrite in v0.6. + +Reports should make comparisons useful without exposing scorer-only or private evaluation content. + +## Recommended Fields + +| Field | Meaning | +|---|---| +| `run_id` | Stable run identifier | +| `suite_id` | Suite used for the run | +| `task_id` | Task family identifier | +| `task_version` | Task version from task metadata | +| `task_status` | Task implementation status from task metadata | +| `lifecycle_status` | Lifecycle status from `configs/task_lifecycle.json` | +| `score` | Normalized score from 0 to 1 | +| `success` | Boolean pass/fail result | +| `pass_threshold` | Threshold used for success | +| `cost` | Cost field or explicit null | +| `latency` | Runtime latency field or explicit null | +| `tool_calls` | Tool-call count or summary | +| `model_calls` | Model-call count or summary | +| `policy_violations` | Redacted policy violation categories | +| `leak_flags` | Redacted leak or canary categories | +| `mutation_score` | Optional mutation robustness score | +| `exploit_smoke_flags` | Optional exploit gate categories | +| `redaction_applied` | Whether public feedback was redacted | +| `private_bundle_ref_hash` | Optional hash reference, never raw private data | +| `scorer_contracts` | Scorer contracts used by the task | +| `diagnostics_redacted` | Public-safe diagnostic text | + +## Rules + +- Do not include raw private fixtures. +- Do not include hidden labels or answer keys. +- Do not include protected scorer configs. +- Do not include exact hidden thresholds. +- Do not include raw canary strings. +- Do not include raw private traces in public reports. +- Prefer redacted component-level diagnostics. 
+ +## Missing Data + +Missing cost, latency, or tool-call data should be explicit. Do not invent fields that were not +captured. + +## Private Bundle References + +Private bundle references should be hashes or opaque IDs only: + +```text +private_bundle_ref_hash +fixture_hash +scorer_config_hash +``` + +Reports may say that a private bundle was used. They must not reveal bundle contents. diff --git a/docs/README.md b/docs/README.md index 7668ebc..d8c06cb 100644 --- a/docs/README.md +++ b/docs/README.md @@ -21,6 +21,10 @@ Start here: 17. [DOC-01 decision-grade pattern](13-doc01-decision-grade.md) 18. [SUP-01 decision-grade pattern](14-sup01-decision-grade.md) 19. [API-01 decision-grade pattern](15-api01-decision-grade.md) -20. [v0 roadmap](roadmap-v0.md) -21. [Public release checklist](public-release-checklist.md) -22. [Decision log template](decision-log-template.md) +20. [Benchmark lifecycle](16-benchmark-lifecycle.md) +21. [Mutation and exploit gates](17-mutation-and-exploit-gates.md) +22. [Suite strategy](18-suite-strategy.md) +23. [Report schema v1 guidance](19-report-schema-v1.md) +24. [v0 roadmap](roadmap-v0.md) +25. [Public release checklist](public-release-checklist.md) +26. 
# Reconstructed source of scripts/check_hardening_gates.py (a new file in this diff).
# The preceding docs/README.md hunk ends on this line with list item 26,
# "[Decision log template](decision-log-template.md)".
"""Validate the hardening gate declarations in configs/hardening_gates.json.

Every task family whose lifecycle status is ``decision-grade`` or ``verified``
must have exactly one gate entry. Each entry is checked for a mutation script,
expected output files, a safe output location under ``artifacts/``, a valid
exploit smoke status, and ``public_safe: true``.
"""

import argparse
import json
import subprocess
from pathlib import Path

# Exploit smoke statuses a gate entry may declare.
VALID_EXPLOIT_STATUSES = {"implemented", "planned", "not_applicable"}

# Lifecycle statuses that require a hardening gate entry.
_GATED_STATUSES = {"decision-grade", "verified"}

# Generated mutation output must live below this repository-relative prefix.
_ARTIFACT_PREFIX = "artifacts/"


def load_json(path: Path) -> dict:
    """Return the decoded content of the UTF-8 JSON file at *path*."""
    with path.open(encoding="utf-8") as handle:
        return json.load(handle)


def decision_grade_tasks(root: Path) -> set[str]:
    """Return the ids of tasks marked decision-grade or verified.

    Reads ``configs/task_lifecycle.json`` below *root*. A non-object ``tasks``
    value or a non-object entry is skipped instead of raising, so a malformed
    lifecycle file cannot crash this check with an ``AttributeError``.
    """
    lifecycle = load_json(root / "configs" / "task_lifecycle.json")
    tasks = lifecycle.get("tasks", {})
    if not isinstance(tasks, dict):
        return set()
    return {
        task_id
        for task_id, entry in tasks.items()
        if isinstance(entry, dict) and entry.get("status") in _GATED_STATUSES
    }


def tracked_files(root: Path) -> set[str]:
    """Return the paths printed by ``git ls-files`` when run in *root*.

    Best effort: when git cannot be started or exits non-zero (for example
    outside a checkout) an empty set is returned, which disables the
    tracked-output checks rather than failing the whole gate.
    """
    try:
        result = subprocess.run(
            ["git", "ls-files"],
            cwd=root,
            check=True,
            capture_output=True,
            text=True,
        )
    except (OSError, subprocess.CalledProcessError):
        return set()
    return {line.strip() for line in result.stdout.splitlines() if line.strip()}


def _is_safe_artifact_path(value: object) -> bool:
    """Return True when *value* names a location strictly below ``artifacts/``."""
    if not isinstance(value, str) or not value.startswith(_ARTIFACT_PREFIX):
        return False
    parts = Path(value).parts
    # A bare "artifacts/" or any ".." component could point the mutation
    # runner (which deletes the output directory) outside the generated tree.
    return len(parts) > 1 and ".." not in parts


def _check_gate_entry(root: Path, task_id: str, entry: object, tracked: set[str]) -> list[str]:
    """Return the validation errors for one hardening gate entry."""
    if not isinstance(entry, dict):
        return [f"{task_id}: hardening gate entry must be an object"]

    errors: list[str] = []
    if entry.get("task_id") != task_id:
        errors.append(f"{task_id}: task_id must match config key")

    if entry.get("mutation_smoke_required") is True:
        mutation_script = entry.get("mutation_script")
        if not isinstance(mutation_script, str) or not (root / mutation_script).is_file():
            errors.append(f"{task_id}: required mutation script does not exist")
        expected_files = entry.get("expected_output_files")
        if not isinstance(expected_files, list) or not expected_files:
            errors.append(f"{task_id}: mutation smoke requires expected_output_files")

    mutation_output = entry.get("mutation_output")
    if not _is_safe_artifact_path(mutation_output):
        errors.append(f"{task_id}: mutation_output must be under artifacts/")
    else:
        # git lists files, not directories, so match anything below the
        # output directory as well as an exact path.
        prefix = mutation_output.rstrip("/") + "/"
        if any(path == mutation_output or path.startswith(prefix) for path in tracked):
            errors.append(f"{task_id}: mutation_output path is tracked")

    exploit_status = entry.get("exploit_smoke_status")
    # The isinstance guard keeps unhashable JSON values (lists, objects) from
    # raising TypeError in the set membership test.
    if not isinstance(exploit_status, str) or exploit_status not in VALID_EXPLOIT_STATUSES:
        errors.append(f"{task_id}: invalid exploit_smoke_status {exploit_status!r}")
    elif exploit_status in ("planned", "not_applicable"):
        reason = entry.get("reason")
        if not isinstance(reason, str) or not reason.strip():
            errors.append(f"{task_id}: {exploit_status} exploit status requires a reason")

    if entry.get("public_safe") is not True:
        errors.append(f"{task_id}: public_safe must be true")

    return errors


def check_hardening_gates(root: Path) -> list[str]:
    """Validate ``configs/hardening_gates.json`` below *root*.

    Returns a list of human-readable error strings; an empty list means the
    declarations are consistent. Errors are reported in this order: tasks
    missing a gate, gates without a decision-grade task, tracked files under
    ``artifacts/``, then per-entry problems sorted by task id.
    """
    config = load_json(root / "configs" / "hardening_gates.json")
    entries = config.get("tasks", {})
    if not isinstance(entries, dict):
        return ["configs/hardening_gates.json: tasks must be an object"]

    errors: list[str] = []
    expected_tasks = decision_grade_tasks(root)
    configured_tasks = set(entries)

    for missing in sorted(expected_tasks - configured_tasks):
        errors.append(f"{missing}: missing hardening gate entry")
    for extra in sorted(configured_tasks - expected_tasks):
        errors.append(f"{extra}: hardening gate entry is only expected for decision-grade tasks")

    tracked = tracked_files(root)
    generated_tracked = sorted(path for path in tracked if path.startswith(_ARTIFACT_PREFIX))
    if generated_tracked:
        errors.append(f"generated mutation output is tracked: {', '.join(generated_tracked)}")

    for task_id, entry in sorted(entries.items()):
        errors.extend(_check_gate_entry(root, task_id, entry, tracked))

    return errors


def main() -> int:
    """Run the check from the command line; return 0 on success, 1 on errors."""
    parser = argparse.ArgumentParser(description="Validate hardening gate declarations.")
    parser.add_argument("--root", type=Path, default=Path(__file__).resolve().parents[1])
    args = parser.parse_args()

    errors = check_hardening_gates(args.root.resolve())
    if errors:
        for error in errors:
            print(f"ERROR: {error}")
        return 1

    print("Hardening gate check passed.")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
a/scripts/check_lifecycle.py b/scripts/check_lifecycle.py new file mode 100644 index 0000000..c2434c2 --- /dev/null +++ b/scripts/check_lifecycle.py @@ -0,0 +1,140 @@ +from __future__ import annotations + +import argparse +import json +from pathlib import Path + +VALID_STATUSES = {"experimental", "decision-grade", "verified", "deprecated"} +DECISION_READY_STATUSES = {"decision-grade", "verified"} + + +def load_json(path: Path) -> dict: + with path.open(encoding="utf-8") as handle: + return json.load(handle) + + +def task_dirs(root: Path) -> set[str]: + return { + path.name + for path in (root / "tasks").iterdir() + if path.is_dir() and (path / "task.json").exists() + } + + +def suite_ids(root: Path) -> set[str]: + ids: set[str] = set() + for path in sorted((root / "configs" / "suites").glob("*.json")): + data = load_json(path) + suite_id = data.get("suite_id") + if isinstance(suite_id, str): + ids.add(suite_id) + return ids + + +def require_text(entry: dict, field: str, errors: list[str]) -> None: + value = entry.get(field) + if not isinstance(value, str) or not value.strip(): + errors.append(f"{entry.get('task_id')}: missing non-empty {field}") + + +def require_list(entry: dict, field: str, errors: list[str]) -> None: + value = entry.get(field) + if not isinstance(value, list) or not value: + errors.append(f"{entry.get('task_id')}: missing non-empty {field}") + + +def check_decision_grade(root: Path, task_id: str, entry: dict, errors: list[str]) -> None: + task_dir = root / "tasks" / task_id + for filename in ("task.json", "prompt.md", "scorer.py"): + if not (task_dir / filename).exists(): + errors.append(f"{task_id}: missing tasks/{task_id}/{filename}") + + fixture_dir = root / "fixtures" / "public" / task_id + if not fixture_dir.exists(): + errors.append(f"{task_id}: missing public fixture directory") + + docs_reference = entry.get("docs_reference") + if not isinstance(docs_reference, str) or not (root / docs_reference).exists(): + errors.append(f"{task_id}: 
docs_reference does not exist") + + require_text(entry, "private_holdout_strategy", errors) + require_text(entry, "mutation_strategy", errors) + require_text(entry, "primary_oracle", errors) + require_list(entry, "scorer_contracts", errors) + + if entry.get("public_cases") is not True: + errors.append(f"{task_id}: decision-grade task must declare public_cases true") + if entry.get("has_redacted_feedback") is not True: + errors.append(f"{task_id}: decision-grade task must declare redacted feedback") + if not entry.get("exploit_smoke_status"): + errors.append(f"{task_id}: missing exploit_smoke_status") + + +def check_lifecycle(root: Path) -> list[str]: + config_path = root / "configs" / "task_lifecycle.json" + config = load_json(config_path) + entries = config.get("tasks", {}) + errors: list[str] = [] + + if not isinstance(entries, dict): + return ["configs/task_lifecycle.json: tasks must be an object"] + + actual_tasks = task_dirs(root) + configured_tasks = set(entries) + valid_suite_ids = suite_ids(root) + + for missing in sorted(actual_tasks - configured_tasks): + errors.append(f"{missing}: missing lifecycle entry") + for extra in sorted(configured_tasks - actual_tasks): + errors.append(f"{extra}: lifecycle entry has no matching task directory") + + for task_id, entry in sorted(entries.items()): + if not isinstance(entry, dict): + errors.append(f"{task_id}: lifecycle entry must be an object") + continue + if entry.get("task_id") != task_id: + errors.append(f"{task_id}: task_id must match config key") + + status = entry.get("status") + if status not in VALID_STATUSES: + errors.append(f"{task_id}: invalid status {status!r}") + continue + + require_text(entry, "introduced_in", errors) + require_text(entry, "current_version", errors) + require_text(entry, "primary_oracle", errors) + require_list(entry, "suite_ids", errors) + + for suite_id in entry.get("suite_ids", []): + if suite_id not in valid_suite_ids: + errors.append(f"{task_id}: unknown suite_id {suite_id}") + 
def main() -> int:
    """Run the lifecycle check from the command line.

    ``--root`` selects the repository root and defaults to the parent of the
    directory that contains this script. Each validation error is printed on
    its own line prefixed with ``ERROR:``.

    Returns:
        ``0`` when ``check_lifecycle`` reports no errors, ``1`` otherwise.
    """
    parser = argparse.ArgumentParser(description="Validate task-family lifecycle metadata.")
    parser.add_argument("--root", type=Path, default=Path(__file__).resolve().parents[1])
    args = parser.parse_args()

    errors = check_lifecycle(args.root.resolve())
    if errors:
        for error in errors:
            print(f"ERROR: {error}")
        return 1

    print("Lifecycle check passed.")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())


# The definitions below belong to scripts/run_mutation_smoke.py.


def load_json(path: Path) -> dict:
    """Parse the UTF-8 encoded JSON file at *path* and return its content."""
    with path.open(encoding="utf-8") as handle:
        return json.load(handle)


def selected_entries(root: Path, task_id: str | None) -> list[tuple[str, dict]]:
    """Return the hardening-gate entries that require a mutation smoke run.

    Args:
        root: Repository root containing ``configs/hardening_gates.json``.
        task_id: When truthy, restrict the result to this task id.

    Returns:
        ``(task_id, entry)`` pairs sorted by task id, limited to entries whose
        ``mutation_smoke_required`` value is exactly ``True``.
    """
    config = load_json(root / "configs" / "hardening_gates.json")
    entries = config.get("tasks", {})
    return [
        (current_task_id, entry)
        for current_task_id, entry in sorted(entries.items())
        if (not task_id or current_task_id == task_id)
        and entry.get("mutation_smoke_required") is True
    ]


def output_dir(root: Path, out_root: Path | None, task_id: str, entry: dict) -> Path:
    """Return the directory a task's mutation generator should write to.

    With *out_root* given the result is
    ``out_root / task_id / "case_mutation_001"``; otherwise it is the entry's
    ``mutation_output`` value joined onto *root*.

    Raises:
        KeyError: If *out_root* is ``None`` and the entry has no
            ``mutation_output`` key.
    """
    if out_root is not None:
        return out_root / task_id / "case_mutation_001"
    return root / entry["mutation_output"]


def run_mutation(root: Path, task_id: str, entry: dict, out_root: Path | None) -> None:
    """Run one task's mutation generator and verify the files it produced.

    Any previous output is removed, the entry's ``mutation_script`` is run
    with ``--out <output>`` using the current interpreter and *root* as the
    working directory, and the output directory plus every path listed in
    ``expected_output_files`` is then required to exist.

    Args:
        root: Repository root; script and default output paths are relative
            to it.
        task_id: Task family id, used for the output path and in messages.
        entry: Hardening-gate entry for the task.
        out_root: Optional override for the output root.

    Raises:
        KeyError: If the entry lacks ``mutation_script`` (or
            ``mutation_output`` when *out_root* is ``None``).
        RuntimeError: If the output path is the repository root or one of its
            ancestors, or if the expected output is missing after the run.
        subprocess.CalledProcessError: If the generator exits non-zero.
    """
    script = root / entry["mutation_script"]
    output = output_dir(root, out_root, task_id, entry)

    # The output path comes from configuration and is deleted below. A value
    # such as "." or ".." would otherwise wipe the repository itself.
    resolved_root = root.resolve()
    resolved_output = output.resolve()
    if resolved_output == resolved_root or resolved_output in resolved_root.parents:
        raise RuntimeError(
            f"{task_id}: refusing to clear mutation output {output}: "
            "it is the repository root or one of its parents"
        )

    # rmtree only handles real directories; a stale file or symlink at the
    # output path is removed with unlink instead of aborting the run.
    if output.is_symlink() or output.is_file():
        output.unlink()
    elif output.exists():
        shutil.rmtree(output)

    subprocess.run(
        [sys.executable, str(script), "--out", str(output)],
        cwd=root,
        check=True,
    )

    if not output.exists():
        raise RuntimeError(f"{task_id}: mutation output was not created: {output}")
    for rel_path in entry.get("expected_output_files", []):
        expected = output / rel_path
        if not expected.exists():
            raise RuntimeError(f"{task_id}: expected mutation output missing: {expected}")

    print(f"{task_id}: mutation smoke output ok at {output}")
def main() -> int:
    """Run the public mutation smoke generators from the command line.

    Options: ``--root`` (repository root, defaulting to the parent of this
    script's directory), ``--task`` (run a single task family) and
    ``--out-root`` (override where outputs are written).

    Every selected task is attempted; a failure in one task is reported as an
    ``ERROR:`` line and does not prevent the remaining tasks from running.

    Returns:
        ``0`` when every selected task passed, ``1`` when nothing was selected
        or at least one task failed.
    """
    parser = argparse.ArgumentParser(description="Run public mutation smoke generators.")
    parser.add_argument("--root", type=Path, default=Path(__file__).resolve().parents[1])
    parser.add_argument("--task", help="Run one task family only.")
    parser.add_argument(
        "--out-root",
        type=Path,
        help="Override output root. Defaults to each task mutation_output config.",
    )
    args = parser.parse_args()

    root = args.root.resolve()
    out_root = args.out_root.resolve() if args.out_root else None
    entries = selected_entries(root, args.task)
    if not entries:
        print("No mutation smoke entries selected.")
        return 1

    failures: list[str] = []
    for task_id, entry in entries:
        try:
            run_mutation(root, task_id, entry, out_root)
        except subprocess.CalledProcessError as exc:
            failures.append(f"{task_id}: mutation script exited with status {exc.returncode}")
        except KeyError as exc:
            failures.append(f"{task_id}: hardening gate entry is missing key {exc}")
        except RuntimeError as exc:
            # run_mutation already prefixes its RuntimeError text with the task id.
            failures.append(str(exc))
        except OSError as exc:
            failures.append(f"{task_id}: {exc}")

    if failures:
        for failure in failures:
            print(f"ERROR: {failure}")
        return 1

    print("Mutation smoke passed.")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())


# The definitions below belong to tests/test_lifecycle_hardening.py.

# Repository root: the parent of the directory that contains this file.
ROOT = Path(__file__).resolve().parents[1]


def run_script(*args: str) -> subprocess.CompletedProcess[str]:
    """Run the current Python interpreter with *args* from ``ROOT``.

    Standard output and standard error are captured as text. A non-zero exit
    status does not raise; callers inspect ``returncode`` themselves.
    """
    return subprocess.run(
        [sys.executable, *args],
        cwd=ROOT,
        capture_output=True,
        text=True,
        check=False,
    )
def test_lifecycle_check_passes():
    """The lifecycle validator exits 0 and prints its success line."""
    result = run_script("scripts/check_lifecycle.py")

    assert result.returncode == 0, result.stdout + result.stderr
    assert "Lifecycle check passed" in result.stdout


def test_hardening_gate_check_passes():
    """The hardening gate validator exits 0 and prints its success line."""
    result = run_script("scripts/check_hardening_gates.py")

    assert result.returncode == 0, result.stdout + result.stderr
    assert "Hardening gate check passed" in result.stdout


def test_mutation_smoke_writes_to_supplied_output_root(tmp_path):
    """With ``--out-root`` each task writes ``check_config.json`` under it."""
    result = run_script("scripts/run_mutation_smoke.py", "--out-root", str(tmp_path))

    assert result.returncode == 0, result.stdout + result.stderr
    for task_id in ("IF-01", "DATA-01", "DOC-01", "SUP-01", "API-01"):
        assert (tmp_path / task_id / "case_mutation_001" / "check_config.json").exists()


def test_lifecycle_marks_no_task_verified():
    """No lifecycle entry carries a truthy ``verified`` flag or status."""
    data = json.loads((ROOT / "configs" / "task_lifecycle.json").read_text(encoding="utf-8"))
    tasks = list(data["tasks"].values())

    # The lifecycle validator reads this flag with ``.get`` and accepts entries
    # that omit it, so a missing key must count as "not verified" here too
    # rather than raising KeyError.
    assert all(not entry.get("verified") for entry in tasks)
    assert all(entry["status"] != "verified" for entry in tasks)


def test_hardening_gates_cover_decision_grade_tasks_only():
    """Hardening gates list exactly the decision-grade and verified tasks."""
    lifecycle = json.loads((ROOT / "configs" / "task_lifecycle.json").read_text(encoding="utf-8"))
    gates = json.loads((ROOT / "configs" / "hardening_gates.json").read_text(encoding="utf-8"))
    decision_grade = {
        task_id
        for task_id, entry in lifecycle["tasks"].items()
        if entry["status"] in {"decision-grade", "verified"}
    }

    assert set(gates["tasks"]) == decision_grade