Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,15 @@ jobs:
- name: Run API-01 smoke
run: make api01-smoke

- name: Run lifecycle check
run: make lifecycle-check

- name: Run mutation smoke
run: make mutation-smoke

- name: Run hardening check
run: make hardening-check

- name: Run leak check
run: make leak-check

Expand Down
11 changes: 10 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Interpreter used by the recipes. `?=` only assigns when PYTHON is not already
# defined, so it can be overridden from the environment or the command line
# (e.g. `make PYTHON=python3.12 validate`).
PYTHON ?= python3
# Import path handed to recipes that run package modules or pytest
# (they invoke `PYTHONPATH=$(PYTHONPATH) $(PYTHON) -m ...`). Also overridable.
PYTHONPATH ?= src

# Declared phony so Make always runs these targets, even if a file or
# directory with the same name exists in the working tree.
.PHONY: validate list smoke compare-smoke if01-smoke data01-smoke doc01-smoke sup01-smoke api01-smoke lifecycle-check mutation-smoke hardening-check leak-check test

validate:
PYTHONPATH=$(PYTHONPATH) $(PYTHON) -m agent_bench_lab.cli validate
Expand Down Expand Up @@ -58,6 +58,15 @@ api01-smoke:
$(PYTHON) scripts/create_api01_mutation.py --out artifacts/mutations/API-01/case_mutation_001
PYTHONPATH=$(PYTHONPATH) $(PYTHON) -m pytest -q tests/test_api01.py

# Run scripts/check_lifecycle.py with the configured interpreter.
# NOTE(review): presumably validates configs/task_lifecycle.json — confirm
# against the script, which is not visible here.
lifecycle-check:
	$(PYTHON) scripts/check_lifecycle.py

# Run scripts/run_mutation_smoke.py with the configured interpreter.
# NOTE(review): presumably drives the per-task mutation scripts listed in
# configs/hardening_gates.json — confirm against the script.
mutation-smoke:
	$(PYTHON) scripts/run_mutation_smoke.py

# Run scripts/check_hardening_gates.py with the configured interpreter.
# NOTE(review): presumably validates configs/hardening_gates.json — confirm
# against the script, which is not visible here.
hardening-check:
	$(PYTHON) scripts/check_hardening_gates.py

# Run scripts/public_leak_check.py with the configured interpreter, passing
# the current directory (`.`) as its single argument.
leak-check:
	$(PYTHON) scripts/public_leak_check.py .

Expand Down
19 changes: 18 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,20 @@ The Private Eval Layer holds hidden labels, private holdouts, answer keys, prote

See [Private Eval Layer](docs/private-eval-layer.md), [Scorer type contracts](docs/scorer-types.md), and [Reporting and feedback](docs/reporting-and-feedback.md).

## Benchmark lifecycle and hardening gates

After the first five decision-grade public patterns, v0.6 adds standard-layer gates instead of another task family.

Lifecycle metadata (`configs/task_lifecycle.json`) declares whether each task family is `experimental`, `decision-grade`, `verified`, or `deprecated`. Hardening metadata (`configs/hardening_gates.json`) declares the mutation smoke script and exploit smoke status for each decision-grade family. No task is marked `verified` yet.

```bash
make lifecycle-check
make mutation-smoke
make hardening-check
```

See [Benchmark lifecycle](docs/16-benchmark-lifecycle.md), [Mutation and exploit gates](docs/17-mutation-and-exploit-gates.md), [Suite strategy](docs/18-suite-strategy.md), and [Report schema v1 guidance](docs/19-report-schema-v1.md).

## Current status

This repository is a **v0 public starter**. It contains:
Expand All @@ -67,7 +81,7 @@ This repository is a **v0 public starter**. It contains:
- minimal Python CLI scaffolding;
- sample public fixtures;
- sample scorers plus hardened IF-01, DATA-01, DOC-01, SUP-01, and API-01 artifact/state-based scorers;
- documentation for benchmark design, metrics, and anti-overfitting.
- documentation for benchmark design, metrics, anti-overfitting, lifecycle status, and hardening gates.

It intentionally does **not** contain private holdout tasks, production secrets, personal data, or benchmark answers for real evaluation runs.

Expand Down Expand Up @@ -140,6 +154,9 @@ Without installing the package, use the source-tree Make targets:
make validate
make test
make smoke
make lifecycle-check
make mutation-smoke
make hardening-check
make leak-check
```

Expand Down
86 changes: 86 additions & 0 deletions configs/hardening_gates.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
{
"version": "0.6.0",
"schema_version": 1,
"tasks": {
"IF-01": {
"task_id": "IF-01",
"mutation_smoke_required": true,
"mutation_script": "scripts/create_if01_mutation.py",
"mutation_output": "artifacts/mutation-smoke/IF-01/case_mutation_001",
"expected_output_files": [
"spec.md",
"check_config.json"
],
"exploit_smoke_required": true,
"exploit_smoke_status": "planned",
"reason": "IF-01 already tests extra files, forbidden sections, and banned phrases; v0.6 records the gate and future standard exploit smoke shape.",
"public_safe": true
},
"DATA-01": {
"task_id": "DATA-01",
"mutation_smoke_required": true,
"mutation_script": "scripts/create_data01_mutation.py",
"mutation_output": "artifacts/mutation-smoke/DATA-01/case_mutation_001",
"expected_output_files": [
"spec.md",
"check_config.json",
"data/events.csv",
"data/customers.csv",
"data/analytics.db"
],
"exploit_smoke_required": true,
"exploit_smoke_status": "planned",
"reason": "DATA-01 already checks unsupported metrics and invalid artifacts; v0.6 keeps exploit gate status explicit without adding private data.",
"public_safe": true
},
"DOC-01": {
"task_id": "DOC-01",
"mutation_smoke_required": true,
"mutation_script": "scripts/create_doc01_mutation.py",
"mutation_output": "artifacts/mutation-smoke/DOC-01/case_mutation_001",
"expected_output_files": [
"spec.md",
"check_config.json",
"corpus/product_policy.md"
],
"exploit_smoke_required": true,
"exploit_smoke_status": "planned",
"reason": "DOC-01 already checks unsupported claims, stale sources, and citation evidence; standard exploit smoke remains a declared next hardening layer.",
"public_safe": true
},
"SUP-01": {
"task_id": "SUP-01",
"mutation_smoke_required": true,
"mutation_script": "scripts/create_sup01_mutation.py",
"mutation_output": "artifacts/mutation-smoke/SUP-01/case_mutation_001",
"expected_output_files": [
"spec.md",
"check_config.json",
"policy.md",
"customer_profile.json",
"inbox/email_001.eml"
],
"exploit_smoke_required": true,
"exploit_smoke_status": "planned",
"reason": "SUP-01 already checks prohibited promises and scorer-only labels; private prompt-injection and canary gates stay outside the public repo.",
"public_safe": true
},
"API-01": {
"task_id": "API-01",
"mutation_smoke_required": true,
"mutation_script": "scripts/create_api01_mutation.py",
"mutation_output": "artifacts/mutation-smoke/API-01/case_mutation_001",
"expected_output_files": [
"spec.md",
"check_config.json",
"api_catalog.json",
"api_state.json",
"policy.md"
],
"exploit_smoke_required": true,
"exploit_smoke_status": "planned",
"reason": "API-01 already checks forbidden endpoints and wrong state mutations; future private gates can add trap endpoints and canaries.",
"public_safe": true
}
}
}
224 changes: 224 additions & 0 deletions configs/task_lifecycle.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
{
"version": "0.6.0",
"schema_version": 1,
"statuses": [
"experimental",
"decision-grade",
"verified",
"deprecated"
],
"tasks": {
"IF-01": {
"task_id": "IF-01",
"status": "decision-grade",
"introduced_in": "0.1.0",
"current_version": "0.1.0",
"suite_ids": [
"core-v0"
],
"primary_oracle": "artifact_exact",
"scorer_contracts": [
"artifact_exact",
"schema_contract",
"mutation_robustness"
],
"public_cases": true,
"docs_reference": "docs/11-if01-decision-grade.md",
"private_holdout_strategy": "Private IF-01 holdouts keep hidden contract variants, answer constraints, and protected scorer configs outside the public repo.",
"mutation_strategy": "Use create_if01_mutation.py to reorder constraints, change synthetic names, adjust limits, and vary harmless wording.",
"exploit_smoke_status": "planned",
"has_redacted_feedback": true,
"verified": false,
"notes": "First decision-grade public pattern for strict instruction following and artifact contracts."
},
"DATA-01": {
"task_id": "DATA-01",
"status": "decision-grade",
"introduced_in": "0.2.0",
"current_version": "0.2.0",
"suite_ids": [
"core-v0"
],
"primary_oracle": "numeric_metric",
"scorer_contracts": [
"artifact_exact",
"schema_contract",
"numeric_metric",
"claim_rubric",
"mutation_robustness"
],
"public_cases": true,
"docs_reference": "docs/12-data01-decision-grade.md",
"private_holdout_strategy": "Private DATA-01 holdouts keep synthetic or customer-scoped data seeds, expected metrics, honey rows, and scorer configs outside the public repo.",
"mutation_strategy": "Use create_data01_mutation.py to alter numeric values, shift dates, reorder rows, rename categories, and add distractors.",
"exploit_smoke_status": "planned",
"has_redacted_feedback": true,
"verified": false,
"notes": "Decision-grade public pattern for exact data work, factual memos, and chart specifications."
},
"DOC-01": {
"task_id": "DOC-01",
"status": "decision-grade",
"introduced_in": "0.3.0",
"current_version": "0.3.0",
"suite_ids": [
"core-v0"
],
"primary_oracle": "claim_rubric",
"scorer_contracts": [
"artifact_exact",
"schema_contract",
"claim_rubric",
"mutation_robustness"
],
"public_cases": true,
"docs_reference": "docs/13-doc01-decision-grade.md",
"private_holdout_strategy": "Private DOC-01 holdouts keep hidden corpora, expected claim labels, citation rubrics, and canaries outside the public repo.",
"mutation_strategy": "Use create_doc01_mutation.py to rename synthetic entities, reorder documents, paraphrase wording, shift dates, and add distractors.",
"exploit_smoke_status": "planned",
"has_redacted_feedback": true,
"verified": false,
"notes": "Decision-grade public pattern for fixed-corpus grounded answers and citation checks."
},
"SUP-01": {
"task_id": "SUP-01",
"status": "decision-grade",
"introduced_in": "0.4.0",
"current_version": "0.4.0",
"suite_ids": [
"ops-local-v0"
],
"primary_oracle": "schema_contract",
"scorer_contracts": [
"artifact_exact",
"schema_contract",
"claim_rubric",
"trace_policy",
"mutation_robustness"
],
"public_cases": true,
"docs_reference": "docs/14-sup01-decision-grade.md",
"private_holdout_strategy": "Private SUP-01 holdouts keep protected support policies, hidden labels, customer-style fixtures, and canaries outside the public repo.",
"mutation_strategy": "Use create_sup01_mutation.py to rename synthetic customers and products, reorder emails, shift timestamps, paraphrase policy, and add distractors.",
"exploit_smoke_status": "planned",
"has_redacted_feedback": true,
"verified": false,
"notes": "Decision-grade public pattern for support inbox triage, policy-grounded drafts, and escalations."
},
"API-01": {
"task_id": "API-01",
"status": "decision-grade",
"introduced_in": "0.5.0",
"current_version": "0.5.0",
"suite_ids": [
"tools-local-v0"
],
"primary_oracle": "state_diff",
"scorer_contracts": [
"artifact_exact",
"schema_contract",
"state_diff",
"trace_policy",
"mutation_robustness"
],
"public_cases": true,
"docs_reference": "docs/15-api01-decision-grade.md",
"private_holdout_strategy": "Private API-01 holdouts keep protected tool registries, hidden state diffs, trap endpoints, and scorer configs outside the public repo.",
"mutation_strategy": "Use create_api01_mutation.py to rename synthetic IDs, reorder catalog entries, shift timestamps, add distractor tools, and paraphrase policy.",
"exploit_smoke_status": "planned",
"has_redacted_feedback": true,
"verified": false,
"notes": "Decision-grade public pattern for local API/tool orchestration with scorer-side state simulation."
},
"CODE-01": {
"task_id": "CODE-01",
"status": "experimental",
"introduced_in": "0.0.1",
"current_version": "0.1.0",
"suite_ids": [
"core-v0"
],
"primary_oracle": "artifact_exact",
"scorer_contracts": [
"artifact_exact",
"schema_contract"
],
"public_cases": true,
"docs_reference": "tasks/CODE-01/task.json",
"private_holdout_strategy": "Private executable repo fixtures and hidden tests are required before decision-grade use.",
"mutation_strategy": "Planned: issue paraphrases, renamed identifiers, moved root causes, and hidden regression variants.",
"exploit_smoke_status": "planned",
"has_redacted_feedback": true,
"verified": false,
"notes": "Starter scorer only; not decision-grade."
},
"TERM-02": {
"task_id": "TERM-02",
"status": "experimental",
"introduced_in": "0.0.1",
"current_version": "0.1.0",
"suite_ids": [
"core-v0"
],
"primary_oracle": "artifact_exact",
"scorer_contracts": [
"artifact_exact",
"state_diff"
],
"public_cases": true,
"docs_reference": "tasks/TERM-02/task.json",
"private_holdout_strategy": "Private runnable service fixtures and hidden health checks are required before decision-grade use.",
"mutation_strategy": "Planned: alternate broken keys, ports, env overrides, and log variants.",
"exploit_smoke_status": "planned",
"has_redacted_feedback": true,
"verified": false,
"notes": "Starter scorer only; not decision-grade."
},
"APP-04": {
"task_id": "APP-04",
"status": "experimental",
"introduced_in": "0.0.1",
"current_version": "0.1.0",
"suite_ids": [
"core-v0"
],
"primary_oracle": "state_diff",
"scorer_contracts": [
"artifact_exact",
"state_diff",
"trace_policy"
],
"public_cases": true,
"docs_reference": "tasks/APP-04/task.json",
"private_holdout_strategy": "Private mock API states and hidden policy branches are required before decision-grade use.",
"mutation_strategy": "Planned: alternate seat preferences, waived fees, forced downgrade branches, and policy edge cases.",
"exploit_smoke_status": "planned",
"has_redacted_feedback": true,
"verified": false,
"notes": "Starter scorer only; not decision-grade."
},
"SEC-01": {
"task_id": "SEC-01",
"status": "experimental",
"introduced_in": "0.0.1",
"current_version": "0.1.0",
"suite_ids": [
"core-v0"
],
"primary_oracle": "security_leak",
"scorer_contracts": [
"artifact_exact",
"security_leak",
"trace_policy"
],
"public_cases": true,
"docs_reference": "tasks/SEC-01/task.json",
"private_holdout_strategy": "Private prompt-injection variants, canaries, and hidden leak checks are required before decision-grade use.",
"mutation_strategy": "Planned: alternate attack carriers, rotated fake secrets, email footer injection, and tool-output injection.",
"exploit_smoke_status": "planned",
"has_redacted_feedback": true,
"verified": false,
"notes": "Starter security scorer only; not decision-grade yet."
}
}
}
Loading
Loading