diff --git a/docs/reporting-and-feedback.md b/docs/reporting-and-feedback.md index 3414941..6f25b84 100644 --- a/docs/reporting-and-feedback.md +++ b/docs/reporting-and-feedback.md @@ -73,3 +73,16 @@ Redacted feedback should: Scorer-private reports can be more specific, but they must stay outside public repositories and outside agent-visible context. + +## Lightweight Public Gate + +The repository includes a small redaction utility for public-facing generated reports. + +This utility is a safety gate, not a private evaluation implementation and not a full data-loss +prevention system. It catches obvious scorer-only strings such as answer-key hints, hidden labels, +private thresholds, canary identifiers, raw traces, and protected scorer config references before +they appear in public Markdown or CSV reports. + +Private scorer-only content should still be isolated at the source. Do not pass hidden labels, +answer keys, private rubrics, customer data, or protected scorer configs into public reports and +then rely on redaction to clean them up. 
diff --git a/scripts/public_leak_check.py b/scripts/public_leak_check.py index e0e5a59..7f42ef8 100644 --- a/scripts/public_leak_check.py +++ b/scripts/public_leak_check.py @@ -1,6 +1,7 @@ from __future__ import annotations import re +import subprocess import sys from pathlib import Path @@ -15,18 +16,110 @@ "sk-public-test-DO-NOT-LEAK", } -SKIP_DIRS = {".git", ".venv", "venv", "__pycache__", "private", "runs", "traces", "artifacts", "build", "dist"} +SKIP_CONTENT_DIRS = { + ".git", + ".venv", + "venv", + "__pycache__", + "private", + "runs", + "traces", + "artifacts", + "build", + "dist", +} +SKIP_WALK_DIRS = {".git", ".venv", "venv", "__pycache__", "build", "dist"} +SKIP_WALK_PREFIXES = { + "runs/", + "traces/", + "artifacts/", + "examples/artifacts/", + "reports/generated/", +} TEXT_SUFFIXES = {".md", ".py", ".json", ".txt", ".csv", ".html", ".env", ".toml", ".yaml", ".yml", ".gitignore", ""} +DENIED_PATH_PREFIXES = { + "runs/", + "traces/", + "private/", + "fixtures/private/", + "examples/artifacts/", + "artifacts/", + "reports/generated/", +} +DENIED_PATH_PARTS = {"runs", "traces", "private", "artifacts"} +DENIED_PATH_TERMS = { + ".env", + ".env.", + "secret", + "token", + "key", + "answer_key", + "hidden_label", + "customer_private", +} -def should_skip(path: Path) -> bool: - return any(part in SKIP_DIRS for part in path.parts) +def should_skip_content(path: Path) -> bool: + return any(part in SKIP_CONTENT_DIRS for part in path.parts) -def scan(root: Path) -> list[str]: - findings = [] + +def _relative_path(path: Path) -> str: + return path.as_posix().removeprefix("./") + + +def denylisted_path_reason(path: str) -> str | None: + normalized = _relative_path(Path(path)).lower() + parts = normalized.split("/") + for prefix in DENIED_PATH_PREFIXES: + if normalized.startswith(prefix): + return f"denied path prefix {prefix}" + for part in parts: + if part in DENIED_PATH_PARTS: + return f"denied path component {part}" + if part == ".env" or part.startswith(".env."): 
+ return "denied env file path" + for term in DENIED_PATH_TERMS: + if term in normalized: + return f"denied path term {term}" + return None + + +def git_tracked_paths(root: Path) -> list[str] | None: + try: + result = subprocess.run( + ["git", "-C", str(root), "ls-files", "-z"], + check=True, + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + ) + except (FileNotFoundError, subprocess.CalledProcessError): + return None + return [item.decode("utf-8") for item in result.stdout.split(b"\0") if item] + + +def walk_source_paths(root: Path) -> list[str]: + paths = [] for path in root.rglob("*"): - if path.is_dir() or should_skip(path): + rel = path.relative_to(root) + rel_name = rel.as_posix() + if any(part in SKIP_WALK_DIRS for part in rel.parts): + continue + if any(rel_name.startswith(prefix) for prefix in SKIP_WALK_PREFIXES): + continue + if path.is_file(): + paths.append(rel_name) + return sorted(paths) + + +def scan_paths(root: Path, paths: list[str]) -> list[str]: + findings = [] + for rel_path in sorted(paths): + reason = denylisted_path_reason(rel_path) + if reason: + findings.append(f"{rel_path}: {reason}") + path = root / rel_path + if not path.is_file() or should_skip_content(path): continue if path.suffix not in TEXT_SUFFIXES and path.name != ".gitignore": continue @@ -46,6 +139,12 @@ def scan(root: Path) -> list[str]: return findings +def scan(root: Path) -> list[str]: + tracked_paths = git_tracked_paths(root) + paths = tracked_paths if tracked_paths is not None else walk_source_paths(root) + return scan_paths(root, paths) + + if __name__ == "__main__": root = Path(sys.argv[1] if len(sys.argv) > 1 else ".").resolve() findings = scan(root) diff --git a/src/agent_bench_lab/compare.py b/src/agent_bench_lab/compare.py index 4521a0e..a78dbc1 100644 --- a/src/agent_bench_lab/compare.py +++ b/src/agent_bench_lab/compare.py @@ -5,6 +5,8 @@ from pathlib import Path from typing import Any +from .redaction import redact_text + EPSILON = 1e-9 @@ -134,9 +136,13 @@ 
def _safe_cell(value: Any) -> str:
    """Render *value* as text and pass it through ``redact_text``."""
    rendered = str(value)
    return redact_text(rendered)


def _format_row_item(row: dict[str, Any]) -> str:
    """Format one comparison row as a Markdown bullet: ``- task/case: delta``."""
    # The delta is formatted first, matching the original evaluation order.
    delta = _format_number(row["delta"], signed=True)
    task = _safe_cell(row["task_id"])
    case = _safe_cell(row["case_id"])
    return f"- {task}/{case}: {delta}"
# Placeholder that replaces a whole string value containing unsafe text.
REDACTED = "[REDACTED]"
# Placeholder that replaces a mapping key containing unsafe text.
REDACTED_KEY = "[REDACTED_KEY]"

# Case-insensitive patterns; a string matching any of them is not public-safe.
UNSAFE_TEXT_PATTERNS = [
    re.compile(pattern, re.IGNORECASE)
    for pattern in [
        r"\banswer[_ -]?key\b",
        r"\bhidden[_ -]?label(s)?\b",
        r"\bprivate[_ -]?threshold\b",
        r"\bprotected[_ -]?scorer[_ -]?config\b",
        r"\bscorer[_ -]?config\b",
        r"\bcanary\b",
        r"\bCANARY_",
        r"\bHONEY_",
        r"\bhoney row\b",
        r"\bsecret\b",
        r"\btoken\b",
        r"\bapi[_ -]?key\b",
        r"\bexpected\s*=",
        r"\bexpected\s*:",
        r"\bcorrect answer\b",
        r"\bprivate rubric\b",
        r"\bcustomer[_ -]?private\b",
        r"fixtures/private",
        r"(^|/)private/",
        r"\braw[_ -]?trace\b",
        r"\braw[_ -]?diagnostics\b",
    ]
]


def is_public_safe_text(text: str) -> bool:
    """Return ``True`` when *text* matches none of ``UNSAFE_TEXT_PATTERNS``."""
    return not any(pattern.search(text) for pattern in UNSAFE_TEXT_PATTERNS)


def redact_text(text: str) -> str:
    """Return *text* unchanged if it is public-safe, otherwise ``REDACTED``.

    The whole string is replaced, not just the matching fragment.
    """
    if is_public_safe_text(text):
        return text
    return REDACTED


def _unique_redacted_key(taken: Mapping[str, Any]) -> str:
    """Return a redacted-key placeholder that is not yet a key of *taken*."""
    if REDACTED_KEY not in taken:
        return REDACTED_KEY
    suffix = 2
    while f"{REDACTED_KEY}_{suffix}" in taken:
        suffix += 1
    return f"{REDACTED_KEY}_{suffix}"


def redact_obj(obj: Any) -> Any:
    """Recursively redact unsafe strings inside a nested Python object.

    Args:
        obj: A string, mapping, tuple, other sequence, or any other value.

    Returns:
        * ``str``: the result of ``redact_text``.
        * mapping: a new ``dict`` whose keys are stringified and whose values
          are redacted recursively. The first unsafe key becomes
          ``REDACTED_KEY``; further unsafe keys get a numeric suffix
          (``[REDACTED_KEY]_2``, ...).
        * ``tuple``: a new tuple of redacted items.
        * other sequences except ``bytes``/``bytearray``: a new list of
          redacted items.
        * anything else: *obj* itself, unchanged.
    """
    if isinstance(obj, str):
        return redact_text(obj)
    if isinstance(obj, Mapping):
        redacted: dict[str, Any] = {}
        for key, value in obj.items():
            key_text = str(key)
            if not is_public_safe_text(key_text):
                # Each unsafe key gets its own placeholder so that a second
                # unsafe key does not overwrite the first one's value.
                key_text = _unique_redacted_key(redacted)
            redacted[key_text] = redact_obj(value)
        return redacted
    if isinstance(obj, tuple):
        return tuple(redact_obj(item) for item in obj)
    # Tuple form of the type check runs on interpreters older than 3.10,
    # where ``bytes | bytearray`` cannot be evaluated at runtime.
    if isinstance(obj, Sequence) and not isinstance(obj, (bytes, bytearray)):
        return [redact_obj(item) for item in obj]
    return obj
"policy_violations": [] + } + """, + encoding="utf-8", + ) + candidate_score.write_text( + """ + { + "task_id": "IF-01", + "case_id": "case_001", + "score": 0.7, + "success": false, + "policy_violations": [ + "correct answer was refund_escalation expected=refund_escalation CANARY_123" + ] + } + """, + encoding="utf-8", + ) + + result = compare_score_dirs(tmp_path / "baseline", tmp_path / "candidate") + markdown = render_markdown_report(result) + csv_path = tmp_path / "compare.csv" + write_csv_report(result, csv_path) + csv_text = csv_path.read_text(encoding="utf-8") + + assert REDACTED in markdown + assert "refund_escalation" not in markdown + assert "CANARY_123" not in markdown + assert "expected=" not in markdown + assert "refund_escalation" not in csv_text + assert "CANARY_123" not in csv_text + + +def test_public_leak_check_fails_tracked_like_denied_path(tmp_path): + public_leak_check = load_public_leak_check() + + findings = public_leak_check.scan_paths( + tmp_path, + ["fixtures/private/case_001/answer_key.json"], + ) + + assert findings + assert "fixtures/private/case_001/answer_key.json" in findings[0] + + +def test_public_leak_check_fallback_ignores_generated_local_paths(tmp_path): + public_leak_check = load_public_leak_check() + for rel_path in [ + "runs/baseline/score.json", + "examples/artifacts/IF-01/case_001/artifact.md", + "reports/generated/compare.md", + ]: + path = tmp_path / rel_path + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text("generated local file", encoding="utf-8") + + assert public_leak_check.scan(tmp_path) == [] + + +def test_public_leak_check_fallback_flags_private_source_tree(tmp_path): + public_leak_check = load_public_leak_check() + path = tmp_path / "fixtures" / "private" / "case_001" / "labels.json" + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text("{}", encoding="utf-8") + + findings = public_leak_check.scan(tmp_path) + + assert findings + assert "fixtures/private/case_001/labels.json" in 
def test_public_leak_check_passes_current_repository_tree():
    """Scanning the repository that contains this test yields no findings."""
    leak_check = load_public_leak_check()
    # parents[1] is the directory one level above this test file's folder.
    repo_root = Path(__file__).resolve().parents[1]
    findings = leak_check.scan(repo_root)

    assert findings == []