diff --git a/experiments/signum_evolve/README.md b/experiments/signum_evolve/README.md index a24ba3f..d03ce59 100644 --- a/experiments/signum_evolve/README.md +++ b/experiments/signum_evolve/README.md @@ -1,6 +1,6 @@ -# signum-evolve v0 +# signum-evolve v1 -`signum-evolve v0` is a deterministic offline experiment harness for policy scanner rule catalogs. +`signum-evolve` is a deterministic offline experiment harness for policy scanner rule catalogs. It does not change scanner behavior, mutate source catalogs, call LLMs, or apply candidate rules. It generates candidate `policy-rules.json` catalogs under `experiments/signum_evolve/out/`, evaluates them with the existing policy scanner eval harness, compares them against the frozen baseline, and exports adoption bundles for human review. @@ -21,6 +21,23 @@ When `--historical-root` is provided, `generate` discovers historical contract d Historical replay is only a review signal. It is not treated as labeled ground truth and does not auto-apply or rewrite candidate catalogs. Missing historical roots are skipped gracefully. +## What v1 Adds + +v1 keeps the same safe mutation boundary and adds adoption-grade review evidence: + +- Bounded multi-prefix candidates for one non-critical rule. +- Per-candidate `catalog_diff.json`. +- Leaderboard `rank`, `score`, `mutationCount`, and compact catalog diff metadata. +- Adoption bundle catalog diff copy and report section. + +The default v1 config is: + +```text +experiments/signum_evolve/configs/evolve.v1.json +``` + +It sets `maxMutationDepth` to `2`, which means a candidate may add one or two excluded path prefixes to the same non-critical rule. It still cannot mutate CRITICAL rules, regexes, severities, rule IDs, or source catalogs. + ## What v0 Does Not Do - No OpenEvolve. @@ -40,6 +57,8 @@ Allowed mutation operator: - `add_excluded_path_prefix` on non-CRITICAL rules only. +v1 may group multiple `add_excluded_path_prefix` mutations for the same rule into one candidate, bounded by `maxMutationDepth`. + Allowed prefixes: - `docs/` @@ -64,7 +83,7 @@ Immutable fields: ```bash python3 -m experiments.signum_evolve.cli generate \ --repo-root . \ - --config experiments/signum_evolve/configs/evolve.v0.json \ + --config experiments/signum_evolve/configs/evolve.v1.json \ --run-id smoke \ --max-candidates 5 \ --seed 42 @@ -83,7 +102,7 @@ To add optional historical replay: ```bash python3 -m experiments.signum_evolve.cli generate \ --repo-root . \ - --config experiments/signum_evolve/configs/evolve.v0.json \ + --config experiments/signum_evolve/configs/evolve.v1.json \ --run-id replay-smoke \ --max-candidates 5 \ --seed 42 \ @@ -134,6 +153,13 @@ python3 -m experiments.signum_evolve.cli leaderboard \ The leaderboard reports candidate decision, status, hard gate result, improvements, regressions, and mutation metadata. A candidate does not need to beat the current baseline to be useful; the current baseline is intentionally strong. +v1 leaderboards also include: + +- `rank`: deterministic review order +- `score`: compact ranking inputs +- `mutationCount`: number of scoped catalog edits in the candidate +- `catalogDiff`: changed rule and critical-rule change counts + When replay is enabled, each leaderboard candidate also includes compact historical replay data: ```json @@ -163,6 +189,7 @@ The bundle contains: - `candidate.json` - `policy-rules.candidate.json` +- `catalog_diff.json` - `eval.json` - `compare.json` - `historical_replay.json`, when replay was enabled diff --git a/experiments/signum_evolve/archive.py b/experiments/signum_evolve/archive.py index e52c3fc..d592966 100644 --- a/experiments/signum_evolve/archive.py +++ b/experiments/signum_evolve/archive.py @@ -1,4 +1,4 @@ -"""Run archive helpers for signum-evolve v0.""" +"""Run archive helpers for signum-evolve.""" from __future__ import annotations import shutil @@ -31,6 +31,7 @@ def write_run_manifest( config_path: Path, seed: int, max_candidates: int, + max_mutation_depth: int, candidate_count: int, baseline_summary: Dict[str, Any], ) -> Dict[str, Any]: @@ -40,6 +41,7 @@ def write_run_manifest( "config": config_path.as_posix(), "createdAt": None, "maxCandidates": max_candidates, + "maxMutationDepth": max_mutation_depth, "runId": run_id, "schemaVersion": "1.0", "seed": seed, diff --git a/experiments/signum_evolve/candidate.py b/experiments/signum_evolve/candidate.py index 50f3e2e..1661577 100644 --- a/experiments/signum_evolve/candidate.py +++ b/experiments/signum_evolve/candidate.py @@ -1,7 +1,8 @@ -"""Candidate catalog construction for signum-evolve v0.""" +"""Candidate catalog construction for signum-evolve.""" from __future__ import annotations import copy +import itertools import json from pathlib import Path from typing import Any, Dict, Iterable, List, Sequence @@ -52,38 +53,53 @@ def build_candidate( base_catalog: Dict[str, Any], index: int, seed: int, - operator: str, rule_id: str, - prefix: str, + prefixes: Sequence[str], ) -> Dict[str, Any]: catalog = copy.deepcopy(base_catalog) - changed = False + prefixes_to_add = list(prefixes) + changed_prefixes: List[str] = [] for rule in catalog["rules"]: if rule.get("ruleId") != rule_id: continue if rule.get("severity") == "CRITICAL": raise ValueError(f"critical rule cannot be mutated: {rule_id}") - prefixes = list(rule.get("excludedPathPrefixes", [])) - if operator == "add_excluded_path_prefix": - if prefix not in prefixes: - prefixes.append(prefix) - rule["excludedPathPrefixes"] = prefixes - changed = True - else: - raise ValueError(f"unsupported mutation operator: {operator}") + rule_prefixes = list(rule.get("excludedPathPrefixes", [])) + for prefix in prefixes_to_add: + if prefix not in rule_prefixes: + rule_prefixes.append(prefix) + changed_prefixes.append(prefix) + if changed_prefixes: + rule["excludedPathPrefixes"] = rule_prefixes break - if not changed: - raise ValueError(f"mutation produced no catalog change: {rule_id} {prefix}") + if not changed_prefixes: + raise ValueError(f"mutation produced no catalog change: {rule_id} {list(prefixes_to_add)}") + + mutations = [ + { + "operator": "add_excluded_path_prefix", + "prefix": prefix, + "ruleId": rule_id, + } + for prefix in changed_prefixes + ] + mutation: Dict[str, Any] + if len(mutations) == 1: + mutation = dict(mutations[0]) + else: + mutation = { + "operator": "add_excluded_path_prefix_set", + "prefixes": changed_prefixes, + "ruleId": rule_id, + } return { "candidateId": candidate_id(index), "catalog": catalog, "createdAt": None, - "mutation": { - "operator": operator, - "prefix": prefix, - "ruleId": rule_id, - }, + "mutation": mutation, + "mutationCount": len(mutations), + "mutations": mutations, "parentId": "baseline", "schemaVersion": "1.0", "seed": seed, @@ -96,28 +112,29 @@ def generate_candidates( max_candidates: int, seed: int, allowed_prefixes: Sequence[str] = DEFAULT_ALLOWED_PREFIXES, + max_mutation_depth: int = 1, ) -> List[Dict[str, Any]]: candidates: List[Dict[str, Any]] = [] - if max_candidates <= 0: + if max_candidates <= 0 or max_mutation_depth <= 0: return candidates next_index = 1 for rule in noncritical_rules(catalog): rule_id = str(rule.get("ruleId")) existing_prefixes = set(rule.get("excludedPathPrefixes", [])) - for prefix in allowed_prefixes: - if prefix in existing_prefixes: - continue - candidates.append( - build_candidate( - base_catalog=catalog, - index=next_index, - seed=seed, - operator="add_excluded_path_prefix", - rule_id=rule_id, - prefix=prefix, + missing_prefixes = [prefix for prefix in allowed_prefixes if prefix not in existing_prefixes] + max_depth = min(max_mutation_depth, len(missing_prefixes)) + for depth in range(1, max_depth + 1): + for prefix_set in itertools.combinations(missing_prefixes, depth): + candidates.append( + build_candidate( + base_catalog=catalog, + index=next_index, + seed=seed, + rule_id=rule_id, + prefixes=prefix_set, + ) ) - ) - next_index += 1 - if len(candidates) >= max_candidates: - return candidates + next_index += 1 + if len(candidates) >= max_candidates: + return candidates return candidates diff --git a/experiments/signum_evolve/catalog_diff.py b/experiments/signum_evolve/catalog_diff.py new file mode 100644 index 0000000..8b63164 --- /dev/null +++ b/experiments/signum_evolve/catalog_diff.py @@ -0,0 +1,79 @@ +"""Catalog diff helpers for signum-evolve candidate review.""" +from __future__ import annotations + +from pathlib import Path +from typing import Any, Dict, List + +from .candidate import write_json +from .mutate import IMMUTABLE_RULE_FIELDS, rule_by_id + + +def _as_string_list(value: Any) -> List[str]: + if not isinstance(value, list): + return [] + return sorted(item for item in value if isinstance(item, str)) + + +def diff_catalogs(base_catalog: Dict[str, Any], candidate_catalog: Dict[str, Any]) -> Dict[str, Any]: + base_rules = rule_by_id(base_catalog) + candidate_rules = rule_by_id(candidate_catalog) + rule_ids = sorted(set(base_rules) | set(candidate_rules)) + changes: List[Dict[str, Any]] = [] + critical_rule_changes: List[str] = [] + + for rule_id in rule_ids: + base_rule = base_rules.get(rule_id) + candidate_rule = candidate_rules.get(rule_id) + if base_rule is None: + changes.append({"changeType": "rule_added", "ruleId": rule_id}) + critical_rule_changes.append(rule_id) + continue + if candidate_rule is None: + changes.append({"changeType": "rule_removed", "ruleId": rule_id}) + if base_rule.get("severity") == "CRITICAL": + critical_rule_changes.append(rule_id) + continue + + base_prefixes = set(_as_string_list(base_rule.get("excludedPathPrefixes"))) + candidate_prefixes = set(_as_string_list(candidate_rule.get("excludedPathPrefixes"))) + added_prefixes = sorted(candidate_prefixes - base_prefixes) + removed_prefixes = sorted(base_prefixes - candidate_prefixes) + immutable_changes = sorted( + field + for field in IMMUTABLE_RULE_FIELDS + if base_rule.get(field) != candidate_rule.get(field) + ) + + if not added_prefixes and not removed_prefixes and not immutable_changes: + continue + + change = { + "addedExcludedPathPrefixes": added_prefixes, + "autoBlock": candidate_rule.get("autoBlock"), + "immutableFieldChanges": immutable_changes, + "pattern": candidate_rule.get("pattern"), + "removedExcludedPathPrefixes": removed_prefixes, + "ruleId": rule_id, + "severity": candidate_rule.get("severity"), + "type": candidate_rule.get("type"), + } + changes.append(change) + + if base_rule.get("severity") == "CRITICAL" and ( + added_prefixes or removed_prefixes or immutable_changes + ): + critical_rule_changes.append(rule_id) + + return { + "changedRuleCount": len(changes), + "changes": changes, + "criticalRuleChanges": sorted(set(critical_rule_changes)), + "criticalRuleChangesCount": len(set(critical_rule_changes)), + "schemaVersion": "1.0", + } + + +def write_catalog_diff(candidate_dir: Path, diff: Dict[str, Any]) -> Path: + path = candidate_dir / "catalog_diff.json" + write_json(path, diff) + return path diff --git a/experiments/signum_evolve/cli.py b/experiments/signum_evolve/cli.py index 4eddb06..b72ba5e 100644 --- a/experiments/signum_evolve/cli.py +++ b/experiments/signum_evolve/cli.py @@ -1,4 +1,4 @@ -"""CLI for signum-evolve v0.""" +"""CLI for signum-evolve.""" from __future__ import annotations import argparse @@ -8,6 +8,7 @@ from .archive import archive_candidate, prepare_run_dir, write_run_manifest from .candidate import DEFAULT_ALLOWED_PREFIXES, canonical_json, generate_candidates, load_catalog, load_json +from .catalog_diff import diff_catalogs, write_catalog_diff from .export import export_bundle from .mutate import validate_scope_only_mutation from .report import baseline_summary_from_scorecard, write_leaderboard @@ -46,6 +47,7 @@ def command_generate(args: argparse.Namespace) -> int: ).resolve() historical_root = repo_path(repo_root, args.historical_root).resolve() if args.historical_root else None allowed_prefixes = tuple(config.get("allowedPrefixes", DEFAULT_ALLOWED_PREFIXES)) + max_mutation_depth = int(config.get("maxMutationDepth", 1)) catalog = load_catalog(catalog_path) baseline_scorecard = load_json(baseline_path) @@ -54,6 +56,7 @@ def command_generate(args: argparse.Namespace) -> int: max_candidates=args.max_candidates, seed=args.seed, allowed_prefixes=allowed_prefixes, + max_mutation_depth=max_mutation_depth, ) if not candidates: raise RuntimeError("no candidates generated") @@ -64,6 +67,7 @@ def command_generate(args: argparse.Namespace) -> int: if errors: raise RuntimeError(f"{candidate['candidateId']} failed mutation validation: {errors}") candidate_dir = archive_candidate(run_dir, candidate) + write_catalog_diff(candidate_dir, diff_catalogs(catalog, candidate["catalog"])) evaluate_candidate(repo_root, candidate_dir, baseline_path) if historical_root is not None: replay = run_historical_replay( @@ -81,6 +85,7 @@ def command_generate(args: argparse.Namespace) -> int: config_path=manifest_path_ref(repo_root, config_path), seed=args.seed, max_candidates=args.max_candidates, + max_mutation_depth=max_mutation_depth, candidate_count=len(candidates), baseline_summary=baseline_summary_from_scorecard(baseline_scorecard), ) @@ -102,7 +107,7 @@ def command_export(args: argparse.Namespace) -> int: def main(argv: Optional[Sequence[str]] = None) -> int: - parser = argparse.ArgumentParser(description="Offline Signum evolve v0 candidate generator.") + parser = argparse.ArgumentParser(description="Offline Signum evolve candidate generator.") subcommands = parser.add_subparsers(dest="command", required=True) generate = subcommands.add_parser("generate", help="Generate and evaluate candidate catalogs.") diff --git a/experiments/signum_evolve/configs/evolve.v1.json b/experiments/signum_evolve/configs/evolve.v1.json new file mode 100644 index 0000000..eb63a1b --- /dev/null +++ b/experiments/signum_evolve/configs/evolve.v1.json @@ -0,0 +1,17 @@ +{ + "allowedPrefixes": [ + "docs/", + "examples/", + "fixtures/", + "tests/", + "test/", + "generated/" + ], + "baselineCatalog": "lib/policy-rules.json", + "baselineScorecard": "evals/policy_scanner/baselines/current.json", + "maxMutationDepth": 2, + "operators": [ + "add_excluded_path_prefix" + ], + "schemaVersion": "1.0" +} diff --git a/experiments/signum_evolve/export.py b/experiments/signum_evolve/export.py index 7a5134e..23867e6 100644 --- a/experiments/signum_evolve/export.py +++ b/experiments/signum_evolve/export.py @@ -1,4 +1,4 @@ -"""Adoption bundle export for signum-evolve v0.""" +"""Adoption bundle export for signum-evolve.""" from __future__ import annotations import shutil @@ -27,6 +27,11 @@ def copy_required(candidate_dir: Path, out_dir: Path) -> Dict[str, Path]: replay_target = out_dir / "historical_replay.json" shutil.copyfile(replay_source, replay_target) mapping["historicalReplay"] = (replay_source, replay_target) + diff_source = candidate_dir / "catalog_diff.json" + if diff_source.exists(): + diff_target = out_dir / "catalog_diff.json" + shutil.copyfile(diff_source, diff_target) + mapping["catalogDiff"] = (diff_source, diff_target) return {key: target for key, (_source, target) in mapping.items()} @@ -68,6 +73,7 @@ def write_report( candidate: Dict[str, Any], compare: Dict[str, Any], replay: Optional[Dict[str, Any]], + catalog_diff: Optional[Dict[str, Any]], ) -> None: decision = decision_with_replay(compare, replay) lines = [ @@ -80,7 +86,9 @@ def write_report( f"- Hard gate passed: `{compare.get('hardGatePassed')}`", f"- Improvements: `{len(compare.get('improvements', []))}`", f"- Regressions: `{len(compare.get('regressions', []))}`", + f"- Mutation count: `{candidate.get('mutationCount', len(candidate.get('mutations', [])))}`", "", + *catalog_diff_lines(catalog_diff), *historical_replay_lines(replay), "## Expected Files To Change If Adopted", "", @@ -96,11 +104,38 @@ def write_report( (out_dir / "report.md").write_text("\n".join(lines), encoding="utf-8") +def catalog_diff_lines(catalog_diff: Optional[Dict[str, Any]]) -> list[str]: + if catalog_diff is None: + return [] + lines = [ + "## Catalog Diff", + "", + f"- Changed rules: `{catalog_diff.get('changedRuleCount', 0)}`", + f"- Critical rule changes: `{catalog_diff.get('criticalRuleChangesCount', 0)}`", + ] + for change in catalog_diff.get("changes", []): + if not isinstance(change, dict): + continue + added = ", ".join(change.get("addedExcludedPathPrefixes", [])) or "none" + removed = ", ".join(change.get("removedExcludedPathPrefixes", [])) or "none" + lines.append( + "- `{rule}` ({severity}): add excluded prefixes `{added}`, remove `{removed}`".format( + added=added, + removed=removed, + rule=change.get("ruleId"), + severity=change.get("severity"), + ) + ) + lines.append("") + return lines + + def write_checklist(out_dir: Path) -> None: lines = [ "# Adoption Checklist", "", "- [ ] Candidate generated offline.", + "- [ ] Catalog diff reviewed.", "- [ ] No CRITICAL rules changed.", "- [ ] Hard gates passed.", "- [ ] Comparison reviewed.", @@ -123,6 +158,8 @@ def export_bundle(run_dir: Path, candidate_id: str, out_dir: Path) -> Path: compare = load_json(candidate_dir / "compare.json") replay_path = candidate_dir / "historical_replay.json" replay = load_json(replay_path) if replay_path.exists() else None - write_report(out_dir, candidate, compare, replay) + diff_path = candidate_dir / "catalog_diff.json" + catalog_diff = load_json(diff_path) if diff_path.exists() else None + write_report(out_dir, candidate, compare, replay, catalog_diff) write_checklist(out_dir) return out_dir diff --git a/experiments/signum_evolve/mutate.py b/experiments/signum_evolve/mutate.py index f00eaf5..f37cc70 100644 --- a/experiments/signum_evolve/mutate.py +++ b/experiments/signum_evolve/mutate.py @@ -1,4 +1,4 @@ -"""Mutation policy helpers for signum-evolve v0.""" +"""Mutation policy helpers for signum-evolve.""" from __future__ import annotations from typing import Any, Dict, Iterable, List @@ -44,6 +44,13 @@ def mutation_summary(candidate: Dict[str, Any]) -> str: mutation = candidate.get("mutation", {}) if not isinstance(mutation, dict): return "unknown mutation" + if isinstance(mutation.get("prefixes"), list): + prefixes = ",".join(str(prefix) for prefix in mutation["prefixes"]) + return "{operator} {ruleId} {prefixes}".format( + operator=mutation.get("operator", "unknown"), + ruleId=mutation.get("ruleId", "unknown"), + prefixes=prefixes, + ).strip() return "{operator} {ruleId} {prefix}".format( operator=mutation.get("operator", "unknown"), ruleId=mutation.get("ruleId", "unknown"), diff --git a/experiments/signum_evolve/report.py b/experiments/signum_evolve/report.py index ee7c6b9..cdacde5 100644 --- a/experiments/signum_evolve/report.py +++ b/experiments/signum_evolve/report.py @@ -1,4 +1,4 @@ -"""Report and leaderboard helpers for signum-evolve v0.""" +"""Report and leaderboard helpers for signum-evolve.""" from __future__ import annotations from pathlib import Path @@ -18,20 +18,73 @@ def baseline_summary_from_scorecard(scorecard: Dict[str, Any]) -> Dict[str, Any] } +def replay_drift_count(replay: Dict[str, Any] | None) -> int: + if not replay: + return 0 + return sum( + int(replay.get(field, 0) or 0) + for field in ( + "newFindingsCount", + "removedFindingsCount", + "changedSeverityCount", + "changedRuleCount", + ) + ) + + +def candidate_score(compare: Dict[str, Any], replay: Dict[str, Any] | None) -> Dict[str, Any]: + return { + "catalogChangedRuleCount": 0, + "hardGatePassed": compare.get("hardGatePassed") is True, + "improvementCount": len(compare.get("improvements", [])), + "regressionCount": len(compare.get("regressions", [])), + "removedCriticalFindingsCount": int((replay or {}).get("removedCriticalFindingsCount", 0) or 0), + "replayDriftCount": replay_drift_count(replay), + } + + +def rank_key(entry: Dict[str, Any]) -> tuple[Any, ...]: + score = entry.get("score", {}) + decision_order = {"accept": 0, "review": 1, "reject": 2} + status_order = {"better": 0, "equivalent": 1, "mixed": 2, "worse": 3} + return ( + 0 if score.get("hardGatePassed") else 1, + 0 if int(score.get("removedCriticalFindingsCount", 0) or 0) == 0 else 1, + decision_order.get(str(entry.get("decision")), 3), + status_order.get(str(entry.get("status")), 4), + int(score.get("regressionCount", 0) or 0), + -int(score.get("improvementCount", 0) or 0), + int(score.get("replayDriftCount", 0) or 0), + str(entry.get("candidateId", "")), + ) + + def leaderboard_entry(candidate_dir: Path) -> Dict[str, Any]: candidate = load_json(candidate_dir / "candidate.json") compare = load_json(candidate_dir / "compare.json") + catalog_diff_path = candidate_dir / "catalog_diff.json" + catalog_diff = load_json(catalog_diff_path) if catalog_diff_path.exists() else None replay_path = candidate_dir / "historical_replay.json" replay = load_json(replay_path) if replay_path.exists() else None + score = candidate_score(compare, replay) + if catalog_diff is not None: + score["catalogChangedRuleCount"] = int(catalog_diff.get("changedRuleCount", 0) or 0) entry = { "candidateId": candidate["candidateId"], "decision": decision_with_replay(compare, replay), "hardGatePassed": compare.get("hardGatePassed"), "improvements": compare.get("improvements", []), "mutation": candidate.get("mutation", {}), + "mutationCount": candidate.get("mutationCount", len(candidate.get("mutations", []))), "regressions": compare.get("regressions", []), + "score": score, "status": compare.get("status"), } + if catalog_diff is not None: + entry["catalogDiff"] = { + "changedRuleCount": catalog_diff.get("changedRuleCount", 0), + "criticalRuleChangesCount": catalog_diff.get("criticalRuleChangesCount", 0), + } if replay is not None: entry["historicalReplay"] = compact_historical_replay(replay) return entry @@ -39,9 +92,13 @@ def leaderboard_entry(candidate_dir: Path) -> Dict[str, Any]: def build_leaderboard(run_dir: Path, run_id: str, baseline_scorecard: Dict[str, Any]) -> Dict[str, Any]: candidate_dirs = sorted((run_dir / "candidates").glob("cand_*")) + entries = [leaderboard_entry(path) for path in candidate_dirs] + entries = sorted(entries, key=rank_key) + for index, entry in enumerate(entries, start=1): + entry["rank"] = index return { "baseline": baseline_summary_from_scorecard(baseline_scorecard), - "candidates": [leaderboard_entry(path) for path in candidate_dirs], + "candidates": entries, "runId": run_id, "schemaVersion": "1.0", } diff --git a/tests/test-signum-evolve-v1.sh b/tests/test-signum-evolve-v1.sh new file mode 100644 index 0000000..c103ac3 --- /dev/null +++ b/tests/test-signum-evolve-v1.sh @@ -0,0 +1,114 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(CDPATH= cd "$(dirname "$0")/.." && pwd)" +RUN_ID="test-v1" +RUN_DIR="$ROOT_DIR/experiments/signum_evolve/out/$RUN_ID" +WORK="$(mktemp -d)" +EXPORT_DIR="$(mktemp -d)" +trap 'rm -rf "$WORK" "$EXPORT_DIR"; rm -rf "$RUN_DIR"' EXIT + +hash_file() { + shasum -a 256 "$1" | awk '{print $1}' +} + +SCANNER_HASH_BEFORE=$(hash_file "$ROOT_DIR/lib/policy-scanner.sh") +CATALOG_HASH_BEFORE=$(hash_file "$ROOT_DIR/lib/policy-rules.json") +OVERLAY_SCANNER_HASH_BEFORE=$(hash_file "$ROOT_DIR/platforms/claude-code/lib/policy-scanner.sh") +OVERLAY_CATALOG_HASH_BEFORE=$(hash_file "$ROOT_DIR/platforms/claude-code/lib/policy-rules.json") + +rm -rf "$RUN_DIR" + +cat > "$WORK/evolve-v1-test.json" <<'JSON' +{ + "allowedPrefixes": [ + "docs/", + "examples/" + ], + "baselineCatalog": "lib/policy-rules.json", + "baselineScorecard": "evals/policy_scanner/baselines/current.json", + "maxMutationDepth": 2, + "operators": [ + "add_excluded_path_prefix" + ], + "schemaVersion": "1.0" +} +JSON + +python3 -m experiments.signum_evolve.cli generate \ + --repo-root "$ROOT_DIR" \ + --config "$WORK/evolve-v1-test.json" \ + --run-id "$RUN_ID" \ + --max-candidates 3 \ + --seed 42 \ + > "$WORK/generate.json" + +test -f "$RUN_DIR/run_manifest.json" +test -f "$RUN_DIR/leaderboard.json" +python3 -m json.tool "$RUN_DIR/run_manifest.json" >/dev/null +python3 -m json.tool "$RUN_DIR/leaderboard.json" >/dev/null +jq -e '.candidateCount == 3 and .maxMutationDepth == 2' "$RUN_DIR/run_manifest.json" >/dev/null +jq -e '.config == "external:evolve-v1-test.json"' "$RUN_DIR/run_manifest.json" >/dev/null + +for candidate in cand_000001 cand_000002 cand_000003; do + candidate_dir="$RUN_DIR/candidates/$candidate" + for file in candidate.json policy-rules.json catalog_diff.json eval.json compare.json; do + test -f "$candidate_dir/$file" + python3 -m json.tool "$candidate_dir/$file" >/dev/null + done +done + +MULTI="$RUN_DIR/candidates/cand_000003" +jq -e '.mutation.operator == "add_excluded_path_prefix_set"' "$MULTI/candidate.json" >/dev/null +jq -e '.mutationCount == 2 and (.mutations | length == 2)' "$MULTI/candidate.json" >/dev/null +jq -e '.mutation.prefixes == ["docs/", "examples/"]' "$MULTI/candidate.json" >/dev/null +jq -e '.changes[0].ruleId == "POLICY_WEAK_CRYPTO"' "$MULTI/catalog_diff.json" >/dev/null +jq -e '.changes[0].addedExcludedPathPrefixes == ["docs/", "examples/"]' "$MULTI/catalog_diff.json" >/dev/null +jq -e '.criticalRuleChangesCount == 0' "$MULTI/catalog_diff.json" >/dev/null + +jq -e '.candidates | length == 3' "$RUN_DIR/leaderboard.json" >/dev/null +jq -e '.candidates[0] | has("rank") and has("score") and has("catalogDiff") and has("mutationCount")' "$RUN_DIR/leaderboard.json" >/dev/null +jq -e '[.candidates[].rank] == [1, 2, 3]' "$RUN_DIR/leaderboard.json" >/dev/null +jq -e '.candidates[] | select(.candidateId == "cand_000003") | .mutationCount == 2' "$RUN_DIR/leaderboard.json" >/dev/null +jq -e '.candidates[] | select(.candidateId == "cand_000003") | .catalogDiff.changedRuleCount == 1' "$RUN_DIR/leaderboard.json" >/dev/null + +python3 -m experiments.signum_evolve.cli export \ + --run "$RUN_DIR" \ + --candidate cand_000003 \ + --out "$EXPORT_DIR/adoption-bundle" \ + > "$WORK/export.json" + +for file in candidate.json policy-rules.candidate.json catalog_diff.json eval.json compare.json report.md adoption-checklist.md; do + test -f "$EXPORT_DIR/adoption-bundle/$file" +done + +grep -Fq "## Catalog Diff" "$EXPORT_DIR/adoption-bundle/report.md" +grep -Fq "Changed rules: \`1\`" "$EXPORT_DIR/adoption-bundle/report.md" +grep -Fq "POLICY_WEAK_CRYPTO" "$EXPORT_DIR/adoption-bundle/report.md" +grep -Fq "Catalog diff reviewed" "$EXPORT_DIR/adoption-bundle/adoption-checklist.md" + +if [ "$SCANNER_HASH_BEFORE" != "$(hash_file "$ROOT_DIR/lib/policy-scanner.sh")" ]; then + echo "root scanner changed during v1 evolve run" >&2 + exit 1 +fi +if [ "$CATALOG_HASH_BEFORE" != "$(hash_file "$ROOT_DIR/lib/policy-rules.json")" ]; then + echo "root policy catalog changed during v1 evolve run" >&2 + exit 1 +fi +if [ "$OVERLAY_SCANNER_HASH_BEFORE" != "$(hash_file "$ROOT_DIR/platforms/claude-code/lib/policy-scanner.sh")" ]; then + echo "overlay scanner changed during v1 evolve run" >&2 + exit 1 +fi +if [ "$OVERLAY_CATALOG_HASH_BEFORE" != "$(hash_file "$ROOT_DIR/platforms/claude-code/lib/policy-rules.json")" ]; then + echo "overlay policy catalog changed during v1 evolve run" >&2 + exit 1 +fi + +if git -C "$ROOT_DIR" status --short .signum | grep -q .; then + echo ".signum has tracked or staged changes" >&2 + exit 1 +fi + +git -C "$ROOT_DIR" check-ignore -q experiments/signum_evolve/out/test-v1 + +echo "signum-evolve v1 smoke passed"