diff --git a/tests/fixtures/verdicts/expected.json b/tests/fixtures/verdicts/expected.json index 330e950..eff824f 100644 --- a/tests/fixtures/verdicts/expected.json +++ b/tests/fixtures/verdicts/expected.json @@ -23,5 +23,8 @@ "edge-leading-whitespace-verdict-01.md": {"mode": "advisory", "expected": "yellow"}, "edge-verdict-with-punctuation-01.md": {"mode": "advisory", "expected": "yellow"}, "edge-binary-structured-overrides-heuristic-01.md": {"mode": "binary", "expected": "pass"}, - "edge-red-heuristic-do-not-proceed-01.md": {"mode": "advisory", "expected": "red"} + "edge-red-heuristic-do-not-proceed-01.md": {"mode": "advisory", "expected": "red"}, + "gate1-decline-structured-01.md": {"mode": "gate1", "expected": "decline"}, + "gate1-decline-then-green-01.md": {"mode": "gate1", "expected": "green"}, + "gate1-decline-in-body-01.md": {"mode": "gate1", "expected": "green"} } diff --git a/tests/fixtures/verdicts/gate1-decline-in-body-01.md b/tests/fixtures/verdicts/gate1-decline-in-body-01.md new file mode 100644 index 0000000..ca12808 --- /dev/null +++ b/tests/fixtures/verdicts/gate1-decline-in-body-01.md @@ -0,0 +1,6 @@ +# Gate 1 — Design review + +The requirements are clear and well-scoped. I briefly weighed whether to +decline and escalate to /ceo, but a single lens suffices here — there is no +cross-cutting trade-off to synthesize. The word "decline" appears only as my +reasoning, never as a structured verdict, so it must NOT trigger escalation. diff --git a/tests/fixtures/verdicts/gate1-decline-structured-01.md b/tests/fixtures/verdicts/gate1-decline-structured-01.md new file mode 100644 index 0000000..47edb60 --- /dev/null +++ b/tests/fixtures/verdicts/gate1-decline-structured-01.md @@ -0,0 +1,7 @@ +# Gate 1 — Design review + +This issue sits at the intersection of security, cost, and architecture, and the +trade-offs genuinely require synthesis across multiple expert lenses — no single +reviewer can resolve them well. Escalating is the honest call. 
+ +## Verdict: decline diff --git a/tests/fixtures/verdicts/gate1-decline-then-green-01.md b/tests/fixtures/verdicts/gate1-decline-then-green-01.md new file mode 100644 index 0000000..a2b23d1 --- /dev/null +++ b/tests/fixtures/verdicts/gate1-decline-then-green-01.md @@ -0,0 +1,10 @@ +# Gate 1 — Design review + +At first read this looked like it needed multiple lenses. + +## Verdict: decline + +On reflection, the CTO lens alone is sufficient — the scope is clear and the +approach is reasonable. Revising my verdict. + +## Verdict: green diff --git a/tests/test_verdict_parser.py b/tests/test_verdict_parser.py index 05c31cd..0ad12ef 100644 --- a/tests/test_verdict_parser.py +++ b/tests/test_verdict_parser.py @@ -6,9 +6,11 @@ Runs against markdown fixtures in tests/fixtures/verdicts/ and validates each fixture's parsed verdict against tests/fixtures/verdicts/expected.json. -Two parser modes: - - advisory: gate1 + gate2 advisors → green | yellow | red - - binary: gate2 binary gate → pass | fail +Three parser modes: + - advisory: gate2 advisors (and gate1's green/yellow/red path) → green | yellow | red + - binary: gate2 binary gate → pass | fail + - gate1: gate1 decline-detection (start.md step 10) → green | yellow | red | decline + (`decline` is structured-only — never a heuristic result) -Both modes share the same two-step contract: +All three modes share the same two-step contract: 1. Structured verdict line (last wins): ^\\s*##\\s*Verdict:\\s*\\b @@ -36,6 +38,13 @@ r"^\s*##\s*Verdict:\s*(pass|fail)\b", re.IGNORECASE | re.MULTILINE ) +# gate1 emits the advisory tokens PLUS `decline` (the /ceo-escalation signal). +# This MUST match start.md step 10's expanded scan +# `^\s*##\s*Verdict:\s*(green|yellow|red|decline)\b`. 
# Structured gate1 verdict line: the three advisory tokens plus `decline`,
# matched case-insensitively at the start of any line of the document.
GATE1_PATTERN = re.compile(
    r"^\s*##\s*Verdict:\s*(green|yellow|red|decline)\b",
    flags=re.MULTILINE | re.IGNORECASE,
)


def parse_gate1(text: str) -> str:
    """Return the gate1 verdict for *text*: green, yellow, red or decline.

    Resolution happens in two steps:

    1. Every structured ``## Verdict: <token>`` line is scanned and the last
       one decides the result, lower-cased. This is the only route by which
       ``decline`` can be returned.
    2. When no structured line exists, the text is handed to
       ``parse_advisory``; a free-form mention of "decline" in the body is
       therefore never reported as a ``decline`` verdict by this function.
    """
    final_token = None
    # Walk every structured line and keep overwriting, so the last one wins.
    for hit in GATE1_PATTERN.finditer(text):
        final_token = hit.group(1)

    if final_token is None:
        # No structured verdict line at all: fall back to the advisory parser.
        return parse_advisory(text)

    return final_token.lower()