From b81d89d6bb1b698c25ac08f4520a2610dcd9911c Mon Sep 17 00:00:00 2001 From: JFK Date: Sat, 13 Jun 2026 14:32:15 +0900 Subject: [PATCH] test: add decline-token coverage to the verdict-parser contract (#88) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The reference parser (tests/test_verdict_parser.py) modeled only the advisory (green|yellow|red) and binary (pass|fail) tokens, but gate1 also emits `decline` (the /ceo-escalation signal, start.md step 10) — so the contract test and the command had drifted: a structured `## Verdict: decline` was unguarded. Add a `gate1` parser mode whose pattern mirrors start.md step 10 (green|yellow|red|decline), with `decline` as a structured-only token (a free-form mention in the reasoning body must not trigger it; the no-structured-line fallback reuses the advisory classifier, which can never return decline). Three fixtures lock the behavior: structured-decline → decline; decline-then-green → green (last-wins, no escalation); decline-in-body-only → green. Scope note: the broader "#88 parser-contract tests" scope was found to be ~90% covered already by the existing 25 fixtures + enum-sync-check.sh + jq-sync-check.sh (gate1, yellow). This PR closes the one genuine remaining gap — `decline`. 28 fixtures pass (was 25). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- tests/fixtures/verdicts/expected.json | 5 ++- .../verdicts/gate1-decline-in-body-01.md | 6 ++++ .../verdicts/gate1-decline-structured-01.md | 7 ++++ .../verdicts/gate1-decline-then-green-01.md | 10 ++++++ tests/test_verdict_parser.py | 35 +++++++++++++++++-- 5 files changed, 59 insertions(+), 4 deletions(-) create mode 100644 tests/fixtures/verdicts/gate1-decline-in-body-01.md create mode 100644 tests/fixtures/verdicts/gate1-decline-structured-01.md create mode 100644 tests/fixtures/verdicts/gate1-decline-then-green-01.md diff --git a/tests/fixtures/verdicts/expected.json b/tests/fixtures/verdicts/expected.json index 330e950..eff824f 100644 --- a/tests/fixtures/verdicts/expected.json +++ b/tests/fixtures/verdicts/expected.json @@ -23,5 +23,8 @@ "edge-leading-whitespace-verdict-01.md": {"mode": "advisory", "expected": "yellow"}, "edge-verdict-with-punctuation-01.md": {"mode": "advisory", "expected": "yellow"}, "edge-binary-structured-overrides-heuristic-01.md": {"mode": "binary", "expected": "pass"}, - "edge-red-heuristic-do-not-proceed-01.md": {"mode": "advisory", "expected": "red"} + "edge-red-heuristic-do-not-proceed-01.md": {"mode": "advisory", "expected": "red"}, + "gate1-decline-structured-01.md": {"mode": "gate1", "expected": "decline"}, + "gate1-decline-then-green-01.md": {"mode": "gate1", "expected": "green"}, + "gate1-decline-in-body-01.md": {"mode": "gate1", "expected": "green"} } diff --git a/tests/fixtures/verdicts/gate1-decline-in-body-01.md b/tests/fixtures/verdicts/gate1-decline-in-body-01.md new file mode 100644 index 0000000..ca12808 --- /dev/null +++ b/tests/fixtures/verdicts/gate1-decline-in-body-01.md @@ -0,0 +1,6 @@ +# Gate 1 — Design review + +The requirements are clear and well-scoped. I briefly weighed whether to +decline and escalate to /ceo, but a single lens suffices here — there is no +cross-cutting trade-off to synthesize. 
The word "decline" appears only as my +reasoning, never as a structured verdict, so it must NOT trigger escalation. diff --git a/tests/fixtures/verdicts/gate1-decline-structured-01.md b/tests/fixtures/verdicts/gate1-decline-structured-01.md new file mode 100644 index 0000000..47edb60 --- /dev/null +++ b/tests/fixtures/verdicts/gate1-decline-structured-01.md @@ -0,0 +1,7 @@ +# Gate 1 — Design review + +This issue sits at the intersection of security, cost, and architecture, and the +trade-offs genuinely require synthesis across multiple expert lenses — no single +reviewer can resolve them well. Escalating is the honest call. + +## Verdict: decline diff --git a/tests/fixtures/verdicts/gate1-decline-then-green-01.md b/tests/fixtures/verdicts/gate1-decline-then-green-01.md new file mode 100644 index 0000000..a2b23d1 --- /dev/null +++ b/tests/fixtures/verdicts/gate1-decline-then-green-01.md @@ -0,0 +1,10 @@ +# Gate 1 — Design review + +At first read this looked like it needed multiple lenses. + +## Verdict: decline + +On reflection, the CTO lens alone is sufficient — the scope is clear and the +approach is reasonable. Revising my verdict. + +## Verdict: green diff --git a/tests/test_verdict_parser.py b/tests/test_verdict_parser.py index 05c31cd..0ad12ef 100644 --- a/tests/test_verdict_parser.py +++ b/tests/test_verdict_parser.py @@ -6,9 +6,11 @@ Runs against markdown fixtures in tests/fixtures/verdicts/ and validates each fixture's parsed verdict against tests/fixtures/verdicts/expected.json. -Two parser modes: - - advisory: gate1 + gate2 advisors → green | yellow | red - - binary: gate2 binary gate → pass | fail +Three parser modes: + - advisory: gate2 advisors (and gate1's green/yellow/red path) → green | yellow | red + - binary: gate2 binary gate → pass | fail + - gate1: gate1 decline-detection (start.md step 10) → green | yellow | red | decline + (`decline` is structured-only — never a heuristic result) Both modes share the same two-step contract: 1. 
Structured verdict line (last wins): ^\\s*##\\s*Verdict:\\s*\\b @@ -36,6 +38,13 @@ r"^\s*##\s*Verdict:\s*(pass|fail)\b", re.IGNORECASE | re.MULTILINE ) +# gate1 emits the advisory tokens PLUS `decline` (the /ceo-escalation signal). +# This MUST match start.md step 10's expanded scan +# `^\s*##\s*Verdict:\s*(green|yellow|red|decline)\b`. +GATE1_PATTERN = re.compile( + r"^\s*##\s*Verdict:\s*(green|yellow|red|decline)\b", re.IGNORECASE | re.MULTILINE +) + # --- Heuristic tokens (step 2 of the contract) --- # Advisory red tokens (case-insensitive substring match) @@ -85,6 +94,24 @@ def parse_advisory(text: str) -> str: return "green" +def parse_gate1(text: str) -> str: + """Parse gate1 verdict: green | yellow | red | decline. + + Used by gate1 (start.md step 10's decline-detection scan). `decline` is the + /ceo-escalation signal and is ONLY ever a structured-line token — a free-form + mention of "decline" in the reasoning body must NOT trigger it. So the + heuristic fallback (no structured line) reuses the advisory classifier, which + can never return `decline`. + """ + # Step 1: structured verdict line — last wins (includes decline) + matches = GATE1_PATTERN.findall(text) + if matches: + return matches[-1].lower() + + # Step 2: no structured line — decline is never heuristic; defer to advisory. + return parse_advisory(text) + + def parse_binary(text: str) -> str: """Parse binary gate verdict: pass | fail. @@ -138,6 +165,8 @@ def main() -> int: actual = parse_advisory(text) elif mode == "binary": actual = parse_binary(text) + elif mode == "gate1": + actual = parse_gate1(text) else: print(f"FAIL {filename}: unknown mode '{mode}'") failed += 1