From b81d89d6bb1b698c25ac08f4520a2610dcd9911c Mon Sep 17 00:00:00 2001
From: JFK <fumikazu.kiyota@gmail.com>
Date: Sat, 13 Jun 2026 14:32:15 +0900
Subject: [PATCH] test: add decline-token coverage to the verdict-parser
 contract (#88)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The reference parser (tests/test_verdict_parser.py) modeled only the advisory
(green|yellow|red) and binary (pass|fail) tokens, but gate1 also emits `decline`
(the /ceo-escalation signal, start.md step 10) — so the contract test and the
command had drifted: a structured `## Verdict: decline` was unguarded.

Add a `gate1` parser mode whose pattern mirrors start.md step 10
(green|yellow|red|decline), with `decline` as a structured-only token: a
free-form mention in the reasoning body must not trigger it, so when no
structured line is present the fallback reuses the advisory classifier, which
can never return `decline`.

Three fixtures lock the behavior: structured-decline → decline; decline-then-
green → green (last-wins, no escalation); decline-in-body-only → green.

Scope note: the broader "#88 parser-contract tests" work was found to be ~90%
covered already by the existing 25 fixtures + enum-sync-check.sh +
jq-sync-check.sh (gate1, yellow). This PR closes the one genuine remaining
gap — `decline`.

28 fixtures pass (was 25).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 tests/fixtures/verdicts/expected.json         |  5 ++-
 .../verdicts/gate1-decline-in-body-01.md      |  6 ++++
 .../verdicts/gate1-decline-structured-01.md   |  7 ++++
 .../verdicts/gate1-decline-then-green-01.md   | 10 ++++++
 tests/test_verdict_parser.py                  | 35 +++++++++++++++++--
 5 files changed, 59 insertions(+), 4 deletions(-)
 create mode 100644 tests/fixtures/verdicts/gate1-decline-in-body-01.md
 create mode 100644 tests/fixtures/verdicts/gate1-decline-structured-01.md
 create mode 100644 tests/fixtures/verdicts/gate1-decline-then-green-01.md

diff --git a/tests/fixtures/verdicts/expected.json b/tests/fixtures/verdicts/expected.json
index 330e950..eff824f 100644
--- a/tests/fixtures/verdicts/expected.json
+++ b/tests/fixtures/verdicts/expected.json
@@ -23,5 +23,8 @@
   "edge-leading-whitespace-verdict-01.md":        {"mode": "advisory", "expected": "yellow"},
   "edge-verdict-with-punctuation-01.md":          {"mode": "advisory", "expected": "yellow"},
   "edge-binary-structured-overrides-heuristic-01.md": {"mode": "binary", "expected": "pass"},
-  "edge-red-heuristic-do-not-proceed-01.md":      {"mode": "advisory", "expected": "red"}
+  "edge-red-heuristic-do-not-proceed-01.md":      {"mode": "advisory", "expected": "red"},
+  "gate1-decline-structured-01.md":               {"mode": "gate1",    "expected": "decline"},
+  "gate1-decline-then-green-01.md":               {"mode": "gate1",    "expected": "green"},
+  "gate1-decline-in-body-01.md":                  {"mode": "gate1",    "expected": "green"}
 }
diff --git a/tests/fixtures/verdicts/gate1-decline-in-body-01.md b/tests/fixtures/verdicts/gate1-decline-in-body-01.md
new file mode 100644
index 0000000..ca12808
--- /dev/null
+++ b/tests/fixtures/verdicts/gate1-decline-in-body-01.md
@@ -0,0 +1,6 @@
+# Gate 1 — Design review
+
+The requirements are clear and well-scoped. I briefly weighed whether to
+decline and escalate to /ceo, but a single lens suffices here — there is no
+cross-cutting trade-off to synthesize. The word "decline" appears only as my
+reasoning, never as a structured verdict, so it must NOT trigger escalation.
diff --git a/tests/fixtures/verdicts/gate1-decline-structured-01.md b/tests/fixtures/verdicts/gate1-decline-structured-01.md
new file mode 100644
index 0000000..47edb60
--- /dev/null
+++ b/tests/fixtures/verdicts/gate1-decline-structured-01.md
@@ -0,0 +1,7 @@
+# Gate 1 — Design review
+
+This issue sits at the intersection of security, cost, and architecture, and the
+trade-offs genuinely require synthesis across multiple expert lenses — no single
+reviewer can resolve them well. Escalating is the honest call.
+
+## Verdict: decline
diff --git a/tests/fixtures/verdicts/gate1-decline-then-green-01.md b/tests/fixtures/verdicts/gate1-decline-then-green-01.md
new file mode 100644
index 0000000..a2b23d1
--- /dev/null
+++ b/tests/fixtures/verdicts/gate1-decline-then-green-01.md
@@ -0,0 +1,10 @@
+# Gate 1 — Design review
+
+At first read this looked like it needed multiple lenses.
+
+## Verdict: decline
+
+On reflection, the CTO lens alone is sufficient — the scope is clear and the
+approach is reasonable. Revising my verdict.
+
+## Verdict: green
diff --git a/tests/test_verdict_parser.py b/tests/test_verdict_parser.py
index 05c31cd..0ad12ef 100644
--- a/tests/test_verdict_parser.py
+++ b/tests/test_verdict_parser.py
@@ -6,9 +6,11 @@
 Runs against markdown fixtures in tests/fixtures/verdicts/ and validates
 each fixture's parsed verdict against tests/fixtures/verdicts/expected.json.
 
-Two parser modes:
-  - advisory: gate1 + gate2 advisors → green | yellow | red
-  - binary:   gate2 binary gate      → pass | fail
+Three parser modes:
+  - advisory: gate2 advisors (and gate1's green/yellow/red path) → green | yellow | red
+  - binary:   gate2 binary gate                                  → pass | fail
+  - gate1:    gate1 decline-detection (start.md step 10)         → green | yellow | red | decline
+              (`decline` is structured-only — never a heuristic result)
 
-Both modes share the same two-step contract:
+All three modes share the same two-step contract:
   1. Structured verdict line (last wins): ^\\s*##\\s*Verdict:\\s*<token>\\b
@@ -36,6 +38,13 @@
     r"^\s*##\s*Verdict:\s*(pass|fail)\b", re.IGNORECASE | re.MULTILINE
 )
 
+# gate1 emits the advisory tokens PLUS `decline` (the /ceo-escalation signal).
+# This MUST match start.md step 10's expanded scan
+# `^\s*##\s*Verdict:\s*(green|yellow|red|decline)\b`.
+GATE1_PATTERN = re.compile(
+    r"^\s*##\s*Verdict:\s*(green|yellow|red|decline)\b", re.IGNORECASE | re.MULTILINE
+)
+
 # --- Heuristic tokens (step 2 of the contract) ---
 
 # Advisory red tokens (case-insensitive substring match)
@@ -85,6 +94,24 @@ def parse_advisory(text: str) -> str:
     return "green"
 
 
+def parse_gate1(text: str) -> str:
+    """Parse gate1 verdict: green | yellow | red | decline.
+
+    Used by gate1 (start.md step 10's decline-detection scan). `decline` is the
+    /ceo-escalation signal and is ONLY ever a structured-line token — a free-form
+    mention of "decline" in the reasoning body must NOT trigger it. So the
+    heuristic fallback (no structured line) reuses the advisory classifier, which
+    can never return `decline`.
+    """
+    # Step 1: structured verdict line — last wins (includes decline)
+    matches = GATE1_PATTERN.findall(text)
+    if matches:
+        return matches[-1].lower()
+
+    # Step 2: no structured line — decline is never heuristic; defer to advisory.
+    return parse_advisory(text)
+
+
 def parse_binary(text: str) -> str:
     """Parse binary gate verdict: pass | fail.
 
@@ -138,6 +165,8 @@ def main() -> int:
             actual = parse_advisory(text)
         elif mode == "binary":
             actual = parse_binary(text)
+        elif mode == "gate1":
+            actual = parse_gate1(text)
         else:
             print(f"FAIL {filename}: unknown mode '{mode}'")
             failed += 1