From 68137659cfc3b5022f1a354dc822c348dec5d8d9 Mon Sep 17 00:00:00 2001
From: Jim Stratus <stratus@eotir.com>
Date: Wed, 10 Jun 2026 01:52:14 -0700
Subject: [PATCH 01/14] fix(dispatch): isolate reviewer exceptions + enforce
 roster policy
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

asyncio.gather now runs with return_exceptions=True and _dispatch_one is
wrapped in try/except, so one misconfigured reviewer (e.g. KeyError from a
bad aichat client key) can no longer cancel sibling reviewers or prevent
dispatch_summary.json from being written.

Wire resolve_roster + detect_host into roster parsing: disabled, tier,
privacy, custom_only and host_rules policies are now enforced on every
dispatch (previously resolve_roster had zero callers and the documented
claude-host auto-skip never happened). Explicitly-named reviewers count as
explicit_custom_only so custom-only entries stay reachable.

Confidence: high
Scope-risk: moderate
Directive: keep return_exceptions=True + per-reviewer try/except — one broken reviewer must never kill the run
Not-tested: live multi-reviewer dispatch with a real misconfigured route
---
 scripts/dispatch.py | 36 ++++++++++++++++++++++++++++++++----
 1 file changed, 32 insertions(+), 4 deletions(-)

diff --git a/scripts/dispatch.py b/scripts/dispatch.py
index 1c16f23..eb1a141 100644
--- a/scripts/dispatch.py
+++ b/scripts/dispatch.py
@@ -19,8 +19,9 @@
 sys.path.insert(0, str(Path(__file__).resolve().parent))
 from _common import (
     load_config, build_prompt, estimate_tokens, extract_json,
-    normalize_findings, ARGUS_HOME,
+    normalize_findings, resolve_roster,
 )
+from detect_host import detect as detect_host
 import adapters
 
 
@@ -91,7 +92,21 @@ async def _main_async(args) -> int:
     max_parallel = int(defaults["max_parallel"])
     ctx_safety = float(defaults["ctx_safety_ratio"])
 
-    roster = [r.strip() for r in args.roster.split(",") if r.strip()]
+    names = [r.strip() for r in args.roster.split(",") if r.strip()]
+    host, _ = detect_host()
+    # Explicitly-named reviewers count as explicit_custom_only so custom_only
+    # entries stay usable; disabled/tier/privacy/host_rules policies still apply.
+    roster, drops = resolve_roster(
+        cfg, "custom", names, host,
+        allow_free=bool(args.allow_free or defaults.get("allow_free")),
+        allow_logging=bool(args.allow_logging or defaults.get("allow_logging")),
+        explicit_custom_only=set(names),
+    )
+    for n, reason in drops:
+        sys.stderr.write(f"roster: dropped {n} — {reason}\n")
+    if not roster:
+        sys.stderr.write("roster: empty after policy filtering\n")
+        return 1
     diff = Path(args.diff).read_text(encoding="utf-8", errors="replace")
     prompt = build_prompt(diff, overlay=args.overlay)
 
@@ -121,11 +136,22 @@ async def _bounded(name: str) -> dict:
                 (reviews_dir / f"{name}.json").write_text(json.dumps(r, indent=2), encoding="utf-8")
                 return r
 
-            r = await _dispatch_one(name, spec, prompt, timeout)
+            try:
+                r = await _dispatch_one(name, spec, prompt, timeout)
+            except Exception as e:
+                r = {"name": name, "findings": [], "exit_code": 1,
+                     "error": f"{type(e).__name__}: {e}"}
             (reviews_dir / f"{name}.json").write_text(json.dumps(r, indent=2), encoding="utf-8")
             return r
 
-    results = await asyncio.gather(*[_bounded(n) for n in roster])
+    gathered = await asyncio.gather(*[_bounded(n) for n in roster], return_exceptions=True)
+    results = []
+    for n, r in zip(roster, gathered):
+        if isinstance(r, BaseException):
+            results.append({"name": n, "findings": [], "exit_code": 1,
+                            "error": f"{type(r).__name__}: {r}"})
+        else:
+            results.append(r)
 
     summary = {
         "roster": roster,
@@ -157,6 +183,8 @@ def main() -> int:
     ap.add_argument("--diff", required=True)
     ap.add_argument("--overlay", default=None, help="prompt overlay name (security|deep|...)")
     ap.add_argument("--timeout", type=int, default=None, help="override reviewer_timeout_sec for this run")
+    ap.add_argument("--allow-free", action="store_true", help="include free-tier reviewers")
+    ap.add_argument("--allow-logging", action="store_true", help="include reviewers with privacy: LOGS")
     args = ap.parse_args()
     return asyncio.run(_main_async(args))
 

From a4dc34ad6fb97e27a503de9cd3571e3b1ada2a46 Mon Sep 17 00:00:00 2001
From: Jim Stratus <stratus@eotir.com>
Date: Wed, 10 Jun 2026 01:52:14 -0700
Subject: [PATCH 02/14] fix(bench): zero-score failed calls instead of perfect
 F1 on clean-baseline
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

_bench_reviewer now scores any run with non-zero exit_code as P=R=F1=0
(a failed call is not "correctly found nothing"), and aggregate_bench's
_rescore_run skips the tp==fp==fn==0 -> 1.0 clean-baseline rule for runs
with an exit_code/error — previously wall-cap stubs were rescored from a
deliberate 0.0 back to a perfect 1.0. Also drop disabled reviewers from
profile-derived benchmark rosters (explicit --roster still forces them).

Constraint: clean-baseline 1.0 only for successful runs with zero findings
Confidence: high
Scope-risk: narrow
---
 scripts/aggregate_bench.py |  5 +++++
 scripts/benchmark.py       | 23 ++++++++++++++++++++---
 2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/scripts/aggregate_bench.py b/scripts/aggregate_bench.py
index 528106b..23823ff 100644
--- a/scripts/aggregate_bench.py
+++ b/scripts/aggregate_bench.py
@@ -26,7 +26,12 @@ def _rescore_run(run: dict) -> dict:
     """Recompute P/R/F1 from tp/fp/fn with the clean-baseline rule (repairs pre-fix data).
 
     Clean-baseline rule: tp==fp==fn==0 → P=R=F1=1.0 (reviewer correctly found nothing).
+    Failed calls (non-zero exit_code or an error, e.g. wall-cap stubs) are
+    zero-scored — tp==fp==fn==0 there means "never ran", not "found nothing".
     """
+    if int(run.get("exit_code", 0) or 0) != 0 or run.get("error"):
+        run["precision"], run["recall"], run["f1"] = 0.0, 0.0, 0.0
+        return run
     tp = int(run.get("tp", 0) or 0)
     fp = int(run.get("fp", 0) or 0)
     fn = int(run.get("fn", 0) or 0)
diff --git a/scripts/benchmark.py b/scripts/benchmark.py
index fa2e676..1b9cf18 100644
--- a/scripts/benchmark.py
+++ b/scripts/benchmark.py
@@ -186,7 +186,13 @@ async def _bench_reviewer(name: str, spec: dict, fixtures: list[dict],
             except Exception as e:
                 await _log_progress(f"{name:<16} {fx['name']:<18} run {idx+1}/{runs}  EXCEPTION: {type(e).__name__}: {str(e)[:80]}")
                 d = {"findings": [], "latency_sec": 0.0, "exit_code": 1, "error": f"{type(e).__name__}: {e}"}
-            scored = _score(d["findings"], fx["ground_truth"])
+            if d.get("exit_code", 1) != 0:
+                # A failed call is not "correctly found nothing" — zero-score it
+                # so broken reviewers can't earn F1=1.0 on clean-baseline.
+                scored = {"tp": 0, "fp": 0, "fn": len(fx["ground_truth"].get("issues", [])),
+                          "precision": 0.0, "recall": 0.0, "f1": 0.0}
+            else:
+                scored = _score(d["findings"], fx["ground_truth"])
             run_data.append({
                 "run_idx": idx,
                 "n_findings": len(d["findings"]),
@@ -402,8 +408,19 @@ async def _main_async(args) -> int:
     else:
         roster = list(cfg["profiles"]["panel"]["members"])
 
-    # Filter out reviewers not in registry
-    roster = [n for n in roster if n in cfg["reviewers"]]
+    # Filter out reviewers not in registry; drop disabled ones unless the user
+    # named them explicitly via --roster (explicit naming = intent to test).
+    explicit = bool(args.roster)
+    kept = []
+    for n in roster:
+        if n not in cfg["reviewers"]:
+            print(f"skip {n}: not in registry", file=sys.stderr)
+            continue
+        if cfg["reviewers"][n].get("disabled") and not explicit:
+            print(f"skip {n}: disabled in config (name via --roster to force)", file=sys.stderr)
+            continue
+        kept.append(n)
+    roster = kept
 
     total_calls = len(roster) * len(fixtures) * args.runs
     print(f"Benchmarking {len(roster)} reviewers × {len(fixtures)} fixtures × {args.runs} runs = {total_calls} calls", file=sys.stderr)

From b530397438ba4a2ad9e88e7d7636d5fbd0f90782 Mon Sep 17 00:00:00 2001
From: Jim Stratus <stratus@eotir.com>
Date: Wed, 10 Jun 2026 01:52:14 -0700
Subject: [PATCH 03/14] fix(cost): honor 0/1/2 exit contract, add --yes-cost,
 reject unknown reviewers

The trailing return-1 inside the balance-check block made every default
invocation exit warn even at $0.00 (and --skip-balance-check masked real
warns as 0). Exit code is now computed once: 0 ok, 1 warn (threshold or OR
balance), 2 block. --yes-cost exists now and downgrades a block to a warn,
matching the message and SKILL.md. Unknown roster names fail fast with
exit 2 instead of being costed as $0 "paid CLI sub".

Rejected: removing the --yes-cost mention from the block message | docs and CLAUDE.md promise the override
Confidence: high
Scope-risk: narrow
---
 scripts/estimate_cost.py | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/scripts/estimate_cost.py b/scripts/estimate_cost.py
index 41b8d40..d3bfd1e 100644
--- a/scripts/estimate_cost.py
+++ b/scripts/estimate_cost.py
@@ -28,6 +28,7 @@ def main() -> int:
     ap.add_argument("--runs-per-fixture", type=int, default=1)
     ap.add_argument("--fixtures", type=int, default=1)
     ap.add_argument("--skip-balance-check", action="store_true", help="skip the OpenRouter balance pre-flight")
+    ap.add_argument("--yes-cost", action="store_true", help="downgrade a cost block to a warning (exit 1 instead of 2)")
     args = ap.parse_args()
 
     cfg = load_config()
@@ -39,6 +40,14 @@ def main() -> int:
     in_tokens = estimate_tokens(diff_text) + prompt_overhead
     roster = [r.strip() for r in args.roster.split(",") if r.strip()]
 
+    unknown = [n for n in roster if n not in cfg["reviewers"]]
+    if unknown:
+        sys.stderr.write(
+            f"ERROR: unknown reviewer(s): {', '.join(unknown)}. "
+            f"Known: {', '.join(sorted(cfg['reviewers']))}\n"
+        )
+        return 2
+
     rows = []
     total = 0.0
     per_unit_multiplier = args.runs_per_fixture * args.fixtures
@@ -78,11 +87,17 @@ def main() -> int:
     }
     print(json.dumps(out, indent=2))
 
+    rc = 0
     if total >= block:
-        sys.stderr.write(f"\nBLOCK: estimated ${total:.2f} exceeds hard cap ${block:.2f}. Pass --yes-cost to override.\n")
-        return 2
-    if total >= warn:
+        if args.yes_cost:
+            sys.stderr.write(f"\nWARN: estimated ${total:.2f} exceeds hard cap ${block:.2f} — overridden with --yes-cost.\n")
+            rc = 1
+        else:
+            sys.stderr.write(f"\nBLOCK: estimated ${total:.2f} exceeds hard cap ${block:.2f}. Pass --yes-cost to override.\n")
+            return 2
+    elif total >= warn:
         sys.stderr.write(f"\nWARN: estimated ${total:.2f} exceeds soft threshold ${warn:.2f}.\n")
+        rc = 1
 
     # OR balance pre-flight for non-dry invocations
     if not args.skip_balance_check:
@@ -99,12 +114,11 @@ def main() -> int:
                 available = info.get("available_usd")
                 safety = float(d.get("or_balance_safety_factor", 2.0))
                 if available is not None and available < total * safety:
-                    sys.stderr.write(f"\nOR BALANCE WARN: available ${available:.4f} < {safety}× estimate ${total:.4f}. Top up or use --yes-cost at dispatch time.\n")
-                    return max(1, 1)  # warn-level; dispatch will block via its own gate
+                    sys.stderr.write(f"\nOR BALANCE WARN: available ${available:.4f} < {safety}× estimate ${total:.4f}. Top up before dispatching.\n")
+                    rc = max(rc, 1)
             except Exception as e:
                 sys.stderr.write(f"OR balance check failed (non-fatal): {e}\n")
-        return 1
-    return 0
+    return rc
 
 
 if __name__ == "__main__":

From d26b5687ad158ca2869f52059fc68bf53942c290 Mon Sep 17 00:00:00 2001
From: Jim Stratus <stratus@eotir.com>
Date: Wed, 10 Jun 2026 01:52:14 -0700
Subject: [PATCH 04/14] fix(common): string-aware brace scan in extract_json;
 clamp off-enum severities

The balanced-brace fallback counted braces inside JSON string values, so a
finding description containing "}" truncated the chunk and dropped the whole
reviewer response as parse_error. The scanner now tracks in-string/escape
state. normalize_findings clamps severities outside the prompt enum to
medium so _to_gsd and SEVERITY_RANK consumers can no longer silently drop
or mis-rank them. Adds regression tests for both.

Confidence: high
Scope-risk: narrow
Not-tested: exotic reviewer outputs with unbalanced quotes outside JSON
---
 scripts/_common.py         | 21 ++++++++++++++++++---
 tests/test_extract_json.py | 33 ++++++++++++++++++++++++++++++++-
 2 files changed, 50 insertions(+), 4 deletions(-)

diff --git a/scripts/_common.py b/scripts/_common.py
index 77d1198..f41c785 100644
--- a/scripts/_common.py
+++ b/scripts/_common.py
@@ -103,9 +103,22 @@ def extract_json(text: str) -> dict | None:
             if start < 0:
                 continue
             depth = 0
+            in_str = False
+            esc = False
             for i in range(start, len(text)):
                 c = text[i]
-                if c == "{":
+                if in_str:
+                    # Braces inside string values must not move the depth counter.
+                    if esc:
+                        esc = False
+                    elif c == "\\":
+                        esc = True
+                    elif c == '"':
+                        in_str = False
+                    continue
+                if c == '"':
+                    in_str = True
+                elif c == "{":
                     depth += 1
                 elif c == "}":
                     depth -= 1
@@ -115,7 +128,6 @@ def extract_json(text: str) -> dict | None:
                             return json.loads(chunk)
                         except Exception:
                             break
-                        break
     return None
 
 
@@ -127,10 +139,13 @@ def normalize_findings(raw: Any) -> list[dict]:
         if not isinstance(f, dict):
             continue
         try:
+            sev = str(f.get("severity", "medium")).lower().strip()
+            if sev not in SEVERITY_RANK:
+                sev = "medium"
             out.append({
                 "file": str(f.get("file", "")).strip(),
                 "line": int(f.get("line", 0) or 0),
-                "severity": str(f.get("severity", "medium")).lower().strip(),
+                "severity": sev,
                 "category": str(f.get("category", "bug")).lower().strip(),
                 "description": str(f.get("description", "")).strip(),
                 "confidence": max(0, min(100, int(f.get("confidence", 50) or 50))),
diff --git a/tests/test_extract_json.py b/tests/test_extract_json.py
index 0961c2f..e5c3cfe 100644
--- a/tests/test_extract_json.py
+++ b/tests/test_extract_json.py
@@ -6,7 +6,7 @@
 
 sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "scripts"))
 
-from _common import extract_json  # noqa: E402
+from _common import extract_json, normalize_findings  # noqa: E402
 
 
 def test_direct_json():
@@ -57,6 +57,34 @@ def test_nested_objects():
     assert r["findings"][0]["meta"]["nested"] is True
 
 
+def test_brace_inside_string_value():
+    """Braces inside string values must not break the balanced-brace scan."""
+    text = ('Here is my review: {"findings": [{"file": "a.py", "line": 1, '
+            '"description": "unmatched } and { braces in a format string"}]}')
+    r = extract_json(text)
+    assert isinstance(r, dict)
+    assert r["findings"][0]["file"] == "a.py"
+
+
+def test_escaped_quote_inside_string_value():
+    text = ('prose {"findings": [{"file": "a.py", '
+            '"description": "says \\"x}\\" here"}]}')
+    r = extract_json(text)
+    assert isinstance(r, dict)
+    assert "x}" in r["findings"][0]["description"]
+
+
+def test_normalize_findings_clamps_unknown_severity():
+    """Off-enum severities map to medium so no consumer silently drops them."""
+    raw = [
+        {"file": "a.py", "line": 1, "severity": "BLOCKER", "description": "x", "confidence": 90},
+        {"file": "b.py", "line": 2, "severity": "high", "description": "y", "confidence": 90},
+    ]
+    out = normalize_findings(raw)
+    assert out[0]["severity"] == "medium"
+    assert out[1]["severity"] == "high"
+
+
 if __name__ == "__main__":
     import traceback
     tests = [
@@ -67,6 +95,9 @@ def test_nested_objects():
         test_empty_input_returns_none,
         test_malformed_returns_none,
         test_nested_objects,
+        test_brace_inside_string_value,
+        test_escaped_quote_inside_string_value,
+        test_normalize_findings_clamps_unknown_severity,
     ]
     failed = 0
     for t in tests:

From 177b057b5845176d4925c319935f5d6041ced0f1 Mon Sep 17 00:00:00 2001
From: Jim Stratus <stratus@eotir.com>
Date: Wed, 10 Jun 2026 01:52:14 -0700
Subject: [PATCH 05/14] fix(adapter): copilot-cli pipes prompt via stdin
 (Windows ARG_MAX)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Same fix as gemini-cli (91b2f2e): the full prompt left argv — where >32 KB
diffs hit the Windows CreateProcess command-line limit — and is now piped
on stdin with a short -p pointer. This also removes the broken redaction
(cmd[:6] kept the full prompt at index 2 while appending a decoy token).

Constraint: Copilot CLI programmatic mode combines piped stdin with -p
Confidence: medium
Scope-risk: narrow
Not-tested: live copilot CLI call (not installed in this environment); verify on Windows before relying on the copilot reviewer
---
 config.yaml                     |  6 +++---
 scripts/adapters/copilot_cli.py | 23 +++++++++--------------
 2 files changed, 12 insertions(+), 17 deletions(-)

diff --git a/config.yaml b/config.yaml
index 6f59a60..9d79643 100644
--- a/config.yaml
+++ b/config.yaml
@@ -293,6 +293,6 @@ cli_commands:
   codex-cli:    ["codex", "exec", "--skip-git-repo-check", "-"]   # stdin input; skip git-repo trust check
   claude-cli:   ["claude", "-p", "--output-format", "text", "--bare"]   # stdin
   opencode-cli: ["opencode", "run", "-"]                     # stdin input
-  copilot-cli:  ["copilot", "-p", "{prompt}", "--model", "{model}",
-                 "--allow-all-tools", "--no-color",
-                 "--output-format", "text"]                  # prompt-as-arg (~30KB Windows arg limit)
+  copilot-cli:  ["copilot", "-p", "Follow the review instructions provided on stdin.",
+                 "--model", "{model}", "--allow-all-tools", "--no-color",
+                 "--output-format", "text"]                  # prompt via STDIN (Windows ARG_MAX; same fix as gemini-cli)
diff --git a/scripts/adapters/copilot_cli.py b/scripts/adapters/copilot_cli.py
index 324e6d6..e29e704 100644
--- a/scripts/adapters/copilot_cli.py
+++ b/scripts/adapters/copilot_cli.py
@@ -1,9 +1,12 @@
 """GitHub Copilot CLI adapter.
 
 Copilot CLI uses GPT-family models (e.g., gpt-5.2) through the user's paid
-GitHub Copilot subscription. Invocation:
-    copilot -p "<prompt>" --model <model> --allow-all-tools \
-            --no-color --no-banner --output-format text
+GitHub Copilot subscription. The full prompt is piped via STDIN (combined by
+the CLI with the short -p pointer) — the previous {prompt}-in-argv approach
+breaks on Windows for prompts > ~32 KB (ARG_MAX limit), the same bug fixed
+for gemini-cli. Invocation:
+    copilot -p "<stdin pointer>" --model <model> --allow-all-tools \
+            --no-color --output-format text
 
 Value vs direct GPT routes:
   - Uses the user's Copilot paid sub (no API spend)
@@ -26,22 +29,14 @@ async def send(prompt: str, route_cfg: dict, timeout: int) -> dict:
     cfg = load_config()
     template = list(cfg["cli_commands"]["copilot-cli"])
     model = route_cfg.get("model", "")
-    cmd = []
-    for part in template:
-        if part == "{model}":
-            cmd.append(model)
-        elif part == "{prompt}":
-            cmd.append(prompt)
-        else:
-            cmd.append(part)
+    cmd = [model if part == "{model}" else part for part in template]
     env = os.environ.copy()
     env.setdefault("COPILOT_ALLOW_ALL", "1")
-    # Prompt already embedded as arg; no stdin needed
-    rc, stdout, stderr, dt = await run_subprocess(cmd, "", timeout, env=env)
+    rc, stdout, stderr, dt = await run_subprocess(cmd, prompt, timeout, env=env)
     return {
         "route": "copilot-cli",
         "model": model,
-        "cmd": cmd[:6] + ["...", "<prompt>"] if len(cmd) > 6 else cmd,
+        "cmd": cmd,
         "exit_code": rc,
         "stdout": stdout,
         "stderr": stderr,

From abd6f144e6b5534d496d0b0c5e995f4dbdbae9d4 Mon Sep 17 00:00:00 2001
From: Jim Stratus <stratus@eotir.com>
Date: Wed, 10 Jun 2026 01:52:14 -0700
Subject: [PATCH 06/14] fix(stats,host): --since works on benchmarks table;
 codex detection markers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

stats.py compared dashed ISO --since values against the benchmarks table's
compact %Y%m%dT%H%M%S timestamps, where '0' > '-' at index 4 made every
2026+ row pass any cutoff. Both sides are now normalized (strip -/:) before
the lexical compare. detect_host.py no longer treats any CODEX_* env var as
proof of a Codex host — a user's exported CODEX_API_KEY misrouted the
roster; detection now matches specific session markers only.

Confidence: high
Scope-risk: narrow
Not-tested: exact env markers a live Codex CLI session sets; parent-process walk remains the backstop
---
 scripts/detect_host.py | 9 +++++++--
 scripts/stats.py       | 7 +++++--
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/scripts/detect_host.py b/scripts/detect_host.py
index ff08e73..2a3798b 100644
--- a/scripts/detect_host.py
+++ b/scripts/detect_host.py
@@ -30,8 +30,13 @@ def detect() -> tuple[str, list[str]]:
     if "CLAUDE_CODE_SESSION" in env or "CLAUDE_CODE_SIMPLE" in env:
         signals.append("env:CLAUDE_CODE_*")
         return "claude", signals
-    if any(k.startswith("CODEX_") for k in env):
-        signals.append("env:CODEX_*")
+    # Specific session markers only — a prefix scan would misfire on user
+    # credentials like CODEX_API_KEY exported in a shell profile.
+    codex_markers = ("CODEX_SANDBOX", "CODEX_SANDBOX_NETWORK_DISABLED",
+                     "CODEX_THREAD_ID", "CODEX_SESSION_ID")
+    codex_hit = next((k for k in codex_markers if k in env), None)
+    if codex_hit:
+        signals.append(f"env:{codex_hit}")
         return "codex", signals
     if env.get("GEMINI_CLI") == "1" or "GEMINI_CLI_SESSION" in env:
         signals.append("env:GEMINI_CLI")
diff --git a/scripts/stats.py b/scripts/stats.py
index 9c9efd0..54f4d41 100644
--- a/scripts/stats.py
+++ b/scripts/stats.py
@@ -31,8 +31,11 @@ def main() -> int:
     where = ""
     params: list = []
     if args.since:
-        where = "WHERE ts >= ?"
-        params = [args.since]
+        # runs.ts is dashed ISO-8601 ('2026-06-10T12:00:00+00:00') while
+        # benchmarks.ts is compact ('20260610T120000'); strip '-' and ':' from
+        # both sides so the lexical compare is valid for either format.
+        where = "WHERE REPLACE(REPLACE(ts, '-', ''), ':', '') >= ?"
+        params = [args.since.replace("-", "").replace(":", "")]
 
     # Per-reviewer aggregates from reviewer_runs
     cur.execute(f"""

From 3dc1e3cfc7b804d1cf2b2c5611dec96389ecba17 Mon Sep 17 00:00:00 2001
From: Jim Stratus <stratus@eotir.com>
Date: Wed, 10 Jun 2026 02:15:07 -0700
Subject: [PATCH 07/14] fix(roster): explicit --roster semantics; one policy
 engine for dispatch+benchmark
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Round-1's resolve_roster wiring applied profile-grade policy to explicit
rosters: explicitly-named disabled reviewers were hard-dropped with no force
option, and host_rules add injected claude into every dispatch — under the
SKILL.md subagent-per-reviewer pattern that meant K concurrent claude calls
racing on reviews/claude.json. resolve_roster gains explicit=True: named
reviewers waive disabled/custom_only, host add is not applied; host skip and
tier/privacy gates still hold. benchmark.py now uses the same resolver
(profiles get full policy incl. the claude-host auto-skip), dispatch records
drops in dispatch_summary.json, and estimate_cost's unknown-reviewer error
is labeled INVALID ROSTER so it cannot be read as a cost BLOCK.

Constraint: explicit naming = intent; host skip is never waived (nested-invocation safety)
Rejected: keeping benchmark's inline filter | third divergent policy copy was the bug
Confidence: high
Scope-risk: moderate
Directive: roster policy lives ONLY in resolve_roster — never add inline filters
---
 scripts/_common.py       | 142 +++++++++++++++++++++++++++++++--------
 scripts/benchmark.py     |  71 +++++++++++---------
 scripts/dispatch.py      |  12 ++--
 scripts/estimate_cost.py |   9 +--
 4 files changed, 166 insertions(+), 68 deletions(-)

diff --git a/scripts/_common.py b/scripts/_common.py
index f41c785..ae4adc5 100644
--- a/scripts/_common.py
+++ b/scripts/_common.py
@@ -2,9 +2,11 @@
 from __future__ import annotations
 
 import asyncio
+import functools
 import json
 import os
 import re
+import signal
 import sqlite3
 import subprocess
 import sys
@@ -28,19 +30,27 @@
 SEVERITY_RANK = {"critical": 0, "high": 1, "medium": 2, "low": 3}
 
 
+@functools.lru_cache(maxsize=8)
 def load_config(path: Path | None = None) -> dict:
+    """Parse config.yaml once per process. Returns a shared cached dict —
+    treat it as read-only (every adapter call goes through here)."""
     p = path or CONFIG_PATH
     with open(p, "r", encoding="utf-8") as f:
         return yaml.safe_load(f)
 
 
+@functools.lru_cache(maxsize=16)
+def _read_text_cached(path: Path) -> str:
+    return path.read_text(encoding="utf-8")
+
+
 def build_prompt(diff: str, overlay: str | None = None) -> str:
-    tpl = PROMPT_PATH.read_text(encoding="utf-8")
+    tpl = _read_text_cached(PROMPT_PATH)
     overlay_text = ""
     if overlay:
         overlay_path = OVERLAYS_DIR / f"{overlay}.md"
         if overlay_path.exists():
-            overlay_text = overlay_path.read_text(encoding="utf-8")
+            overlay_text = _read_text_cached(overlay_path)
     # Escape ``` in the diff so it cannot terminate the outer fence early.
     # Use a zero-width joiner between backticks; reviewers see visually-identical
     # content while the fence stays intact.
@@ -97,8 +107,35 @@ def extract_json(text: str) -> dict | None:
                 continue
             start_range = [text.rfind("{", 0, idx)]
         else:
-            # Try every `{` from largest block first
-            start_range = [i for i, c in enumerate(text) if c == "{"]
+            # Last resort: one O(n) string-aware pass collecting every balanced
+            # {...} span via a stack, then bounded parse attempts in document
+            # order. (The old per-'{' rescan was O(n²) — minutes of CPU on
+            # brace-heavy non-JSON reviewer output.)
+            spans: list[tuple[int, int]] = []
+            stack: list[int] = []
+            in_str = esc = False
+            for i, c in enumerate(text):
+                if in_str:
+                    if esc:
+                        esc = False
+                    elif c == "\\":
+                        esc = True
+                    elif c == '"':
+                        in_str = False
+                    continue
+                if c == '"':
+                    in_str = True
+                elif c == "{":
+                    stack.append(i)
+                elif c == "}" and stack:
+                    spans.append((stack.pop(), i))
+            spans.sort()
+            for start, end in spans[:50]:
+                try:
+                    return json.loads(text[start : end + 1])
+                except Exception:
+                    continue
+            return None
         for start in start_range:
             if start < 0:
                 continue
@@ -155,6 +192,12 @@ def normalize_findings(raw: Any) -> list[dict]:
     return out
 
 
+@functools.lru_cache(maxsize=32)
+def _which_cached(name: str) -> str | None:
+    import shutil
+    return shutil.which(name)
+
+
 def _resolve_cmd(cmd: list[str]) -> list[str]:
     """On Windows, npm-installed shims are `.cmd` files that bash finds via PATH
     but Python's subprocess (with shell=False) does not. Use shutil.which to
@@ -162,46 +205,76 @@ def _resolve_cmd(cmd: list[str]) -> list[str]:
     """
     if not cmd:
         return cmd
-    import shutil
-    resolved = shutil.which(cmd[0])
+    resolved = _which_cached(cmd[0])
     if resolved:
         return [resolved] + cmd[1:]
     return cmd
 
 
+def _kill_tree(proc: subprocess.Popen) -> None:
+    """Kill the process AND its children. npm .cmd shims spawn a node grandchild
+    that survives a plain proc.kill(), holds the captured pipes, and hangs the
+    run — the bug that got the gemini reviewer disabled."""
+    try:
+        if os.name == "nt":
+            subprocess.run(["taskkill", "/T", "/F", "/PID", str(proc.pid)],
+                           capture_output=True, check=False)
+        else:
+            os.killpg(os.getpgid(proc.pid), signal.SIGKILL)
+    except Exception:
+        try:
+            proc.kill()
+        except Exception:
+            pass
+
+
 async def run_subprocess(cmd: list[str], stdin_data: str, timeout: int,
                          env: dict | None = None, cwd: str | None = None) -> tuple[int, str, str, float]:
     """Run a subprocess with stdin + timeout. Returns (rc, stdout, stderr, elapsed_sec).
 
-    Uses subprocess.run inside a thread for simplicity and Windows compatibility.
-    Resolves argv[0] via shutil.which so npm/.cmd shims work.
+    Runs in a thread for simplicity and Windows compatibility. Resolves argv[0]
+    via shutil.which so npm/.cmd shims work. On timeout the whole process TREE
+    is killed (POSIX: own session + killpg; Windows: taskkill /T /F).
     """
     resolved_cmd = _resolve_cmd(cmd)
 
     def _runit() -> tuple[int, str, str, float]:
         t0 = time.monotonic()
+        popen_kwargs: dict = {}
+        if os.name != "nt":
+            popen_kwargs["start_new_session"] = True
         try:
-            result = subprocess.run(
+            proc = subprocess.Popen(
                 resolved_cmd,
-                input=stdin_data.encode("utf-8"),
-                capture_output=True,
-                timeout=timeout,
+                stdin=subprocess.PIPE,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
                 env=env,
                 cwd=cwd,
-                check=False,
                 shell=False,
+                **popen_kwargs,
             )
+        except FileNotFoundError as e:
+            return 127, "", f"command not found: {e}", time.monotonic() - t0
+        except Exception as e:
+            return 1, "", f"launch error: {type(e).__name__}: {e}", time.monotonic() - t0
+        try:
+            out, err = proc.communicate(input=stdin_data.encode("utf-8"), timeout=timeout)
             return (
-                result.returncode,
-                result.stdout.decode("utf-8", errors="replace"),
-                result.stderr.decode("utf-8", errors="replace"),
+                proc.returncode,
+                out.decode("utf-8", errors="replace"),
+                err.decode("utf-8", errors="replace"),
                 time.monotonic() - t0,
             )
         except subprocess.TimeoutExpired:
-            return 124, "", "timeout", time.monotonic() - t0
-        except FileNotFoundError as e:
-            return 127, "", f"command not found: {e}", time.monotonic() - t0
+            _kill_tree(proc)
+            try:
+                proc.communicate(timeout=5)
+            except Exception:
+                pass
+            return 124, "", "timeout (process tree killed)", time.monotonic() - t0
         except Exception as e:
+            _kill_tree(proc)
             return 1, "", f"launch error: {type(e).__name__}: {e}", time.monotonic() - t0
 
     return await asyncio.to_thread(_runit)
@@ -209,8 +282,16 @@ def _runit() -> tuple[int, str, str, float]:
 
 def resolve_roster(cfg: dict, mode: str, names: list[str] | None, host: str,
                    allow_free: bool = False, allow_logging: bool = False,
-                   explicit_custom_only: set[str] | None = None) -> tuple[list[str], list[tuple[str, str]]]:
-    """Return (final_roster, drop_reasons)."""
+                   explicit_custom_only: set[str] | None = None,
+                   explicit: bool = False) -> tuple[list[str], list[tuple[str, str]]]:
+    """Return (final_roster, drop_reasons).
+
+    explicit=True means the caller passed these exact names by hand
+    (dispatch/benchmark --roster): disabled and custom_only gates are waived
+    (explicit naming = intent) and host_rules `add` does not inject reviewers
+    the caller never asked for. host_rules `skip` and the tier/privacy gates
+    still apply — they guard external consequences, not reviewer health.
+    """
     explicit_custom_only = explicit_custom_only or set()
     reviewers = cfg["reviewers"]
     profiles = cfg["profiles"]
@@ -224,19 +305,24 @@ def resolve_roster(cfg: dict, mode: str, names: list[str] | None, host: str,
         base = list(profiles[cfg["defaults"]["profile"]]["members"])
 
     # Host adaptation
-    base = [n for n in base if n not in set(host_rules.get("skip", []))]
-    for n in host_rules.get("add", []):
-        if n not in base:
-            base.append(n)
-
+    skipped_by_host = set(host_rules.get("skip", []))
     drops: list[tuple[str, str]] = []
+    for n in base:
+        if n in skipped_by_host:
+            drops.append((n, f"host rule (running inside {host})"))
+    base = [n for n in base if n not in skipped_by_host]
+    if not explicit:
+        for n in host_rules.get("add", []):
+            if n not in base:
+                base.append(n)
+
     final = []
     for n in base:
         spec = reviewers.get(n)
         if not spec:
             drops.append((n, "not in registry"))
             continue
-        if spec.get("disabled"):
+        if spec.get("disabled") and not explicit:
             drops.append((n, "disabled in config"))
             continue
         if spec.get("tier") == "free" and not allow_free:
@@ -245,7 +331,7 @@ def resolve_roster(cfg: dict, mode: str, names: list[str] | None, host: str,
         if spec.get("privacy") == "LOGS" and not allow_logging:
             drops.append((n, "logs prompts (use --allow-logging)"))
             continue
-        if spec.get("custom_only") and n not in explicit_custom_only:
+        if spec.get("custom_only") and not explicit and n not in explicit_custom_only:
             drops.append((n, "custom-only (name via --custom/--models)"))
             continue
         final.append(n)
diff --git a/scripts/benchmark.py b/scripts/benchmark.py
index 1b9cf18..dfb3d32 100644
--- a/scripts/benchmark.py
+++ b/scripts/benchmark.py
@@ -17,19 +17,18 @@
 
 import argparse
 import asyncio
-import hashlib
 import json
 import os
 import sys
 import time
-from collections import defaultdict
 from pathlib import Path
 
 sys.path.insert(0, str(Path(__file__).resolve().parent))
 from _common import (
     load_config, build_prompt, extract_json, normalize_findings,
-    ARGUS_HOME, history_conn,
+    ARGUS_HOME, history_conn, resolve_roster,
 )
+from detect_host import detect as detect_host
 import adapters
 
 
@@ -122,6 +121,8 @@ async def _dispatch(name: str, spec: dict, prompt: str, timeout: int) -> dict:
     if adapter is None:
         return {"findings": [], "latency_sec": 0.0, "primary_latency_sec": 0.0,
                 "fallback_latency_sec": 0.0, "fallback_used": False,
+                "exit_code": 1, "primary_exit_code": 1, "primary_error": "",
+                "parse_error": False,
                 "error": f"no adapter for {primary.get('route')}"}
     r = await adapter.send(prompt, primary, timeout)
     primary_latency = r.get("latency_sec", 0.0)
@@ -140,10 +141,13 @@ async def _dispatch(name: str, spec: dict, prompt: str, timeout: int) -> dict:
                 fallback_used = True
                 r = r2
     findings = []
+    parse_error = False
     if r["exit_code"] == 0:
         parsed = extract_json(r["stdout"])
         if isinstance(parsed, dict):
             findings = normalize_findings(parsed.get("findings", []))
+        else:
+            parse_error = True
     total_latency = primary_latency + fallback_latency
     return {
         "findings": findings,
@@ -154,6 +158,7 @@ async def _dispatch(name: str, spec: dict, prompt: str, timeout: int) -> dict:
         "exit_code": r["exit_code"],
         "primary_exit_code": primary_exit,
         "primary_error": primary_err,
+        "parse_error": parse_error,
         "error": (r.get("stderr") or "")[:200] if r["exit_code"] != 0 else None,
     }
 
@@ -166,7 +171,7 @@ async def _bench_reviewer(name: str, spec: dict, fixtures: list[dict],
     wall_start = time.monotonic()
 
     for fx in fixtures:
-        prompt = build_prompt(fx["diff"])
+        prompt = fx["prompt"]
         run_data = []
         merged_keys: set[tuple[str, int]] = set()
         for idx in range(runs):
@@ -174,7 +179,7 @@ async def _bench_reviewer(name: str, spec: dict, fixtures: list[dict],
                 await _log_progress(f"{name:<16} WALL-CAP HIT at {int(time.monotonic()-wall_start)}s — skipping remaining runs")
                 run_data.append({
                     "run_idx": idx, "n_findings": 0, "latency_sec": 0.0,
-                    "exit_code": 137, "error": "wall-cap exceeded",
+                    "exit_code": 137, "parse_error": False, "error": "wall-cap exceeded",
                     "tp": 0, "fp": 0, "fn": len(fx["ground_truth"].get("issues", [])),
                     "precision": 0.0, "recall": 0.0, "f1": 0.0,
                 })
@@ -185,10 +190,11 @@ async def _bench_reviewer(name: str, spec: dict, fixtures: list[dict],
                     d = await _dispatch(name, spec, prompt, timeout)
             except Exception as e:
                 await _log_progress(f"{name:<16} {fx['name']:<18} run {idx+1}/{runs}  EXCEPTION: {type(e).__name__}: {str(e)[:80]}")
-                d = {"findings": [], "latency_sec": 0.0, "exit_code": 1, "error": f"{type(e).__name__}: {e}"}
-            if d.get("exit_code", 1) != 0:
-                # A failed call is not "correctly found nothing" — zero-score it
-                # so broken reviewers can't earn F1=1.0 on clean-baseline.
+                d = {"findings": [], "latency_sec": 0.0, "exit_code": 1,
+                     "parse_error": False, "error": f"{type(e).__name__}: {e}"}
+            if d.get("exit_code", 1) != 0 or d.get("parse_error"):
+                # A failed or unparseable call is not "correctly found nothing" —
+                # zero-score it so broken reviewers can't earn F1=1.0 on clean-baseline.
                 scored = {"tp": 0, "fp": 0, "fn": len(fx["ground_truth"].get("issues", [])),
                           "precision": 0.0, "recall": 0.0, "f1": 0.0}
             else:
@@ -197,8 +203,9 @@ async def _bench_reviewer(name: str, spec: dict, fixtures: list[dict],
                 "run_idx": idx,
                 "n_findings": len(d["findings"]),
                 "latency_sec": d["latency_sec"],
-                "exit_code": d["exit_code"],
-                "error": d["error"],
+                "exit_code": d.get("exit_code", 1),
+                "parse_error": d.get("parse_error", False),
+                "error": d.get("error"),
                 **scored,
             })
             for f in d["findings"]:
@@ -401,26 +408,24 @@ async def _main_async(args) -> int:
         sys.stderr.write("no fixtures found. Seed fixtures/ before running benchmark.\n")
         return 1
 
+    # Same policy engine as dispatch.py: --roster names are explicit (disabled/
+    # custom_only waived, no host_rules add); profiles get full policy.
+    host, _ = detect_host()
     if args.roster:
-        roster = [r.strip() for r in args.roster.split(",") if r.strip()]
-    elif args.profile:
-        roster = list(cfg["profiles"][args.profile]["members"])
+        names = [r.strip() for r in args.roster.split(",") if r.strip()]
+        roster, drops = resolve_roster(
+            cfg, "custom", names, host,
+            allow_free=args.allow_free, allow_logging=args.allow_logging,
+            explicit=True,
+        )
     else:
-        roster = list(cfg["profiles"]["panel"]["members"])
-
-    # Filter out reviewers not in registry; drop disabled ones unless the user
-    # named them explicitly via --roster (explicit naming = intent to test).
-    explicit = bool(args.roster)
-    kept = []
-    for n in roster:
-        if n not in cfg["reviewers"]:
-            print(f"skip {n}: not in registry", file=sys.stderr)
-            continue
-        if cfg["reviewers"][n].get("disabled") and not explicit:
-            print(f"skip {n}: disabled in config (name via --roster to force)", file=sys.stderr)
-            continue
-        kept.append(n)
-    roster = kept
+        profile = args.profile or "panel"
+        roster, drops = resolve_roster(
+            cfg, "profile", [profile], host,
+            allow_free=args.allow_free, allow_logging=args.allow_logging,
+        )
+    for n, reason in drops:
+        print(f"skip {n}: {reason}", file=sys.stderr)
 
     total_calls = len(roster) * len(fixtures) * args.runs
     print(f"Benchmarking {len(roster)} reviewers × {len(fixtures)} fixtures × {args.runs} runs = {total_calls} calls", file=sys.stderr)
@@ -487,7 +492,11 @@ async def _main_async(args) -> int:
     _PROGRESS_COUNTER["done"] = 0
     _PROGRESS_START = time.monotonic()
 
-    ts = args.benchmark_ts or time.strftime("%Y%m%dT%H%M%S")
+    # UTC so benchmarks.ts and runs.ts (also UTC) share one --since timeline.
+    ts = args.benchmark_ts or time.strftime("%Y%m%dT%H%M%S", time.gmtime())
+    # Prompts are reviewer-independent — build each fixture's once.
+    for fx in fixtures:
+        fx["prompt"] = build_prompt(fx["diff"])
     sem = asyncio.Semaphore(max_parallel)
     tasks = [_bench_reviewer(n, cfg["reviewers"][n], fixtures, args.runs, timeout, sem,
                               ts=ts, max_wall_sec=args.max_wall_sec) for n in roster]
@@ -527,6 +536,8 @@ def main() -> int:
     ap.add_argument("--max-wall-sec", type=int, default=600, help="hard wall-time cap per reviewer (default 600s)")
     ap.add_argument("--yes-cost", action="store_true", help="bypass benchmark cost block + OR balance gate (env: ARGUS_YES_COST=1)")
     ap.add_argument("--skip-balance-check", action="store_true", help="skip the OpenRouter balance pre-flight")
+    ap.add_argument("--allow-free", action="store_true", help="include free-tier reviewers")
+    ap.add_argument("--allow-logging", action="store_true", help="include reviewers with privacy: LOGS")
     args = ap.parse_args()
     return asyncio.run(_main_async(args))
 
diff --git a/scripts/dispatch.py b/scripts/dispatch.py
index eb1a141..0e7ca8a 100644
--- a/scripts/dispatch.py
+++ b/scripts/dispatch.py
@@ -94,13 +94,13 @@ async def _main_async(args) -> int:
 
     names = [r.strip() for r in args.roster.split(",") if r.strip()]
     host, _ = detect_host()
-    # Explicitly-named reviewers count as explicit_custom_only so custom_only
-    # entries stay usable; disabled/tier/privacy/host_rules policies still apply.
+    # --roster names are explicit: disabled/custom_only are waived and
+    # host_rules `add` is not applied; host `skip` and tier/privacy still hold.
     roster, drops = resolve_roster(
         cfg, "custom", names, host,
         allow_free=bool(args.allow_free or defaults.get("allow_free")),
         allow_logging=bool(args.allow_logging or defaults.get("allow_logging")),
-        explicit_custom_only=set(names),
+        explicit=True,
     )
     for n, reason in drops:
         sys.stderr.write(f"roster: dropped {n} — {reason}\n")
@@ -120,9 +120,8 @@ async def _main_async(args) -> int:
 
     async def _bounded(name: str) -> dict:
         async with sem:
-            spec = cfg["reviewers"].get(name)
-            if not spec:
-                return {"name": name, "error": "unknown reviewer"}
+            # resolve_roster guarantees roster ⊆ registry
+            spec = cfg["reviewers"][name]
 
             # Context-window pre-check
             ctx = int(spec.get("ctx", 0) or 0)
@@ -155,6 +154,7 @@ async def _bounded(name: str) -> dict:
 
     summary = {
         "roster": roster,
+        "dropped": {n: reason for n, reason in drops},
         "prompt_tokens_est": prompt_tokens,
         "reviewers": {
             r["name"]: {
diff --git a/scripts/estimate_cost.py b/scripts/estimate_cost.py
index d3bfd1e..e7a8796 100644
--- a/scripts/estimate_cost.py
+++ b/scripts/estimate_cost.py
@@ -12,6 +12,7 @@
 
 import argparse
 import json
+import os
 import sys
 from pathlib import Path
 
@@ -43,7 +44,8 @@ def main() -> int:
     unknown = [n for n in roster if n not in cfg["reviewers"]]
     if unknown:
         sys.stderr.write(
-            f"ERROR: unknown reviewer(s): {', '.join(unknown)}. "
+            f"INVALID ROSTER (not a cost block — --yes-cost will not help): "
+            f"unknown reviewer(s): {', '.join(unknown)}. "
             f"Known: {', '.join(sorted(cfg['reviewers']))}\n"
         )
         return 2
@@ -106,11 +108,10 @@ def main() -> int:
             or (cfg["reviewers"].get(name, {}).get("fallback", {}).get("client") == "openrouter")
             for name in roster
         )
-        import os as _os
-        if uses_openrouter and _os.environ.get("OPENROUTER_API_KEY"):
+        if uses_openrouter and os.environ.get("OPENROUTER_API_KEY"):
             try:
                 from or_balance import probe
-                info = probe(_os.environ["OPENROUTER_API_KEY"])
+                info = probe(os.environ["OPENROUTER_API_KEY"])
                 available = info.get("available_usd")
                 safety = float(d.get("or_balance_safety_factor", 2.0))
                 if available is not None and available < total * safety:

From 3cc58fbcc26d2fbdfa0fca5471c77ad26d71f018 Mon Sep 17 00:00:00 2001
From: Jim Stratus <stratus@eotir.com>
Date: Wed, 10 Jun 2026 02:15:07 -0700
Subject: [PATCH 08/14] fix(bench): zero-score unparseable output; no-adapter
 KeyError; UTC ts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A reviewer exiting 0 with unparseable prose scored F1=1.0 on clean-baseline
through the parse-failure path the round-1 gate missed — _dispatch now
records parse_error, and both the scoring gate and aggregate_bench's
_rescore_run treat it as a failed call. The no-adapter early return gains
exit_code so run_data.append no longer KeyErrors (which made misconfigured
reviewers vanish from the leaderboard as fatal stubs). benchmarks.ts is now
UTC so --since shares one timeline with runs.ts; fixture prompts are built
once instead of per reviewer.

Confidence: high
Scope-risk: narrow
Not-tested: pre-existing benchmarks rows keep local-time ts (error bounded by UTC offset)
---
 scripts/aggregate_bench.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/aggregate_bench.py b/scripts/aggregate_bench.py
index 23823ff..fb96361 100644
--- a/scripts/aggregate_bench.py
+++ b/scripts/aggregate_bench.py
@@ -26,10 +26,10 @@ def _rescore_run(run: dict) -> dict:
     """Recompute P/R/F1 from tp/fp/fn with the clean-baseline rule (repairs pre-fix data).
 
     Clean-baseline rule: tp==fp==fn==0 → P=R=F1=1.0 (reviewer correctly found nothing).
-    Failed calls (non-zero exit_code or an error, e.g. wall-cap stubs) are
+    Failed calls (non-zero exit_code, an error, or unparseable output) are
     zero-scored — tp==fp==fn==0 there means "never ran", not "found nothing".
     """
-    if int(run.get("exit_code", 0) or 0) != 0 or run.get("error"):
+    if int(run.get("exit_code", 0) or 0) != 0 or run.get("error") or run.get("parse_error"):
         run["precision"], run["recall"], run["f1"] = 0.0, 0.0, 0.0
         return run
     tp = int(run.get("tp", 0) or 0)

From f560d7c339b9f52a2452bc81b8c543d1da530731 Mon Sep 17 00:00:00 2001
From: Jim Stratus <stratus@eotir.com>
Date: Wed, 10 Jun 2026 02:15:07 -0700
Subject: [PATCH 09/14] fix(common): kill process tree on timeout; O(n)
 extract_json fallback; cache config/template/which
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

run_subprocess now runs children in their own session and kills the whole
tree on timeout (killpg on POSIX, taskkill /T /F on Windows) — the root
cause behind the disabled gemini reviewer's .cmd-shim hang; verified by a
test that confirms the orphaned grandchild dies. extract_json's last-resort
fallback is one string-aware pass collecting balanced spans (was O(n²) —
measured 19s on 24KB of brace garbage; now bounded at 50 parse attempts).
load_config, the prompt template read, and shutil.which are lru_cached
(~36ms YAML parse per adapter call removed; no caller mutates the cfg dict).

Constraint: load_config returns a shared dict — treat as read-only
Confidence: high
Scope-risk: moderate
Directive: gemini reviewer stays disabled until the tree-kill is verified on Windows
Not-tested: taskkill path (Windows); span-capped fallback on >50 decoy spans before the real object
---
 tests/test_extract_json.py   | 18 ++++++++
 tests/test_run_subprocess.py | 82 ++++++++++++++++++++++++++++++++++++
 2 files changed, 100 insertions(+)
 create mode 100644 tests/test_run_subprocess.py

diff --git a/tests/test_extract_json.py b/tests/test_extract_json.py
index e5c3cfe..0185a52 100644
--- a/tests/test_extract_json.py
+++ b/tests/test_extract_json.py
@@ -74,6 +74,22 @@ def test_escaped_quote_inside_string_value():
     assert "x}" in r["findings"][0]["description"]
 
 
+def test_inner_object_recovered_when_outer_braces_invalid():
+    """Last-resort span scan finds a nested valid object inside brace garbage."""
+    text = 'junk { junk {"a": 1} junk }'
+    assert extract_json(text) == {"a": 1}
+
+
+def test_brace_garbage_returns_none_fast():
+    """O(n) span fallback: brace-heavy non-JSON must not take quadratic time."""
+    import time
+    garbage = ("{ x " * 4000) + ("} y " * 4000)
+    t0 = time.monotonic()
+    r = extract_json(garbage)
+    assert time.monotonic() - t0 < 2.0
+    assert r is None
+
+
 def test_normalize_findings_clamps_unknown_severity():
     """Off-enum severities map to medium so no consumer silently drops them."""
     raw = [
@@ -97,6 +113,8 @@ def test_normalize_findings_clamps_unknown_severity():
         test_nested_objects,
         test_brace_inside_string_value,
         test_escaped_quote_inside_string_value,
+        test_inner_object_recovered_when_outer_braces_invalid,
+        test_brace_garbage_returns_none_fast,
         test_normalize_findings_clamps_unknown_severity,
     ]
     failed = 0
diff --git a/tests/test_run_subprocess.py b/tests/test_run_subprocess.py
new file mode 100644
index 0000000..c1d972a
--- /dev/null
+++ b/tests/test_run_subprocess.py
@@ -0,0 +1,82 @@
+"""Unit tests for scripts/_common.py::run_subprocess — timeout + tree-kill."""
+from __future__ import annotations
+
+import asyncio
+import os
+import shutil
+import sys
+import time
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "scripts"))
+
+from _common import run_subprocess  # noqa: E402
+
+HAS_BASH = shutil.which("bash") is not None
+
+
+def test_success_passes_stdin_and_captures_stdout():
+    if not HAS_BASH:
+        return
+    rc, out, err, dt = asyncio.run(run_subprocess(["bash", "-c", "cat"], "hello", 10))
+    assert rc == 0
+    assert out == "hello"
+
+
+def test_timeout_returns_124_promptly():
+    if not HAS_BASH:
+        return
+    t0 = time.monotonic()
+    rc, out, err, dt = asyncio.run(run_subprocess(["bash", "-c", "sleep 30"], "", 1))
+    assert rc == 124
+    assert "timeout" in err
+    assert time.monotonic() - t0 < 10  # killed, not waited out
+
+
+def test_timeout_kills_grandchildren():
+    """A shell that spawns a sleeping child must not leave the child running
+    after timeout — the .cmd-shim hang that got the gemini reviewer disabled."""
+    if not HAS_BASH or os.name == "nt":
+        return
+    import tempfile
+    pidfile = tempfile.NamedTemporaryFile(suffix=".pid", delete=False).name
+    try:
+        rc, out, err, dt = asyncio.run(
+            run_subprocess(["bash", "-c", f"sleep 30 & echo $! > {pidfile}; wait"], "", 1)
+        )
+        assert rc == 124
+        child_pid = int(Path(pidfile).read_text().strip())
+        time.sleep(0.5)  # let init reap the reparented child
+        try:
+            os.kill(child_pid, 0)
+            alive = Path(f"/proc/{child_pid}/stat").read_text().split()[2] != "Z"
+        except (ProcessLookupError, FileNotFoundError):
+            alive = False
+        assert not alive, f"grandchild {child_pid} survived the tree kill"
+    finally:
+        os.unlink(pidfile)
+
+
+def test_command_not_found_returns_127():
+    rc, out, err, dt = asyncio.run(run_subprocess(["argus-no-such-binary-xyz"], "", 5))
+    assert rc == 127
+
+
+if __name__ == "__main__":
+    import traceback
+    tests = [
+        test_success_passes_stdin_and_captures_stdout,
+        test_timeout_returns_124_promptly,
+        test_timeout_kills_grandchildren,
+        test_command_not_found_returns_127,
+    ]
+    failed = 0
+    for t in tests:
+        try:
+            t()
+            print(f"PASS  {t.__name__}")
+        except Exception as e:
+            failed += 1
+            print(f"FAIL  {t.__name__}: {type(e).__name__}: {e}")
+            traceback.print_exc()
+    sys.exit(failed)

From c7b940802bee7d6a20f031f2e311adf4ec7e123c Mon Sep 17 00:00:00 2001
From: Jim Stratus <stratus@eotir.com>
Date: Wed, 10 Jun 2026 02:15:07 -0700
Subject: [PATCH 10/14] chore: dead-code sweep; config-driven merge line
 tolerance; argv[0]-scoped host detection
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

merge.py: drop unused uuid import and the phantom cfg param on
_record_history; replace the max(-key) double negation with min. benchmark
dead imports removed in the roster commit. detect_host: NAME_HINTS flattened
to a tuple, and the process walk matches argv[0]/argv[1] basenames instead
of the full cmdline — a wrapping shell containing "--roster codex" no longer
detects as a codex host. merge clustering tolerance is now
defaults.merge_line_tolerance / --line-tolerance, matching its sibling knobs.

Confidence: high
Scope-risk: narrow
---
 config.yaml            |  3 ++-
 scripts/detect_host.py | 23 +++++++++++------------
 scripts/merge.py       | 11 ++++++-----
 3 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/config.yaml b/config.yaml
index 9d79643..7bf76fb 100644
--- a/config.yaml
+++ b/config.yaml
@@ -23,6 +23,7 @@ defaults:
   allow_logging: false            # require --allow-logging to include privacy: LOGS
   confidence_threshold: 80        # drop findings with effective confidence < this
   corroboration_boost: 15         # confidence bonus when 2+ reviewers agree on same file:line
+  merge_line_tolerance: 3         # cluster findings within +/- this many lines (merge.py)
   ctx_safety_ratio: 0.70          # skip reviewer if prompt_tokens > ctx * this
   reviewer_timeout_sec: 360       # bumped from 180 on 2026-05-06 after a 4-doc
                                   # ~80K-token bundle had 5/7 reviewers timeout.
@@ -141,7 +142,7 @@ reviewers:
     ctx: 2000000
     tier: paid
     privacy: ok
-    disabled: true      # on hold — use gemini-or instead
+    disabled: true      # on hold — run_subprocess now tree-kills on timeout; re-test on Windows before re-enabling (use gemini-or meanwhile)
     cost_per_m: null
     primary:  { route: gemini-cli }
     fallback: { route: aichat, client: openrouter, model: "google/gemini-3.1-pro-preview" }
diff --git a/scripts/detect_host.py b/scripts/detect_host.py
index 2a3798b..256cb23 100644
--- a/scripts/detect_host.py
+++ b/scripts/detect_host.py
@@ -11,12 +11,7 @@
 import sys
 
 
-NAME_HINTS = {
-    "claude": ["claude"],
-    "codex":  ["codex"],
-    "gemini": ["gemini"],
-    "opencode": ["opencode"],
-}
+HOSTS = ("claude", "codex", "gemini", "opencode")
 
 
 def detect() -> tuple[str, list[str]]:
@@ -51,12 +46,16 @@ def detect() -> tuple[str, list[str]]:
         p = psutil.Process(os.getppid())
         for _ in range(8):
             name = (p.name() or "").lower()
-            cmdline = " ".join(p.cmdline()).lower() if p.cmdline() else ""
-            for host, hints in NAME_HINTS.items():
-                for h in hints:
-                    if h in name or h in cmdline:
-                        signals.append(f"proc:{name or cmdline}")
-                        return host, signals
+            # Match the executable and its script (argv[0]/argv[1] basenames,
+            # covering `node /path/gemini.js`), never the full cmdline —
+            # arguments like `--roster codex` in a wrapping shell must not
+            # read as a codex host.
+            cmdline = p.cmdline() or []
+            heads = " ".join(os.path.basename(a).lower() for a in cmdline[:2])
+            for host in HOSTS:
+                if host in name or host in heads:
+                    signals.append(f"proc:{name or heads}")
+                    return host, signals
             parent = p.parent()
             if parent is None:
                 break
diff --git a/scripts/merge.py b/scripts/merge.py
index 92878e2..abe56b4 100644
--- a/scripts/merge.py
+++ b/scripts/merge.py
@@ -10,7 +10,6 @@
 import hashlib
 import json
 import sys
-import uuid
 from collections import defaultdict
 from datetime import datetime, timezone
 from pathlib import Path
@@ -28,7 +27,7 @@ def _emit_cluster(file_path: str, cluster: list[dict], threshold: int, boost: in
     effective = min(100, max_conf + boost) if len(reviewers) >= 2 else max_conf
     if effective < threshold:
         return None
-    worst = max(cluster, key=lambda it: -SEVERITY_RANK.get(it.get("severity", "medium"), 3))
+    worst = min(cluster, key=lambda it: SEVERITY_RANK.get(it.get("severity", "medium"), 3))
     lines = sorted(int(it.get("line", 0) or 0) for it in cluster)
     anchor_line = lines[len(lines) // 2]  # median
     # Aggregate descriptions (cluster may merge independently-found issues)
@@ -204,7 +203,7 @@ def _to_gsd(result: dict, run_dir: Path) -> str:
     return "\n".join(lines)
 
 
-def _record_history(result: dict, run_dir: Path, cfg: dict) -> None:
+def _record_history(result: dict, run_dir: Path) -> None:
     """Append the run and its findings to history.db."""
     conn = None
     try:
@@ -268,12 +267,14 @@ def main() -> int:
     ap.add_argument("--run-dir", required=True)
     ap.add_argument("--threshold", type=int, default=None)
     ap.add_argument("--boost", type=int, default=None)
+    ap.add_argument("--line-tolerance", type=int, default=None)
     ap.add_argument("--output", default="md", choices=["md", "json", "gsd"])
     args = ap.parse_args()
 
     cfg = load_config()
     threshold = args.threshold if args.threshold is not None else cfg["defaults"]["confidence_threshold"]
     boost = args.boost if args.boost is not None else cfg["defaults"]["corroboration_boost"]
+    tolerance = args.line_tolerance if args.line_tolerance is not None else int(cfg["defaults"].get("merge_line_tolerance", 3))
 
     run_dir = Path(args.run_dir)
     reviews_dir = run_dir / "reviews"
@@ -282,7 +283,7 @@ def main() -> int:
         return 1
 
     reviewer_results = _load_reviewer_results(reviews_dir)
-    result = _merge(reviewer_results, threshold, boost)
+    result = _merge(reviewer_results, threshold, boost, line_tolerance=tolerance)
 
     # Always write metrics.json + merged.md; honor --output for stdout
     (run_dir / "metrics.json").write_text(
@@ -303,7 +304,7 @@ def main() -> int:
     else:
         print(md)
 
-    _record_history(result, run_dir, cfg)
+    _record_history(result, run_dir)
     return 0
 
 

From 9006faf8671f4f62c06f73fb266d9dad1d0aced2 Mon Sep 17 00:00:00 2001
From: Jim Stratus <stratus@eotir.com>
Date: Wed, 10 Jun 2026 02:15:07 -0700
Subject: [PATCH 11/14] docs: align README/DEVELOPMENT/CLAUDE/SKILL with fixed
 behavior
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ARG_MAX gotcha marked fixed (all adapters pipe stdin); tree-kill gotcha
updated to point at the run_subprocess fix with the Windows re-test caveat;
CLAUDE.md no longer claims exact-file:line dedupe or lists ±3 bucketing as a
v2 idea (it exists — the doc was instructing future agents to re-implement
working code); host-CLI awareness section documents the explicit-roster
semantics; SKILL.md exit-code note distinguishes cost BLOCK from invalid
roster.

Confidence: high
Scope-risk: narrow
---
 CLAUDE.md      | 24 +++++++++++++-----------
 DEVELOPMENT.md |  4 ++--
 README.md      | 18 +++++++++---------
 SKILL.md       |  6 ++++--
 4 files changed, 28 insertions(+), 24 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index c1a9c74..4172941 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -119,15 +119,21 @@ py -3.12 "$ARGUS_HOME/scripts/benchmark.py" --runs 3 --profile panel
 
 ### Host-CLI awareness
 `detect_host.py` inspects env + parent process tree. Returns one of
-`claude | codex | gemini | opencode | unknown`. `host_rules` in config.yaml
-removes the matching reviewer from the roster and (for non-claude hosts) adds
-`claude` in. **When running inside Claude Code, the `claude` reviewer is
-auto-skipped** — Claude's nested-invocation policy would block it anyway.
+`claude | codex | gemini | opencode | unknown`. `resolve_roster` (called by
+dispatch.py and benchmark.py) applies `host_rules` from config.yaml: the
+matching host's reviewer is always skipped (**inside Claude Code the `claude`
+reviewer is auto-skipped** — the nested-invocation policy would block it).
+`--roster` names are explicit: disabled/custom_only gates are waived and
+host_rules `add` does not inject extra reviewers; profile-based rosters get
+the full policy (disabled, tier, privacy, custom_only, host add).
 
 ### Confidence filter + corroboration
 - Drop findings with effective confidence < 80.
-- Bonus: +15 points when ≥2 reviewers flag the same `file:line` (capped at 100).
-- Merge dedupes by exact `file:line`. Nearby-line corroboration is NOT done today — reviewers often disagree on line numbers by ±3 within the same diff hunk; this under-counts corroboration. Candidate v2 fix: bucket by file + ±3-line range.
+- Bonus: +15 points when ≥2 reviewers flag the same issue (capped at 100).
+- Merge clusters findings by file + anchor-based ±3-line proximity
+  (`defaults.merge_line_tolerance`, `--line-tolerance`); the reported line is
+  the cluster median. Do NOT re-implement "±3 bucketing" — it already exists
+  in `merge.py:_merge`.
 
 ### Cost gates
 - Review: warn $0.50, hard block $2.00 — overridable with `--yes-cost`
@@ -163,9 +169,6 @@ auto-skipped** — Claude's nested-invocation policy would block it anyway.
   slow expensive frontier when F1 is comparable.
 
 ## v2 ideas (captured for later)
-- **Nearby-line corroboration in merge.** Currently bucket by exact
-  `file:line`; reviewers often differ by ±3 within the same hunk. Bucket
-  by file + ±3-line range to increase corroboration accuracy.
 - **User-action capture.** After merged.md is shown, collect accepted/rejected
   per finding into history.db. Use this for usage-driven preference weights.
 - **`--refresh` command.** Query OpenRouter `/models` and diff against pinned
@@ -180,9 +183,8 @@ auto-skipped** — Claude's nested-invocation policy would block it anyway.
 | MiniMax-M2.7 direct → works (slow, 20s for fixture) | — | — |
 | Kimi `KIMI_API_KEY` is consumer-only | forced OR primary (`moonshotai/kimi-k2.5`) | need Moonshot Platform dev key for k2.6-preview |
 | Nous Hermes direct → `NOUSRESEARCH_API_KEY` not in env | OR fallback works | set env var if wanted |
-| Gemini CLI slow (60-70s/call) | works, use sparingly for quick reviews | no fix — CLI startup time |
+| Gemini CLI disabled (timeout hang) | run_subprocess now tree-kills on timeout | re-test on Windows, then re-enable in config |
 | OpenCode CLI slow (42+s/call) | works, barely fits 45s verify timeout | set longer timeout in dispatch (already 180s) |
-| Merge doesn't apply line-tolerance to corroboration | under-counts corroborated findings | v2: ±3-line bucketing |
 | Fixtures only cover 4 scenarios | leaderboard is noisy at N=4 | author more fixtures (auth bypass, XSS, null-deref, async races, TOCTOU, etc.) |
 
 ## Reviewer recommendations (initial, pre-benchmark)
diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md
index f2eaa03..920ad7d 100644
--- a/DEVELOPMENT.md
+++ b/DEVELOPMENT.md
@@ -185,7 +185,7 @@ python scripts/estimate_cost.py --roster "glm-5.1,minimax-m2.7,gemini-or,codex"
 
 ## Known Gotchas
 
-- **Windows .cmd shim tree-kill**: Node CLIs (gemini, copilot) don't tree-kill children on subprocess timeout. Use `gemini-or` instead.
+- **Windows .cmd shim tree-kill**: fixed — `run_subprocess` kills the whole process tree on timeout (killpg on POSIX, `taskkill /T /F` on Windows). gemini-direct stays disabled until re-tested on Windows; `gemini-or` remains the default route.
 - **OpenRouter reasoning providers**: `z-ai/glm-5.1` and `minimax/minimax-m2.7` may route to providers returning `{content: null}`. Mitigation: aichat patch applies reasoning-exclude + provider-ignore.
-- **Argv length on Windows (~32KB)**: gemini-cli and copilot-cli embed prompts as CLI args. Diffs >30KB fail. Use `--files` to scope.
+- **Argv length on Windows (~32KB)**: fixed — all CLI adapters pipe the prompt via stdin; no adapter embeds it in argv.
 - **Full-codebase audit prompt mismatch**: Default prompt optimized for PR review. Use `--overlay audit` for empty-tree→HEAD diffs.
diff --git a/README.md b/README.md
index e325c4e..4b9826a 100644
--- a/README.md
+++ b/README.md
@@ -479,13 +479,13 @@ argus/
 
 ```
 ┌──────────────────────────────────────────────────────────────────────────┐
-│  Windows .cmd shim tree-kill problem                                     │
+│  Windows .cmd shim tree-kill problem (FIXED — pending Windows re-test)   │
 │  ─────────────────────────────────────                                   │
-│  Node CLIs (gemini, copilot) don't tree-kill children on subprocess      │
-│  timeout. Causes zombie node.exe processes.                              │
+│  Node CLIs spawn a node child that used to survive subprocess timeout.   │
+│  run_subprocess now kills the whole process tree (killpg / taskkill /T). │
 │                                                                          │
-│  Mitigation: gemini-direct disabled by default; use gemini-or instead    │
-│  (same family via OpenRouter).                                           │
+│  gemini-direct stays disabled until re-tested on Windows; gemini-or      │
+│  (same family via OpenRouter) remains the default route.                 │
 ├──────────────────────────────────────────────────────────────────────────┤
 │  OpenRouter reasoning-provider trap                                      │
 │  ──────────────────────────────────                                      │
@@ -495,12 +495,12 @@ argus/
 │  Mitigation: aichat patch applies reasoning-exclude + provider-ignore.   │
 │  config.yaml: aichat_clients.openrouter.patch.chat_completions           │
 ├──────────────────────────────────────────────────────────────────────────┤
-│  Argv length on Windows (~32KB)                                          │
+│  Argv length on Windows (~32KB) — FIXED                                  │
 │  ──────────────────────────────                                          │
-│  gemini-cli and copilot-cli embed the prompt as a command-line arg.      │
-│  Diffs > 30KB fail.                                                      │
+│  All CLI adapters now pipe the prompt via stdin; no adapter embeds it    │
+│  as a command-line arg anymore, so large diffs no longer hit ARG_MAX.    │
 │                                                                          │
-│  Mitigation: use --files to scope, or rely on aichat adapters (stdin).   │
+│  (Historical: gemini-cli fixed 2026-05-06, copilot-cli fixed 2026-06.)   │
 ├──────────────────────────────────────────────────────────────────────────┤
 │  Full-codebase audit prompt mismatch                                     │
 │  ───────────────────────────────────                                     │
diff --git a/SKILL.md b/SKILL.md
index 3b9015a..e491e3f 100644
--- a/SKILL.md
+++ b/SKILL.md
@@ -97,8 +97,10 @@ If diff is empty after filtering, abort.
 python "$ARGUS_HOME/scripts/estimate_cost.py" --roster "$ROSTER" --diff "$RUN_DIR/diff.patch"
 ```
 
-Exit codes: 0 OK, 1 WARN (≥ warn), 2 BLOCK (≥ block).
-On BLOCK without `--yes-cost`: stop. Report estimate to user.
+Exit codes: 0 OK, 1 WARN (≥ warn threshold or low OpenRouter balance),
+2 BLOCK (≥ block threshold) **or invalid roster** — check stderr to tell them
+apart; `--yes-cost` downgrades a cost BLOCK to WARN but cannot fix an invalid
+roster. On BLOCK without `--yes-cost`: stop. Report estimate to user.
 
 If `--dry-run`: stop here and print roster + estimate. Do not dispatch.
 

From a36304b4dd4b0a9104dce74d65883ae81631e363 Mon Sep 17 00:00:00 2001
From: Jim Stratus <stratus@eotir.com>
Date: Wed, 10 Jun 2026 06:40:36 -0700
Subject: [PATCH 12/14] =?UTF-8?q?fix(review):=20address=20Copilot=20PR=20c?=
 =?UTF-8?q?omments=20=E2=80=94=20ts=20suffix=20boundary,=20exception=20stu?=
 =?UTF-8?q?b=20artifact?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

stats.py --since now truncates both comparison sides to the common
YYYYMMDDTHHMMSS prefix; a timezone suffix on either the row or the cutoff
no longer skews exact-equality boundaries. dispatch.py writes
reviews/<name>.json for gather-level exception stubs too, preserving the
every-roster-entry-produces-an-artifact contract merge.py depends on.

Confidence: high
Scope-risk: narrow
---
 scripts/dispatch.py | 13 +++++++++----
 scripts/stats.py    | 10 ++++++----
 2 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/scripts/dispatch.py b/scripts/dispatch.py
index 0e7ca8a..a10a45c 100644
--- a/scripts/dispatch.py
+++ b/scripts/dispatch.py
@@ -147,10 +147,15 @@ async def _bounded(name: str) -> dict:
     results = []
     for n, r in zip(roster, gathered):
         if isinstance(r, BaseException):
-            results.append({"name": n, "findings": [], "exit_code": 1,
-                            "error": f"{type(r).__name__}: {r}"})
-        else:
-            results.append(r)
+            r = {"name": n, "findings": [], "exit_code": 1,
+                 "error": f"{type(r).__name__}: {r}"}
+            # Keep the contract that every roster entry produces a
+            # reviews/<name>.json — merge.py only reads those files.
+            try:
+                (reviews_dir / f"{n}.json").write_text(json.dumps(r, indent=2), encoding="utf-8")
+            except Exception:
+                pass
+        results.append(r)
 
     summary = {
         "roster": roster,
diff --git a/scripts/stats.py b/scripts/stats.py
index 54f4d41..4eec45e 100644
--- a/scripts/stats.py
+++ b/scripts/stats.py
@@ -32,10 +32,12 @@ def main() -> int:
     params: list = []
     if args.since:
         # runs.ts is dashed ISO-8601 ('2026-06-10T12:00:00+00:00') while
-        # benchmarks.ts is compact ('20260610T120000'); strip '-' and ':' from
-        # both sides so the lexical compare is valid for either format.
-        where = "WHERE REPLACE(REPLACE(ts, '-', ''), ':', '') >= ?"
-        params = [args.since.replace("-", "").replace(":", "")]
+        # benchmarks.ts is compact ('20260610T120000'); strip '-' and ':' and
+        # truncate both sides to the common YYYYMMDDTHHMMSS prefix so neither
+        # a timezone suffix on the row nor one on the cutoff skews the
+        # lexical compare at exact-equality boundaries.
+        where = "WHERE SUBSTR(REPLACE(REPLACE(ts, '-', ''), ':', ''), 1, 15) >= ?"
+        params = [args.since.replace("-", "").replace(":", "")[:15]]
 
     # Per-reviewer aggregates from reviewer_runs
     cur.execute(f"""

From cd30312bb89a77ee4562f0e9a9a9a3d942167219 Mon Sep 17 00:00:00 2001
From: Jim Stratus <stratus@eotir.com>
Date: Wed, 10 Jun 2026 06:54:36 -0700
Subject: [PATCH 13/14] docs: accuracy pass + diagrams across all five
 documents

Fixes remaining stale claims: DEVELOPMENT.md and SKILL.md still described
exact-file:line dedupe (merge clusters by anchor-based tolerance); the
roster table's gemini/copilot/claude notes predated the stdin and tree-kill
fixes; --timeout documented the old 180s default; the cost-flow chart drew
OR-balance as a block in review mode (it warns); SKILL.md roster resolution
was missing the disabled filter and the explicit-roster semantics;
CLAUDE.md layout comments showed prompt-as-arg invocations.

Adds: subagent-per-reviewer sequence diagram and leaderboard F1 bar chart
(README); roster-policy matrix, subprocess-layer notes, benchmark-scoring
flowchart, and history.db ER diagram (DEVELOPMENT.md); pytest/CI coverage
in DEVELOPMENT.md, CONTRIBUTING.md, and the layout trees; estimate_cost
exit-code reference. All 7 mermaid diagrams validated against the renderer.

Confidence: high
Scope-risk: narrow
---
 CLAUDE.md       |  11 ++--
 CONTRIBUTING.md |   9 +++
 DEVELOPMENT.md  | 148 +++++++++++++++++++++++++++++++++++++++++++++---
 README.md       |  85 ++++++++++++++++++++++-----
 SKILL.md        |  13 ++++-
 5 files changed, 239 insertions(+), 27 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index 4172941..be5811a 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -37,11 +37,11 @@ argus/
 │   └── adapters/
 │       ├── __init__.py        # ROUTE_ADAPTERS registry
 │       ├── aichat.py          # universal aichat wrapper (all 8 provider routes)
-│       ├── gemini_cli.py      # gemini --yolo -p "<prompt>"
+│       ├── gemini_cli.py      # gemini --yolo -p ""  (prompt via stdin)
 │       ├── codex_cli.py       # codex exec --skip-git-repo-check -  (stdin)
 │       ├── claude_cli.py      # claude -p --output-format text --bare  (stdin, nested-from-claude blocked)
 │       ├── opencode_cli.py    # opencode run -  (stdin)
-│       └── copilot_cli.py     # copilot -p "<prompt>" --model X --allow-all-tools
+│       └── copilot_cli.py     # copilot -p "<stdin ptr>" --model X  (prompt via stdin)
 ├── fixtures/                  # benchmark inputs (diff.patch + ground-truth.json pairs)
 │   ├── sql-injection/
 │   ├── race-refund/
@@ -143,7 +143,10 @@ the full policy (disabled, tier, privacy, custom_only, host add).
 ### JSON extraction
 `_common.extract_json` handles: raw JSON, `<think>` reasoning blocks
 (Qwen3 / DeepSeek-R1 style), fenced code blocks, and embedded objects by
-"findings" or "ok" key. Every aichat reviewer we tested returns usable JSON.
+"findings" or "ok" key. The balanced-brace scanners track string/escape
+state (braces inside descriptions are safe) and the last-resort fallback is
+a single O(n) pass capped at 50 parse attempts. Every aichat reviewer we
+tested returns usable JSON.
 
 ## Benchmark run protocol (learned from first full bench)
 
@@ -184,7 +187,7 @@ the full policy (disabled, tier, privacy, custom_only, host add).
 | Kimi `KIMI_API_KEY` is consumer-only | forced OR primary (`moonshotai/kimi-k2.5`) | need Moonshot Platform dev key for k2.6-preview |
 | Nous Hermes direct → `NOUSRESEARCH_API_KEY` not in env | OR fallback works | set env var if wanted |
 | Gemini CLI disabled (timeout hang) | run_subprocess now tree-kills on timeout | re-test on Windows, then re-enable in config |
-| OpenCode CLI slow (42+s/call) | works, barely fits 45s verify timeout | set longer timeout in dispatch (already 180s) |
+| OpenCode CLI slow (42+s/call) | works, barely fits 45s verify timeout | dispatch timeout already 360s |
 | Fixtures only cover 4 scenarios | leaderboard is noisy at N=4 | author more fixtures (auth bypass, XSS, null-deref, async races, TOCTOU, etc.) |
 
 ## Reviewer recommendations (initial, pre-benchmark)
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index f2e3e4d..f462ea1 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -24,11 +24,20 @@ New fixtures are valuable — they sharpen the benchmark signal.
 - Include `scripts/verify.py --json --roster <name>` output for reachability issues
 - Report `aichat --version`, Python version, OS
 
+## Tests
+
+- `python -m pytest tests/ -q` must pass — CI runs it on every push/PR.
+  Tests need no network or API keys.
+- Bug fixes in `_common.py` (extract_json, run_subprocess, merge scoring)
+  should come with a regression test alongside the existing ones in `tests/`.
+
 ## Code style
 
 - Python 3.12+. Type hints where they clarify; not exhaustive.
 - No new dependencies without justification. pyyaml + psutil is the current bar.
 - Shell scripts must work on both Git Bash (Windows) and POSIX.
+- Roster policy lives only in `_common.resolve_roster` — don't add inline
+  roster filters to individual scripts.
 
 ## Don't break
 
diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md
index 920ad7d..73d71ab 100644
--- a/DEVELOPMENT.md
+++ b/DEVELOPMENT.md
@@ -75,6 +75,8 @@ argus/
 │       ├── opencode_cli.py
 │       └── copilot_cli.py
 ├── fixtures/                   # Benchmark inputs (diff.patch + ground-truth.json)
+├── tests/                      # Unit tests (pytest; run in CI)
+├── .github/workflows/ci.yml    # GitHub Actions — pytest on push/PR
 ├── runs/                       # Per-invocation artifacts (gitignored)
 ├── benchmarks/                 # Leaderboard outputs (gitignored)
 └── history.db                  # SQLite — runs, findings, benchmarks (gitignored)
@@ -141,31 +143,157 @@ python scripts/stats.py
 
 ## Architecture Notes
 
+### Roster Policy — single source of truth
+
+All roster policy lives in `_common.resolve_roster` — dispatch.py and
+benchmark.py both call it; **never add inline roster filters elsewhere**.
+Semantics:
+
+| | explicit `--roster` names | profile-based roster |
+|---|---|---|
+| `disabled: true` | kept (explicit naming = intent) | dropped |
+| `custom_only: true` | kept | dropped |
+| `tier: free` | needs `--allow-free` | needs `--allow-free` |
+| `privacy: LOGS` | needs `--allow-logging` | needs `--allow-logging` |
+| host_rules `skip` | applied | applied |
+| host_rules `add` | **not** applied | applied |
+| not in registry | dropped | dropped |
+
+Drops are written to `dispatch_summary.json` under `"dropped"` and noted on
+stderr. `estimate_cost.py` rejects unknown reviewer names outright (exit 2,
+labeled `INVALID ROSTER` so it can't be mistaken for a cost block).
+
 ### Host-CLI Awareness
 
-`detect_host.py` inspects env + parent process tree. Returns `claude | codex | gemini | opencode | unknown`. The matching reviewer is removed from the roster; non-claude hosts auto-add `claude`.
+`detect_host.py` inspects env markers + the parent process tree. Returns
+`claude | codex | gemini | opencode | unknown`. Env markers are specific
+variables (e.g. `CLAUDECODE=1`, `CODEX_SANDBOX`) — credential-shaped vars
+like `CODEX_API_KEY` don't trigger detection. The process walk (psutil, up
+to 8 levels) matches argv[0]/argv[1] basenames only, never full command
+lines, so a wrapping shell containing `--roster codex` can't misdetect.
 
 ### Confidence Filter + Corroboration
 
-- Findings with effective confidence < 80 are dropped
-- +15 confidence bonus (cap 100) when >=2 reviewers flag the same file:line
-- Merge dedupes by exact file:line
+- Findings with effective confidence < 80 are dropped (`defaults.confidence_threshold`)
+- +15 confidence bonus, cap 100, when >=2 reviewers agree (`defaults.corroboration_boost`)
+- Merge clusters findings per file by anchor-based line proximity within
+  `defaults.merge_line_tolerance` (default ±3, `--line-tolerance` to override);
+  the cluster reports its median line, worst severity, and concatenated
+  descriptions. See README "Merge logic" for the dual-tolerance walk.
 
 ### Cost Gates
 
 | Mode | Warn | Hard block | Override |
 |------|------|-----------|----------|
 | review | $0.50 | $2.00 | `--yes-cost` |
-| benchmark | $10 | $30 | `--yes-cost` |
-| OR balance | (auto) | `available < safety × estimate` | `--skip-balance-check` |
+| benchmark | $10 | $30 | `--yes-cost` / `ARGUS_YES_COST=1` |
+| OR balance | warn in review mode | blocks in benchmark mode | `--skip-balance-check` |
+
+`estimate_cost.py` exit codes: `0` OK, `1` warn, `2` block or invalid roster
+(check stderr). Paid-CLI reviewers (`cost_per_m: null`) count as $0.
+
+### Subprocess Layer
+
+`_common.run_subprocess` resolves argv[0] via a cached `shutil.which`, pipes
+the prompt on **stdin** (never argv — Windows ARG_MAX), and on timeout kills
+the entire process tree: children run in their own session, `os.killpg` on
+POSIX, `taskkill /T /F` on Windows. Adapters return a uniform dict:
+`{route, cmd, exit_code, stdout, stderr, latency_sec}`.
+
+`load_config()` is `lru_cache`d and returns a **shared dict — treat it as
+read-only**. The prompt template read is cached too.
 
 ### JSON Extraction
 
-`_common.extract_json` handles: raw JSON, `<think>` reasoning blocks (Qwen3/DeepSeek-R1), fenced code blocks, and embedded objects by "findings" or "ok" key.
+`_common.extract_json` tries, in order: raw parse → strip `<think>` blocks
+(Qwen3/DeepSeek-R1) → fenced code blocks → balanced object containing
+`"findings"`/`"ok"` → a single O(n) string-aware pass over all balanced
+`{...}` spans (max 50 parse attempts). The scanner tracks string/escape
+state, so braces inside description values don't truncate the object.
+`normalize_findings` clamps off-enum severities to `medium`.
+
+### Benchmark Scoring
+
+```mermaid
+flowchart TB
+    A["dispatch reviewer x fixture x run"] --> B{"exit_code == 0?"}
+    B -->|no| Z["zero-score<br>P=R=F1=0, fn=len(issues)"]
+    B -->|yes| C{"JSON parsed?"}
+    C -->|"no (parse_error)"| Z
+    C -->|yes| D{"fixture has issues?"}
+    D -->|"no (clean baseline)"| E{"findings empty?"}
+    E -->|yes| P["perfect<br>P=R=F1=1.0"]
+    E -->|no| Z2["all false positives<br>P=R=F1=0"]
+    D -->|yes| F["match by file +<br>line within tolerance"]
+    F --> G["tp/fp/fn -> P, R, F1"]
+
+    style Z fill:#ffb6c1,stroke:#c71585
+    style Z2 fill:#ffb6c1,stroke:#c71585
+    style P fill:#98fb98,stroke:#2e8b57
+```
+
+The clean-baseline `1.0` applies **only to successful, parseable runs** —
+a reviewer whose call failed or returned prose is zero-scored everywhere,
+so broken reviewers can't rank above working ones. `aggregate_bench.py`
+applies the same rule when re-scoring per-reviewer JSONs from
+parallel-shell runs. Benchmark timestamps are UTC.
+
+### history.db Schema
+
+```mermaid
+erDiagram
+    runs ||--o{ reviewer_runs : "run_id"
+    runs ||--o{ findings : "run_id"
+    runs {
+        TEXT run_id PK
+        TEXT ts "UTC ISO-8601"
+        TEXT roster
+        TEXT diff_sha256
+        REAL latency_sec
+    }
+    reviewer_runs {
+        TEXT run_id PK,FK
+        TEXT reviewer PK
+        TEXT route
+        INTEGER exit_code
+        INTEGER fallback_used
+        REAL latency_sec
+        INTEGER n_findings
+        TEXT error
+    }
+    findings {
+        TEXT run_id PK,FK
+        INTEGER idx PK
+        TEXT file
+        INTEGER line
+        TEXT severity
+        INTEGER confidence
+        INTEGER n_reviewers
+        TEXT reviewers
+    }
+    benchmarks {
+        TEXT ts PK "UTC compact YYYYMMDDTHHMMSS"
+        TEXT reviewer PK
+        TEXT fixture PK
+        INTEGER run_idx PK
+        REAL precision
+        REAL recall
+        REAL f1
+        REAL latency_sec
+        TEXT error
+    }
+```
+
+`benchmarks` is keyed independently (no FK to `runs`) — review runs and
+benchmark runs are separate timelines. `stats.py --since` normalizes both
+timestamp formats to a common `YYYYMMDDTHHMMSS` prefix before comparing.
 
 ## Testing
 
 ```bash
+# Unit tests (no network, no API keys; also run by CI on every push/PR)
+python -m pytest tests/ -q
+
 # Verify all routes are reachable
 python scripts/verify.py --all
 
@@ -177,6 +305,12 @@ python scripts/estimate_cost.py --roster "glm-5.1,minimax-m2.7,gemini-or,codex"
   --diff <(git diff HEAD)
 ```
 
+Unit coverage: `extract_json` edge cases (think-blocks, fences,
+braces-in-strings, O(n) garbage handling), merge clustering/corroboration,
+benchmark scoring, and `run_subprocess` timeout + process-tree kill.
+Always dry-run a benchmark (`--runs 1 --fixtures <one>`) with a new roster
+before a full run — it catches provider-config bugs in ~30s instead of 40min.
+
 ## Code Style
 
 - Python 3.12+. Type hints where they clarify; not exhaustive.
diff --git a/README.md b/README.md
index 4b9826a..982df4d 100644
--- a/README.md
+++ b/README.md
@@ -50,6 +50,30 @@ Argus has two dispatch modes; the calling agent picks one per run:
   `dispatch.py --roster a,b,c,d` once. Argus's internal
   `defaults.max_parallel: 4` (config.yaml) still applies.
 
+```mermaid
+sequenceDiagram
+    participant Agent as calling agent
+    participant S1 as subagent 1..4 (wave)
+    participant D as dispatch.py
+    participant R as reviewer CLI / API
+    participant M as merge.py
+    Agent->>Agent: resolve roster + estimate cost
+    par one subagent per reviewer (max 4)
+        Agent->>S1: dispatch reviewer N
+        S1->>D: dispatch.py --roster NAME --run-dir RUN_DIR
+        D->>D: resolve_roster (policy + host rules)
+        D->>R: prompt via stdin
+        R-->>D: JSON findings
+        D->>D: write reviews/NAME.json
+        D-->>S1: dispatch_summary.json
+    end
+    Agent->>M: merge.py --run-dir RUN_DIR (exactly once)
+    M-->>Agent: merged.md + metrics.json + history.db row
+```
+
+Every roster entry produces a `reviews/<name>.json` — timeouts, crashes, and
+parse failures included — so `merge.py` always sees the full picture.
+
 See [`SKILL.md` step 6](SKILL.md) for the canonical execution recipe.
 
 ## Why
@@ -70,9 +94,9 @@ flowchart LR
     B --> C["roster resolution"]
     C --> D{"host detection"}
     D -->|claude| E1["skip claude"]
-    D -->|codex| E2["skip codex, add claude"]
-    D -->|gemini| E3["skip gemini, add claude"]
-    D -->|opencode| E4["skip opencode, add claude"]
+    D -->|codex| E2["skip codex (+claude for profiles)"]
+    D -->|gemini| E3["skip gemini (+claude for profiles)"]
+    D -->|opencode| E4["skip opencode (+claude for profiles)"]
     D -->|unknown| E5["no change"]
     E1 --> F
     E2 --> F
@@ -109,12 +133,12 @@ flowchart LR
 | `grok-4.20` | aichat → OR `x-ai/grok-4.20` | 2M ctx, pricey |
 | `deepseek-v3.2` | aichat → OR `deepseek/deepseek-v3.2` | cheap + fast |
 | `gemini-or` | aichat → OR `google/gemini-2.5-flash` | 2s/call, best value |
-| `gemini` | `gemini` CLI (paid sub) | disabled on Windows (.cmd tree-kill issue) |
+| `gemini` | `gemini` CLI (paid sub) | disabled pending Windows re-test of the tree-kill fix |
 | `codex` | `codex` CLI (paid sub) | GPT-5.x, thorough, slow |
-| `claude` | `claude` CLI (paid sub) | auto-added when host ≠ claude |
+| `claude` | `claude` CLI (paid sub) | auto-added to **profile** rosters when host ≠ claude |
 | `opencode` | `opencode` CLI (paid sub) | top performer, slow cold start |
 | `hermes-4.3` | aichat → Nous (fallback OR) | custom-only |
-| `copilot-gpt5` | GitHub `copilot` CLI | **disabled** — harness returns prose not JSON |
+| `copilot-gpt5` | GitHub `copilot` CLI | **disabled** — returned prose under the old prompt-as-arg invocation; re-test with the new stdin invocation |
 
 ## Profiles
 
@@ -148,7 +172,21 @@ flowchart LR
 | 10 | `hermes-4.3` | 0.551 | 0.646 | 0.653 | 13 |
 | 11 | `kimi-k2.6` | 0.505 | 0.729 | 0.575 | 83 |
 
-Your numbers will differ. Run `--benchmark` on your fixtures.
+```mermaid
+xychart-beta
+    title "F1 by reviewer (4 fixtures x 3 runs)"
+    x-axis ["opencode", "qwen-3.6", "glm-5.1", "gemini-or", "minimax", "mimo-v2", "codex", "deepseek", "grok-4.20", "hermes", "kimi-k2.6"]
+    y-axis "F1" 0 --> 1
+    bar [0.811, 0.761, 0.697, 0.681, 0.674, 0.652, 0.581, 0.572, 0.557, 0.551, 0.505]
+```
+
+Speed is a separate axis — `gemini-or` and `grok-4.20` answer in ~2s while
+`kimi-k2.6` takes ~83s for a *lower* F1; cost/latency/quality trade-offs are
+yours to pick per profile.
+
+Your numbers will differ. Run `--benchmark` on your fixtures. Failed or
+unparseable reviewer calls are zero-scored — a broken reviewer can't climb
+the board by "finding nothing" on the clean-baseline control.
 
 ---
 
@@ -246,11 +284,12 @@ python scripts/merge.py --run-dir "$RUN_DIR"
 | `--custom LIST` / `--models LIST` | one-off roster |
 | `--pr URL` / `--files GLOB` / `-` | diff source |
 | `--overlay {security,deep,audit}` | prompt overlay |
-| `--timeout N` | override 180s per-reviewer timeout |
-| `--yes-cost` / `ARGUS_YES_COST=1` | bypass cost + OR balance blocks |
+| `--timeout N` | override the 360s per-reviewer timeout (`defaults.reviewer_timeout_sec`) |
+| `--yes-cost` / `ARGUS_YES_COST=1` | downgrade a cost block to a warning |
 | `--skip-balance-check` | skip OR balance pre-flight |
 | `--allow-free` | include free-tier reviewers |
 | `--allow-logging` | include reviewers that log prompts |
+| `--line-tolerance N` | merge clustering tolerance (`defaults.merge_line_tolerance`, default 3) |
 | `--output {md,json,gsd}` | output format (`gsd` → `REVIEW.md` for `gsd-code-review-fix`) |
 | `--save-as NAME` | persist `--custom` roster as profile |
 
@@ -288,6 +327,8 @@ flowchart TB
 
 **Why dual tolerance.** A single forward-walking check causes chain drift — findings at lines 10, 13, 16, 19 would all collapse into one cluster (each within ±3 of the previous) even though lines 10 and 19 are 9 apart. Anchor-check alone handles this, but can reject near-duplicates that happen to land just over the anchor tolerance. Both checks together: tight when cluster stays compact, rejects drift when it doesn't.
 
+The tolerance is configurable: `defaults.merge_line_tolerance` in config.yaml, or `--line-tolerance` per run. The reported line for a cluster is the **median** of its members; the worst severity in the cluster wins; descriptions from each reviewer are concatenated, highest confidence first.
+
 ---
 
 ## Cost control
@@ -298,20 +339,27 @@ Two independent gates, both enforceable:
 flowchart LR
     A["Roster + diff"] --> B["Estimate tokens x rates"]
     B --> C{"est >= threshold?<br>review: $0.50 / $2<br>bench: $10 / $30"}
-    C -->|at or above block| X1["BLOCK<br>override: --yes-cost"]
+    C -->|at or above block| X1["BLOCK exit 2<br>override: --yes-cost"]
     C -->|below block| D{"any OR reviewer<br>in roster?"}
     D -->|no| F["dispatch"]
     D -->|yes| E["probe OR balance<br>auth/key + credits"]
     E --> G{"available >=<br>safety x est?"}
-    G -->|no| X2["BLOCK<br>override: --yes-cost<br>or --skip-balance-check"]
+    G -->|"no (review mode)"| W["WARN exit 1<br>top up before dispatching"]
+    G -->|"no (benchmark mode)"| X2["BLOCK<br>override: --yes-cost"]
     G -->|yes| F
+    W --> F
     F --> Z["run"]
 
     style X1 fill:#ffb6c1,stroke:#c71585
     style X2 fill:#ffb6c1,stroke:#c71585
+    style W fill:#ffe4b5,stroke:#b8860b
     style Z fill:#98fb98,stroke:#2e8b57
 ```
 
+`estimate_cost.py` exit codes: `0` OK · `1` warn (soft threshold or low OR
+balance) · `2` block **or invalid roster** (unknown reviewer names fail fast —
+check stderr; `--yes-cost` overrides a cost block only).
+
 | Mode | Warn | Hard block | Override |
 |---|---|---|---|
 | review | $0.50 | $2.00 | `--yes-cost` / `ARGUS_YES_COST=1` |
@@ -332,7 +380,9 @@ Paid-CLI reviewers (Gemini, Codex, Claude, OpenCode, Copilot) incur no tracked c
 | opencode | `opencode` | `claude` |
 | unknown | — | — |
 
-Rule: never ask the host CLI to review its own invocation. Always ensure Claude is in the mix when Claude isn't the host. Detection via env vars + parent-process walk (psutil, up to 8 levels).
+Rule: never ask the host CLI to review its own invocation — the `skip` column always applies. The `add` column applies to **profile-based rosters only**: an explicit `--roster` list is treated as intent and never silently gains (or keeps `disabled:`/`custom_only:` blocks against) reviewers you named yourself. Anything dropped is recorded in `dispatch_summary.json` under `"dropped"`.
+
+Detection: specific env markers per host (e.g. `CLAUDECODE=1`; a stray `CODEX_API_KEY` in your shell profile won't trigger it), then a parent-process walk (psutil, up to 8 levels) matching executable names only — never full command lines.
 
 ---
 
@@ -364,7 +414,7 @@ Produces:
 - `benchmarks/<ts>.json` — full machine-readable data
 - rows in `history.db:benchmarks` — longitudinal comparison
 
-The aggregator re-scores from `tp/fp/fn` per run, correctly handles clean-baseline (P=R=F1=1.0 when reviewer correctly finds nothing), and produces the unified leaderboard.
+The aggregator re-scores from `tp/fp/fn` per run, correctly handles clean-baseline (P=R=F1=1.0 when a **successful** reviewer correctly finds nothing), zero-scores failed/timed-out/unparseable calls, and produces the unified leaderboard.
 
 ---
 
@@ -416,7 +466,7 @@ Seeded fixtures:
 
 ## Quality mechanics (summary)
 
-1. **Strict-schema JSON** — reviewers return `{findings: [{file, line, severity, category, description, confidence}]}`. Extractor tolerates fenced blocks, `<think>` prefixes, stray prose.
+1. **Strict-schema JSON** — reviewers return `{findings: [{file, line, severity, category, description, confidence}]}`. Extractor tolerates fenced blocks, `<think>` prefixes, stray prose, and braces inside string values; the last-resort scan is a single O(n) pass with bounded parse attempts, so garbage output can't stall a run. Off-enum severities are clamped to `medium` rather than dropped.
 2. **Context-window pre-check** — skip reviewer if prompt > 70% of their ctx (reviewer's `skip_reason` logged; other reviewers continue).
 3. **Confidence threshold** — drop findings with effective confidence < 80.
 4. **Anchor-based clustering + corroboration boost** — +15 confidence (cap 100) when ≥ 2 reviewers agree within ±3 lines on the same file.
@@ -424,6 +474,7 @@ Seeded fixtures:
 6. **OpenRouter reasoning-exclude** — `patch.chat_completions` applies `reasoning: {exclude: true, max_tokens: 2000}` + `provider: {ignore: [io.net, together.ai]}` to avoid providers that return `content: null` with reasoning-only.
 7. **Per-reviewer incremental writes** — in benchmark mode, reviewer JSONs land as each reviewer completes. Tailable during runs.
 8. **Fallback routing** — every reviewer has an optional fallback (typically OR); sum of primary + fallback latency is reported so you see real wall time.
+9. **Failure isolation** — one broken reviewer never kills the run: per-reviewer exception handling, `gather(return_exceptions=True)`, and a guaranteed `reviews/<name>.json` artifact for every roster entry. On timeout the entire subprocess tree is killed (`killpg` / `taskkill /T /F`) — no orphaned node processes.
 
 ---
 
@@ -468,6 +519,12 @@ argus/
 │   ├── race-refund/
 │   ├── secrets-leak/
 │   └── clean-baseline/
+├── tests/                      # unit tests (pytest; run in CI)
+│   ├── test_extract_json.py
+│   ├── test_merge.py
+│   ├── test_run_subprocess.py
+│   └── test_score.py
+├── .github/workflows/ci.yml    # GitHub Actions — pytest on push/PR
 ├── runs/                       # per-invocation artifacts (gitignored)
 ├── benchmarks/                 # leaderboard outputs (gitignored)
 └── history.db                  # SQLite — runs, findings, benchmarks (gitignored)
diff --git a/SKILL.md b/SKILL.md
index e491e3f..1493303 100644
--- a/SKILL.md
+++ b/SKILL.md
@@ -69,8 +69,11 @@ Apply `host_rules[$HOST]` from config.yaml: remove `skip` list, append `add` lis
 
 1. Load config.yaml
 2. Base list from `--profile` / `--custom` / `--models` / default profile
-3. Apply host rules (step 2 above)
+3. Apply host rules (step 2 above). For explicitly-named rosters
+   (`--custom` / `--models`) apply only the `skip` list — never inject
+   `add` reviewers the user didn't ask for.
 4. Apply filters:
+   - drop reviewers with `disabled: true` unless explicitly named
    - drop reviewers with `tier: free` unless `--allow-free`
    - drop reviewers with `privacy: LOGS` unless `--allow-logging`
    - drop reviewers with `custom_only: true` unless explicitly named
@@ -78,6 +81,11 @@ Apply `host_rules[$HOST]` from config.yaml: remove `skip` list, append `add` lis
 
 If roster is empty, abort with message and list disabled reasons.
 
+Defense in depth: `dispatch.py` and `benchmark.py` enforce this same policy
+internally via `_common.resolve_roster` (drops recorded in
+`dispatch_summary.json` under `"dropped"`), and `estimate_cost.py` rejects
+unknown reviewer names — so a skipped filter here fails safe downstream.
+
 ## 4. Gather scope → diff
 
 Produce a single diff.patch in the new run directory:
@@ -158,7 +166,8 @@ run `merge.py` exactly once in the main session (step 7).
 python "$ARGUS_HOME/scripts/merge.py" --run-dir "$RUN_DIR"
 ```
 
-Applies confidence threshold, corroboration boost, dedupes by file:line,
+Applies the confidence threshold, corroboration boost, and anchor-based
+±3-line clustering (`defaults.merge_line_tolerance` / `--line-tolerance`),
 writes `merged.md` and `metrics.json`. Also appends a row to `history.db`.
 
 ## 8. Present

From 3fb38ae55729e36f397b7c5e6485ad6e594030a6 Mon Sep 17 00:00:00 2001
From: Jim Stratus <stratus@eotir.com>
Date: Wed, 10 Jun 2026 07:04:10 -0700
Subject: [PATCH 14/14] fix(review): require a findings list before treating
 parsed JSON as a valid review

extract_json can recover an arbitrary inner object from prose, so a parsed
dict without a findings list is still a failed review: benchmark mode now
sets parse_error (zero-scored instead of F1=1.0 on clean-baseline) and
dispatch records parse_error + raw_preview instead of silently reporting
zero findings. A legitimate {"findings": []} still counts as success.

Confidence: high
Scope-risk: narrow
---
 scripts/benchmark.py | 6 ++++--
 scripts/dispatch.py  | 7 ++++---
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/scripts/benchmark.py b/scripts/benchmark.py
index dfb3d32..624b842 100644
--- a/scripts/benchmark.py
+++ b/scripts/benchmark.py
@@ -144,8 +144,10 @@ async def _dispatch(name: str, spec: dict, prompt: str, timeout: int) -> dict:
     parse_error = False
     if r["exit_code"] == 0:
         parsed = extract_json(r["stdout"])
-        if isinstance(parsed, dict):
-            findings = normalize_findings(parsed.get("findings", []))
+        # Require the schema's findings list — extract_json can recover an
+        # arbitrary inner object from prose, which is still a failed review.
+        if isinstance(parsed, dict) and isinstance(parsed.get("findings"), list):
+            findings = normalize_findings(parsed["findings"])
         else:
             parse_error = True
     total_latency = primary_latency + fallback_latency
diff --git a/scripts/dispatch.py b/scripts/dispatch.py
index a10a45c..a3b0c75 100644
--- a/scripts/dispatch.py
+++ b/scripts/dispatch.py
@@ -75,13 +75,14 @@ async def _dispatch_one(name: str, spec: dict, prompt: str, timeout: int) -> dic
         return result
 
     parsed = extract_json(stdout)
-    if parsed is None:
+    # Require the schema's findings list — extract_json can recover an
+    # arbitrary inner object from prose, which is still a failed review.
+    if not isinstance(parsed, dict) or not isinstance(parsed.get("findings"), list):
         result["parse_error"] = True
         result["raw_preview"] = stdout[:2000]
         return result
 
-    findings_raw = parsed.get("findings") if isinstance(parsed, dict) else None
-    result["findings"] = normalize_findings(findings_raw or [])
+    result["findings"] = normalize_findings(parsed["findings"])
     return result