From 68137659cfc3b5022f1a354dc822c348dec5d8d9 Mon Sep 17 00:00:00 2001 From: Jim Stratus Date: Wed, 10 Jun 2026 01:52:14 -0700 Subject: [PATCH 01/14] fix(dispatch): isolate reviewer exceptions + enforce roster policy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit asyncio.gather now runs with return_exceptions=True and _dispatch_one is wrapped in try/except, so one misconfigured reviewer (e.g. KeyError from a bad aichat client key) can no longer cancel sibling reviewers or prevent dispatch_summary.json from being written. Wire resolve_roster + detect_host into roster parsing: disabled, tier, privacy, custom_only and host_rules policies are now enforced on every dispatch (previously resolve_roster had zero callers and the documented claude-host auto-skip never happened). Explicitly-named reviewers count as explicit_custom_only so custom-only entries stay reachable. Confidence: high Scope-risk: moderate Directive: keep return_exceptions=True + per-reviewer try/except — one broken reviewer must never kill the run Not-tested: live multi-reviewer dispatch with a real misconfigured route --- scripts/dispatch.py | 36 ++++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/scripts/dispatch.py b/scripts/dispatch.py index 1c16f23..eb1a141 100644 --- a/scripts/dispatch.py +++ b/scripts/dispatch.py @@ -19,8 +19,9 @@ sys.path.insert(0, str(Path(__file__).resolve().parent)) from _common import ( load_config, build_prompt, estimate_tokens, extract_json, - normalize_findings, ARGUS_HOME, + normalize_findings, resolve_roster, ) +from detect_host import detect as detect_host import adapters @@ -91,7 +92,21 @@ async def _main_async(args) -> int: max_parallel = int(defaults["max_parallel"]) ctx_safety = float(defaults["ctx_safety_ratio"]) - roster = [r.strip() for r in args.roster.split(",") if r.strip()] + names = [r.strip() for r in args.roster.split(",") if r.strip()] + host, _ = detect_host() + # Explicitly-named reviewers count as explicit_custom_only so custom_only + # entries stay usable; disabled/tier/privacy/host_rules policies still apply. + roster, drops = resolve_roster( + cfg, "custom", names, host, + allow_free=bool(args.allow_free or defaults.get("allow_free")), + allow_logging=bool(args.allow_logging or defaults.get("allow_logging")), + explicit_custom_only=set(names), + ) + for n, reason in drops: + sys.stderr.write(f"roster: dropped {n} — {reason}\n") + if not roster: + sys.stderr.write("roster: empty after policy filtering\n") + return 1 diff = Path(args.diff).read_text(encoding="utf-8", errors="replace") prompt = build_prompt(diff, overlay=args.overlay) @@ -121,11 +136,22 @@ async def _bounded(name: str) -> dict: (reviews_dir / f"{name}.json").write_text(json.dumps(r, indent=2), encoding="utf-8") return r - r = await _dispatch_one(name, spec, prompt, timeout) + try: + r = await _dispatch_one(name, spec, prompt, timeout) + except Exception as e: + r = {"name": name, "findings": [], "exit_code": 1, + "error": f"{type(e).__name__}: {e}"} (reviews_dir / f"{name}.json").write_text(json.dumps(r, indent=2), encoding="utf-8") return r - results = await asyncio.gather(*[_bounded(n) for n in roster]) + gathered = await asyncio.gather(*[_bounded(n) for n in roster], return_exceptions=True) + results = [] + for n, r in zip(roster, gathered): + if isinstance(r, BaseException): + results.append({"name": n, "findings": [], "exit_code": 1, + "error": f"{type(r).__name__}: {r}"}) + else: + results.append(r) summary = { "roster": roster, @@ -157,6 +183,8 @@ def main() -> int: ap.add_argument("--diff", required=True) ap.add_argument("--overlay", default=None, help="prompt overlay name (security|deep|...)") ap.add_argument("--timeout", type=int, default=None, help="override reviewer_timeout_sec for this run") + ap.add_argument("--allow-free", action="store_true", help="include free-tier reviewers") + ap.add_argument("--allow-logging", action="store_true", help="include reviewers with privacy: LOGS") args = ap.parse_args() return asyncio.run(_main_async(args)) From a4dc34ad6fb97e27a503de9cd3571e3b1ada2a46 Mon Sep 17 00:00:00 2001 From: Jim Stratus Date: Wed, 10 Jun 2026 01:52:14 -0700 Subject: [PATCH 02/14] fix(bench): zero-score failed calls instead of perfect F1 on clean-baseline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit _bench_reviewer now scores any run with non-zero exit_code as P=R=F1=0 (a failed call is not "correctly found nothing"), and aggregate_bench's _rescore_run skips the tp==fp==fn==0 -> 1.0 clean-baseline rule for runs with an exit_code/error — previously wall-cap stubs were rescored from a deliberate 0.0 back to a perfect 1.0. Also drop disabled reviewers from profile-derived benchmark rosters (explicit --roster still forces them). Constraint: clean-baseline 1.0 only for successful runs with zero findings Confidence: high Scope-risk: narrow --- scripts/aggregate_bench.py | 5 +++++ scripts/benchmark.py | 23 ++++++++++++++++++++--- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/scripts/aggregate_bench.py b/scripts/aggregate_bench.py index 528106b..23823ff 100644 --- a/scripts/aggregate_bench.py +++ b/scripts/aggregate_bench.py @@ -26,7 +26,12 @@ def _rescore_run(run: dict) -> dict: """Recompute P/R/F1 from tp/fp/fn with the clean-baseline rule (repairs pre-fix data). Clean-baseline rule: tp==fp==fn==0 → P=R=F1=1.0 (reviewer correctly found nothing). + Failed calls (non-zero exit_code or an error, e.g. wall-cap stubs) are + zero-scored — tp==fp==fn==0 there means "never ran", not "found nothing". """ + if int(run.get("exit_code", 0) or 0) != 0 or run.get("error"): + run["precision"], run["recall"], run["f1"] = 0.0, 0.0, 0.0 + return run tp = int(run.get("tp", 0) or 0) fp = int(run.get("fp", 0) or 0) fn = int(run.get("fn", 0) or 0) diff --git a/scripts/benchmark.py b/scripts/benchmark.py index fa2e676..1b9cf18 100644 --- a/scripts/benchmark.py +++ b/scripts/benchmark.py @@ -186,7 +186,13 @@ async def _bench_reviewer(name: str, spec: dict, fixtures: list[dict], except Exception as e: await _log_progress(f"{name:<16} {fx['name']:<18} run {idx+1}/{runs} EXCEPTION: {type(e).__name__}: {str(e)[:80]}") d = {"findings": [], "latency_sec": 0.0, "exit_code": 1, "error": f"{type(e).__name__}: {e}"} - scored = _score(d["findings"], fx["ground_truth"]) + if d.get("exit_code", 1) != 0: + # A failed call is not "correctly found nothing" — zero-score it + # so broken reviewers can't earn F1=1.0 on clean-baseline. + scored = {"tp": 0, "fp": 0, "fn": len(fx["ground_truth"].get("issues", [])), + "precision": 0.0, "recall": 0.0, "f1": 0.0} + else: + scored = _score(d["findings"], fx["ground_truth"]) run_data.append({ "run_idx": idx, "n_findings": len(d["findings"]), @@ -402,8 +408,19 @@ async def _main_async(args) -> int: else: roster = list(cfg["profiles"]["panel"]["members"]) - # Filter out reviewers not in registry - roster = [n for n in roster if n in cfg["reviewers"]] + # Filter out reviewers not in registry; drop disabled ones unless the user + # named them explicitly via --roster (explicit naming = intent to test). + explicit = bool(args.roster) + kept = [] + for n in roster: + if n not in cfg["reviewers"]: + print(f"skip {n}: not in registry", file=sys.stderr) + continue + if cfg["reviewers"][n].get("disabled") and not explicit: + print(f"skip {n}: disabled in config (name via --roster to force)", file=sys.stderr) + continue + kept.append(n) + roster = kept total_calls = len(roster) * len(fixtures) * args.runs print(f"Benchmarking {len(roster)} reviewers × {len(fixtures)} fixtures × {args.runs} runs = {total_calls} calls", file=sys.stderr) From b530397438ba4a2ad9e88e7d7636d5fbd0f90782 Mon Sep 17 00:00:00 2001 From: Jim Stratus Date: Wed, 10 Jun 2026 01:52:14 -0700 Subject: [PATCH 03/14] fix(cost): honor 0/1/2 exit contract, add --yes-cost, reject unknown reviewers The trailing return-1 inside the balance-check block made every default invocation exit warn even at $0.00 (and --skip-balance-check masked real warns as 0). Exit code is now computed once: 0 ok, 1 warn (threshold or OR balance), 2 block. --yes-cost exists now and downgrades a block to a warn, matching the message and SKILL.md. Unknown roster names fail fast with exit 2 instead of being costed as $0 "paid CLI sub". Rejected: removing the --yes-cost mention from the block message | docs and CLAUDE.md promise the override Confidence: high Scope-risk: narrow --- scripts/estimate_cost.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/scripts/estimate_cost.py b/scripts/estimate_cost.py index 41b8d40..d3bfd1e 100644 --- a/scripts/estimate_cost.py +++ b/scripts/estimate_cost.py @@ -28,6 +28,7 @@ def main() -> int: ap.add_argument("--runs-per-fixture", type=int, default=1) ap.add_argument("--fixtures", type=int, default=1) ap.add_argument("--skip-balance-check", action="store_true", help="skip the OpenRouter balance pre-flight") + ap.add_argument("--yes-cost", action="store_true", help="downgrade a cost block to a warning (exit 1 instead of 2)") args = ap.parse_args() cfg = load_config() @@ -39,6 +40,14 @@ def main() -> int: in_tokens = estimate_tokens(diff_text) + prompt_overhead roster = [r.strip() for r in args.roster.split(",") if r.strip()] + unknown = [n for n in roster if n not in cfg["reviewers"]] + if unknown: + sys.stderr.write( + f"ERROR: unknown reviewer(s): {', '.join(unknown)}. " + f"Known: {', '.join(sorted(cfg['reviewers']))}\n" + ) + return 2 + rows = [] total = 0.0 per_unit_multiplier = args.runs_per_fixture * args.fixtures @@ -78,11 +87,17 @@ def main() -> int: } print(json.dumps(out, indent=2)) + rc = 0 if total >= block: - sys.stderr.write(f"\nBLOCK: estimated ${total:.2f} exceeds hard cap ${block:.2f}. Pass --yes-cost to override.\n") - return 2 - if total >= warn: + if args.yes_cost: + sys.stderr.write(f"\nWARN: estimated ${total:.2f} exceeds hard cap ${block:.2f} — overridden with --yes-cost.\n") + rc = 1 + else: + sys.stderr.write(f"\nBLOCK: estimated ${total:.2f} exceeds hard cap ${block:.2f}. Pass --yes-cost to override.\n") + return 2 + elif total >= warn: sys.stderr.write(f"\nWARN: estimated ${total:.2f} exceeds soft threshold ${warn:.2f}.\n") + rc = 1 # OR balance pre-flight for non-dry invocations if not args.skip_balance_check: @@ -99,12 +114,11 @@ def main() -> int: available = info.get("available_usd") safety = float(d.get("or_balance_safety_factor", 2.0)) if available is not None and available < total * safety: - sys.stderr.write(f"\nOR BALANCE WARN: available ${available:.4f} < {safety}× estimate ${total:.4f}. Top up or use --yes-cost at dispatch time.\n") - return max(1, 1) # warn-level; dispatch will block via its own gate + sys.stderr.write(f"\nOR BALANCE WARN: available ${available:.4f} < {safety}× estimate ${total:.4f}. Top up before dispatching.\n") + rc = max(rc, 1) except Exception as e: sys.stderr.write(f"OR balance check failed (non-fatal): {e}\n") - return 1 - return 0 + return rc if __name__ == "__main__": From d26b5687ad158ca2869f52059fc68bf53942c290 Mon Sep 17 00:00:00 2001 From: Jim Stratus Date: Wed, 10 Jun 2026 01:52:14 -0700 Subject: [PATCH 04/14] fix(common): string-aware brace scan in extract_json; clamp off-enum severities The balanced-brace fallback counted braces inside JSON string values, so a finding description containing "}" truncated the chunk and dropped the whole reviewer response as parse_error. The scanner now tracks in-string/escape state. normalize_findings clamps severities outside the prompt enum to medium so _to_gsd and SEVERITY_RANK consumers can no longer silently drop or mis-rank them. Adds regression tests for both. Confidence: high Scope-risk: narrow Not-tested: exotic reviewer outputs with unbalanced quotes outside JSON --- scripts/_common.py | 21 ++++++++++++++++++--- tests/test_extract_json.py | 33 ++++++++++++++++++++++++++++++++- 2 files changed, 50 insertions(+), 4 deletions(-) diff --git a/scripts/_common.py b/scripts/_common.py index 77d1198..f41c785 100644 --- a/scripts/_common.py +++ b/scripts/_common.py @@ -103,9 +103,22 @@ def extract_json(text: str) -> dict | None: if start < 0: continue depth = 0 + in_str = False + esc = False for i in range(start, len(text)): c = text[i] - if c == "{": + if in_str: + # Braces inside string values must not move the depth counter. + if esc: + esc = False + elif c == "\\": + esc = True + elif c == '"': + in_str = False + continue + if c == '"': + in_str = True + elif c == "{": depth += 1 elif c == "}": depth -= 1 @@ -115,7 +128,6 @@ def extract_json(text: str) -> dict | None: return json.loads(chunk) except Exception: break - break return None @@ -127,10 +139,13 @@ def normalize_findings(raw: Any) -> list[dict]: if not isinstance(f, dict): continue try: + sev = str(f.get("severity", "medium")).lower().strip() + if sev not in SEVERITY_RANK: + sev = "medium" out.append({ "file": str(f.get("file", "")).strip(), "line": int(f.get("line", 0) or 0), - "severity": str(f.get("severity", "medium")).lower().strip(), + "severity": sev, "category": str(f.get("category", "bug")).lower().strip(), "description": str(f.get("description", "")).strip(), "confidence": max(0, min(100, int(f.get("confidence", 50) or 50))), diff --git a/tests/test_extract_json.py b/tests/test_extract_json.py index 0961c2f..e5c3cfe 100644 --- a/tests/test_extract_json.py +++ b/tests/test_extract_json.py @@ -6,7 +6,7 @@ sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "scripts")) -from _common import extract_json # noqa: E402 +from _common import extract_json, normalize_findings # noqa: E402 def test_direct_json(): @@ -57,6 +57,34 @@ def test_nested_objects(): assert r["findings"][0]["meta"]["nested"] is True +def test_brace_inside_string_value(): + """Braces inside string values must not break the balanced-brace scan.""" + text = ('Here is my review: {"findings": [{"file": "a.py", "line": 1, ' + '"description": "unmatched } and { braces in a format string"}]}') + r = extract_json(text) + assert isinstance(r, dict) + assert r["findings"][0]["file"] == "a.py" + + +def test_escaped_quote_inside_string_value(): + text = ('prose {"findings": [{"file": "a.py", ' + '"description": "says \\"x}\\" here"}]}') + r = extract_json(text) + assert isinstance(r, dict) + assert "x}" in r["findings"][0]["description"] + + +def test_normalize_findings_clamps_unknown_severity(): + """Off-enum severities map to medium so no consumer silently drops them.""" + raw = [ + {"file": "a.py", "line": 1, "severity": "BLOCKER", "description": "x", "confidence": 90}, + {"file": "b.py", "line": 2, "severity": "high", "description": "y", "confidence": 90}, + ] + out = normalize_findings(raw) + assert out[0]["severity"] == "medium" + assert out[1]["severity"] == "high" + + if __name__ == "__main__": import traceback tests = [ @@ -67,6 +95,9 @@ def test_nested_objects(): test_empty_input_returns_none, test_malformed_returns_none, test_nested_objects, + test_brace_inside_string_value, + test_escaped_quote_inside_string_value, + test_normalize_findings_clamps_unknown_severity, ] failed = 0 for t in tests: From 177b057b5845176d4925c319935f5d6041ced0f1 Mon Sep 17 00:00:00 2001 From: Jim Stratus Date: Wed, 10 Jun 2026 01:52:14 -0700 Subject: [PATCH 05/14] fix(adapter): copilot-cli pipes prompt via stdin (Windows ARG_MAX) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Same fix as gemini-cli (91b2f2e): the full prompt left argv — where >32 KB diffs hit the Windows CreateProcess command-line limit — and is now piped on stdin with a short -p pointer. This also removes the broken redaction (cmd[:6] kept the full prompt at index 2 while appending a decoy token). Constraint: Copilot CLI programmatic mode combines piped stdin with -p Confidence: medium Scope-risk: narrow Not-tested: live copilot CLI call (not installed in this environment); verify on Windows before relying on the copilot reviewer --- config.yaml | 6 +++--- scripts/adapters/copilot_cli.py | 23 +++++++++-------------- 2 files changed, 12 insertions(+), 17 deletions(-) diff --git a/config.yaml b/config.yaml index 6f59a60..9d79643 100644 --- a/config.yaml +++ b/config.yaml @@ -293,6 +293,6 @@ cli_commands: codex-cli: ["codex", "exec", "--skip-git-repo-check", "-"] # stdin input; skip git-repo trust check claude-cli: ["claude", "-p", "--output-format", "text", "--bare"] # stdin opencode-cli: ["opencode", "run", "-"] # stdin input - copilot-cli: ["copilot", "-p", "{prompt}", "--model", "{model}", - "--allow-all-tools", "--no-color", - "--output-format", "text"] # prompt-as-arg (~30KB Windows arg limit) + copilot-cli: ["copilot", "-p", "Follow the review instructions provided on stdin.", + "--model", "{model}", "--allow-all-tools", "--no-color", + "--output-format", "text"] # prompt via STDIN (Windows ARG_MAX; same fix as gemini-cli) diff --git a/scripts/adapters/copilot_cli.py b/scripts/adapters/copilot_cli.py index 324e6d6..e29e704 100644 --- a/scripts/adapters/copilot_cli.py +++ b/scripts/adapters/copilot_cli.py @@ -1,9 +1,12 @@ """GitHub Copilot CLI adapter. Copilot CLI uses GPT-family models (e.g., gpt-5.2) through the user's paid -GitHub Copilot subscription. Invocation: - copilot -p "" --model --allow-all-tools \ - --no-color --no-banner --output-format text +GitHub Copilot subscription. The full prompt is piped via STDIN (combined by +the CLI with the short -p pointer) — the previous {prompt}-in-argv approach +breaks on Windows for prompts > ~32 KB (ARG_MAX limit), the same bug fixed +for gemini-cli. Invocation: + copilot -p "" --model --allow-all-tools \ + --no-color --output-format text Value vs direct GPT routes: - Uses the user's Copilot paid sub (no API spend) @@ -26,22 +29,14 @@ async def send(prompt: str, route_cfg: dict, timeout: int) -> dict: cfg = load_config() template = list(cfg["cli_commands"]["copilot-cli"]) model = route_cfg.get("model", "") - cmd = [] - for part in template: - if part == "{model}": - cmd.append(model) - elif part == "{prompt}": - cmd.append(prompt) - else: - cmd.append(part) + cmd = [model if part == "{model}" else part for part in template] env = os.environ.copy() env.setdefault("COPILOT_ALLOW_ALL", "1") - # Prompt already embedded as arg; no stdin needed - rc, stdout, stderr, dt = await run_subprocess(cmd, "", timeout, env=env) + rc, stdout, stderr, dt = await run_subprocess(cmd, prompt, timeout, env=env) return { "route": "copilot-cli", "model": model, - "cmd": cmd[:6] + ["...", ""] if len(cmd) > 6 else cmd, + "cmd": cmd, "exit_code": rc, "stdout": stdout, "stderr": stderr, From abd6f144e6b5534d496d0b0c5e995f4dbdbae9d4 Mon Sep 17 00:00:00 2001 From: Jim Stratus Date: Wed, 10 Jun 2026 01:52:14 -0700 Subject: [PATCH 06/14] fix(stats,host): --since works on benchmarks table; codex detection markers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stats.py compared dashed ISO --since values against the benchmarks table's compact %Y%m%dT%H%M%S timestamps, where '0' > '-' at index 4 made every 2026+ row pass any cutoff. Both sides are now normalized (strip -/:) before the lexical compare. detect_host.py no longer treats any CODEX_* env var as proof of a Codex host — a user's exported CODEX_API_KEY misrouted the roster; detection now matches specific session markers only. Confidence: high Scope-risk: narrow Not-tested: exact env markers a live Codex CLI session sets; parent-process walk remains the backstop --- scripts/detect_host.py | 9 +++++++-- scripts/stats.py | 7 +++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/scripts/detect_host.py b/scripts/detect_host.py index ff08e73..2a3798b 100644 --- a/scripts/detect_host.py +++ b/scripts/detect_host.py @@ -30,8 +30,13 @@ def detect() -> tuple[str, list[str]]: if "CLAUDE_CODE_SESSION" in env or "CLAUDE_CODE_SIMPLE" in env: signals.append("env:CLAUDE_CODE_*") return "claude", signals - if any(k.startswith("CODEX_") for k in env): - signals.append("env:CODEX_*") + # Specific session markers only — a prefix scan would misfire on user + # credentials like CODEX_API_KEY exported in a shell profile. + codex_markers = ("CODEX_SANDBOX", "CODEX_SANDBOX_NETWORK_DISABLED", + "CODEX_THREAD_ID", "CODEX_SESSION_ID") + codex_hit = next((k for k in codex_markers if k in env), None) + if codex_hit: + signals.append(f"env:{codex_hit}") return "codex", signals if env.get("GEMINI_CLI") == "1" or "GEMINI_CLI_SESSION" in env: signals.append("env:GEMINI_CLI") diff --git a/scripts/stats.py b/scripts/stats.py index 9c9efd0..54f4d41 100644 --- a/scripts/stats.py +++ b/scripts/stats.py @@ -31,8 +31,11 @@ def main() -> int: where = "" params: list = [] if args.since: - where = "WHERE ts >= ?" - params = [args.since] + # runs.ts is dashed ISO-8601 ('2026-06-10T12:00:00+00:00') while + # benchmarks.ts is compact ('20260610T120000'); strip '-' and ':' from + # both sides so the lexical compare is valid for either format. + where = "WHERE REPLACE(REPLACE(ts, '-', ''), ':', '') >= ?" + params = [args.since.replace("-", "").replace(":", "")] # Per-reviewer aggregates from reviewer_runs cur.execute(f""" From 3dc1e3cfc7b804d1cf2b2c5611dec96389ecba17 Mon Sep 17 00:00:00 2001 From: Jim Stratus Date: Wed, 10 Jun 2026 02:15:07 -0700 Subject: [PATCH 07/14] fix(roster): explicit --roster semantics; one policy engine for dispatch+benchmark MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round-1's resolve_roster wiring applied profile-grade policy to explicit rosters: explicitly-named disabled reviewers were hard-dropped with no force option, and host_rules add injected claude into every dispatch — under the SKILL.md subagent-per-reviewer pattern that meant K concurrent claude calls racing on reviews/claude.json. resolve_roster gains explicit=True: named reviewers waive disabled/custom_only, host add is not applied; host skip and tier/privacy gates still hold. benchmark.py now uses the same resolver (profiles get full policy incl. the claude-host auto-skip), dispatch records drops in dispatch_summary.json, and estimate_cost's unknown-reviewer error is labeled INVALID ROSTER so it cannot be read as a cost BLOCK. Constraint: explicit naming = intent; host skip is never waived (nested-invocation safety) Rejected: keeping benchmark's inline filter | third divergent policy copy was the bug Confidence: high Scope-risk: moderate Directive: roster policy lives ONLY in resolve_roster — never add inline filters --- scripts/_common.py | 142 +++++++++++++++++++++++++++++++-------- scripts/benchmark.py | 71 +++++++++++--------- scripts/dispatch.py | 12 ++-- scripts/estimate_cost.py | 9 +-- 4 files changed, 166 insertions(+), 68 deletions(-) diff --git a/scripts/_common.py b/scripts/_common.py index f41c785..ae4adc5 100644 --- a/scripts/_common.py +++ b/scripts/_common.py @@ -2,9 +2,11 @@ from __future__ import annotations import asyncio +import functools import json import os import re +import signal import sqlite3 import subprocess import sys @@ -28,19 +30,27 @@ SEVERITY_RANK = {"critical": 0, "high": 1, "medium": 2, "low": 3} +@functools.lru_cache(maxsize=8) def load_config(path: Path | None = None) -> dict: + """Parse config.yaml once per process. Returns a shared cached dict — + treat it as read-only (every adapter call goes through here).""" p = path or CONFIG_PATH with open(p, "r", encoding="utf-8") as f: return yaml.safe_load(f) +@functools.lru_cache(maxsize=16) +def _read_text_cached(path: Path) -> str: + return path.read_text(encoding="utf-8") + + def build_prompt(diff: str, overlay: str | None = None) -> str: - tpl = PROMPT_PATH.read_text(encoding="utf-8") + tpl = _read_text_cached(PROMPT_PATH) overlay_text = "" if overlay: overlay_path = OVERLAYS_DIR / f"{overlay}.md" if overlay_path.exists(): - overlay_text = overlay_path.read_text(encoding="utf-8") + overlay_text = _read_text_cached(overlay_path) # Escape ``` in the diff so it cannot terminate the outer fence early. # Use a zero-width joiner between backticks; reviewers see visually-identical # content while the fence stays intact. @@ -97,8 +107,35 @@ def extract_json(text: str) -> dict | None: continue start_range = [text.rfind("{", 0, idx)] else: - # Try every `{` from largest block first - start_range = [i for i, c in enumerate(text) if c == "{"] + # Last resort: one O(n) string-aware pass collecting every balanced + # {...} span via a stack, then bounded parse attempts in document + # order. (The old per-'{' rescan was O(n²) — minutes of CPU on + # brace-heavy non-JSON reviewer output.) + spans: list[tuple[int, int]] = [] + stack: list[int] = [] + in_str = esc = False + for i, c in enumerate(text): + if in_str: + if esc: + esc = False + elif c == "\\": + esc = True + elif c == '"': + in_str = False + continue + if c == '"': + in_str = True + elif c == "{": + stack.append(i) + elif c == "}" and stack: + spans.append((stack.pop(), i)) + spans.sort() + for start, end in spans[:50]: + try: + return json.loads(text[start : end + 1]) + except Exception: + continue + return None for start in start_range: if start < 0: continue @@ -155,6 +192,12 @@ def normalize_findings(raw: Any) -> list[dict]: return out +@functools.lru_cache(maxsize=32) +def _which_cached(name: str) -> str | None: + import shutil + return shutil.which(name) + + def _resolve_cmd(cmd: list[str]) -> list[str]: """On Windows, npm-installed shims are `.cmd` files that bash finds via PATH but Python's subprocess (with shell=False) does not. Use shutil.which to @@ -162,46 +205,76 @@ def _resolve_cmd(cmd: list[str]) -> list[str]: """ if not cmd: return cmd - import shutil - resolved = shutil.which(cmd[0]) + resolved = _which_cached(cmd[0]) if resolved: return [resolved] + cmd[1:] return cmd +def _kill_tree(proc: subprocess.Popen) -> None: + """Kill the process AND its children. npm .cmd shims spawn a node grandchild + that survives a plain proc.kill(), holds the captured pipes, and hangs the + run — the bug that got the gemini reviewer disabled.""" + try: + if os.name == "nt": + subprocess.run(["taskkill", "/T", "/F", "/PID", str(proc.pid)], + capture_output=True, check=False) + else: + os.killpg(os.getpgid(proc.pid), signal.SIGKILL) + except Exception: + try: + proc.kill() + except Exception: + pass + + async def run_subprocess(cmd: list[str], stdin_data: str, timeout: int, env: dict | None = None, cwd: str | None = None) -> tuple[int, str, str, float]: """Run a subprocess with stdin + timeout. Returns (rc, stdout, stderr, elapsed_sec). - Uses subprocess.run inside a thread for simplicity and Windows compatibility. - Resolves argv[0] via shutil.which so npm/.cmd shims work. + Runs in a thread for simplicity and Windows compatibility. Resolves argv[0] + via shutil.which so npm/.cmd shims work. On timeout the whole process TREE + is killed (POSIX: own session + killpg; Windows: taskkill /T /F). """ resolved_cmd = _resolve_cmd(cmd) def _runit() -> tuple[int, str, str, float]: t0 = time.monotonic() + popen_kwargs: dict = {} + if os.name != "nt": + popen_kwargs["start_new_session"] = True try: - result = subprocess.run( + proc = subprocess.Popen( resolved_cmd, - input=stdin_data.encode("utf-8"), - capture_output=True, - timeout=timeout, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, env=env, cwd=cwd, - check=False, shell=False, + **popen_kwargs, ) + except FileNotFoundError as e: + return 127, "", f"command not found: {e}", time.monotonic() - t0 + except Exception as e: + return 1, "", f"launch error: {type(e).__name__}: {e}", time.monotonic() - t0 + try: + out, err = proc.communicate(input=stdin_data.encode("utf-8"), timeout=timeout) return ( - result.returncode, - result.stdout.decode("utf-8", errors="replace"), - result.stderr.decode("utf-8", errors="replace"), + proc.returncode, + out.decode("utf-8", errors="replace"), + err.decode("utf-8", errors="replace"), time.monotonic() - t0, ) except subprocess.TimeoutExpired: - return 124, "", "timeout", time.monotonic() - t0 - except FileNotFoundError as e: - return 127, "", f"command not found: {e}", time.monotonic() - t0 + _kill_tree(proc) + try: + proc.communicate(timeout=5) + except Exception: + pass + return 124, "", "timeout (process tree killed)", time.monotonic() - t0 except Exception as e: + _kill_tree(proc) return 1, "", f"launch error: {type(e).__name__}: {e}", time.monotonic() - t0 return await asyncio.to_thread(_runit) @@ -209,8 +282,16 @@ def _runit() -> tuple[int, str, str, float]: def resolve_roster(cfg: dict, mode: str, names: list[str] | None, host: str, allow_free: bool = False, allow_logging: bool = False, - explicit_custom_only: set[str] | None = None) -> tuple[list[str], list[tuple[str, str]]]: - """Return (final_roster, drop_reasons).""" + explicit_custom_only: set[str] | None = None, + explicit: bool = False) -> tuple[list[str], list[tuple[str, str]]]: + """Return (final_roster, drop_reasons). + + explicit=True means the caller passed these exact names by hand + (dispatch/benchmark --roster): disabled and custom_only gates are waived + (explicit naming = intent) and host_rules `add` does not inject reviewers + the caller never asked for. host_rules `skip` and the tier/privacy gates + still apply — they guard external consequences, not reviewer health. + """ explicit_custom_only = explicit_custom_only or set() reviewers = cfg["reviewers"] profiles = cfg["profiles"] @@ -224,19 +305,24 @@ def resolve_roster(cfg: dict, mode: str, names: list[str] | None, host: str, base = list(profiles[cfg["defaults"]["profile"]]["members"]) # Host adaptation - base = [n for n in base if n not in set(host_rules.get("skip", []))] - for n in host_rules.get("add", []): - if n not in base: - base.append(n) - + skipped_by_host = set(host_rules.get("skip", [])) drops: list[tuple[str, str]] = [] + for n in base: + if n in skipped_by_host: + drops.append((n, f"host rule (running inside {host})")) + base = [n for n in base if n not in skipped_by_host] + if not explicit: + for n in host_rules.get("add", []): + if n not in base: + base.append(n) + final = [] for n in base: spec = reviewers.get(n) if not spec: drops.append((n, "not in registry")) continue - if spec.get("disabled"): + if spec.get("disabled") and not explicit: drops.append((n, "disabled in config")) continue if spec.get("tier") == "free" and not allow_free: @@ -245,7 +331,7 @@ def resolve_roster(cfg: dict, mode: str, names: list[str] | None, host: str, if spec.get("privacy") == "LOGS" and not allow_logging: drops.append((n, "logs prompts (use --allow-logging)")) continue - if spec.get("custom_only") and n not in explicit_custom_only: + if spec.get("custom_only") and not explicit and n not in explicit_custom_only: drops.append((n, "custom-only (name via --custom/--models)")) continue final.append(n) diff --git a/scripts/benchmark.py b/scripts/benchmark.py index 1b9cf18..dfb3d32 100644 --- a/scripts/benchmark.py +++ b/scripts/benchmark.py @@ -17,19 +17,18 @@ import argparse import asyncio -import hashlib import json import os import sys import time -from collections import defaultdict from pathlib import Path sys.path.insert(0, str(Path(__file__).resolve().parent)) from _common import ( load_config, build_prompt, extract_json, normalize_findings, - ARGUS_HOME, history_conn, + ARGUS_HOME, history_conn, resolve_roster, ) +from detect_host import detect as detect_host import adapters @@ -122,6 +121,8 @@ async def _dispatch(name: str, spec: dict, prompt: str, timeout: int) -> dict: if adapter is None: return {"findings": [], "latency_sec": 0.0, "primary_latency_sec": 0.0, "fallback_latency_sec": 0.0, "fallback_used": False, + "exit_code": 1, "primary_exit_code": 1, "primary_error": "", + "parse_error": False, "error": f"no adapter for {primary.get('route')}"} r = await adapter.send(prompt, primary, timeout) primary_latency = r.get("latency_sec", 0.0) @@ -140,10 +141,13 @@ async def _dispatch(name: str, spec: dict, prompt: str, timeout: int) -> dict: fallback_used = True r = r2 findings = [] + parse_error = False if r["exit_code"] == 0: parsed = extract_json(r["stdout"]) if isinstance(parsed, dict): findings = normalize_findings(parsed.get("findings", [])) + else: + parse_error = True total_latency = primary_latency + fallback_latency return { "findings": findings, @@ -154,6 +158,7 @@ async def _dispatch(name: str, spec: dict, prompt: str, timeout: int) -> dict: "exit_code": r["exit_code"], "primary_exit_code": primary_exit, "primary_error": primary_err, + "parse_error": parse_error, "error": (r.get("stderr") or "")[:200] if r["exit_code"] != 0 else None, } @@ -166,7 +171,7 @@ async def _bench_reviewer(name: str, spec: dict, fixtures: list[dict], wall_start = time.monotonic() for fx in fixtures: - prompt = build_prompt(fx["diff"]) + prompt = fx["prompt"] run_data = [] merged_keys: set[tuple[str, int]] = set() for idx in range(runs): @@ -174,7 +179,7 @@ async def _bench_reviewer(name: str, spec: dict, fixtures: list[dict], await _log_progress(f"{name:<16} WALL-CAP HIT at {int(time.monotonic()-wall_start)}s — skipping remaining runs") run_data.append({ "run_idx": idx, "n_findings": 0, "latency_sec": 0.0, - "exit_code": 137, "error": "wall-cap exceeded", + "exit_code": 137, "parse_error": False, "error": "wall-cap exceeded", "tp": 0, "fp": 0, "fn": len(fx["ground_truth"].get("issues", [])), "precision": 0.0, "recall": 0.0, "f1": 0.0, }) @@ -185,10 +190,11 @@ async def _bench_reviewer(name: str, spec: dict, fixtures: list[dict], d = await _dispatch(name, spec, prompt, timeout) except Exception as e: await _log_progress(f"{name:<16} {fx['name']:<18} run {idx+1}/{runs} EXCEPTION: {type(e).__name__}: {str(e)[:80]}") - d = {"findings": [], "latency_sec": 0.0, "exit_code": 1, "error": f"{type(e).__name__}: {e}"} - if d.get("exit_code", 1) != 0: - # A failed call is not "correctly found nothing" — zero-score it - # so broken reviewers can't earn F1=1.0 on clean-baseline. + d = {"findings": [], "latency_sec": 0.0, "exit_code": 1, + "parse_error": False, "error": f"{type(e).__name__}: {e}"} + if d.get("exit_code", 1) != 0 or d.get("parse_error"): + # A failed or unparseable call is not "correctly found nothing" — + # zero-score it so broken reviewers can't earn F1=1.0 on clean-baseline. scored = {"tp": 0, "fp": 0, "fn": len(fx["ground_truth"].get("issues", [])), "precision": 0.0, "recall": 0.0, "f1": 0.0} else: @@ -197,8 +203,9 @@ async def _bench_reviewer(name: str, spec: dict, fixtures: list[dict], "run_idx": idx, "n_findings": len(d["findings"]), "latency_sec": d["latency_sec"], - "exit_code": d["exit_code"], - "error": d["error"], + "exit_code": d.get("exit_code", 1), + "parse_error": d.get("parse_error", False), + "error": d.get("error"), **scored, }) for f in d["findings"]: @@ -401,26 +408,24 @@ async def _main_async(args) -> int: sys.stderr.write("no fixtures found. Seed fixtures/ before running benchmark.\n") return 1 + # Same policy engine as dispatch.py: --roster names are explicit (disabled/ + # custom_only waived, no host_rules add); profiles get full policy. + host, _ = detect_host() if args.roster: - roster = [r.strip() for r in args.roster.split(",") if r.strip()] - elif args.profile: - roster = list(cfg["profiles"][args.profile]["members"]) + names = [r.strip() for r in args.roster.split(",") if r.strip()] + roster, drops = resolve_roster( + cfg, "custom", names, host, + allow_free=args.allow_free, allow_logging=args.allow_logging, + explicit=True, + ) else: - roster = list(cfg["profiles"]["panel"]["members"]) - - # Filter out reviewers not in registry; drop disabled ones unless the user - # named them explicitly via --roster (explicit naming = intent to test). - explicit = bool(args.roster) - kept = [] - for n in roster: - if n not in cfg["reviewers"]: - print(f"skip {n}: not in registry", file=sys.stderr) - continue - if cfg["reviewers"][n].get("disabled") and not explicit: - print(f"skip {n}: disabled in config (name via --roster to force)", file=sys.stderr) - continue - kept.append(n) - roster = kept + profile = args.profile or "panel" + roster, drops = resolve_roster( + cfg, "profile", [profile], host, + allow_free=args.allow_free, allow_logging=args.allow_logging, + ) + for n, reason in drops: + print(f"skip {n}: {reason}", file=sys.stderr) total_calls = len(roster) * len(fixtures) * args.runs print(f"Benchmarking {len(roster)} reviewers × {len(fixtures)} fixtures × {args.runs} runs = {total_calls} calls", file=sys.stderr) @@ -487,7 +492,11 @@ async def _main_async(args) -> int: _PROGRESS_COUNTER["done"] = 0 _PROGRESS_START = time.monotonic() - ts = args.benchmark_ts or time.strftime("%Y%m%dT%H%M%S") + # UTC so benchmarks.ts and runs.ts (also UTC) share one --since timeline. + ts = args.benchmark_ts or time.strftime("%Y%m%dT%H%M%S", time.gmtime()) + # Prompts are reviewer-independent — build each fixture's once. + for fx in fixtures: + fx["prompt"] = build_prompt(fx["diff"]) sem = asyncio.Semaphore(max_parallel) tasks = [_bench_reviewer(n, cfg["reviewers"][n], fixtures, args.runs, timeout, sem, ts=ts, max_wall_sec=args.max_wall_sec) for n in roster] @@ -527,6 +536,8 @@ def main() -> int: ap.add_argument("--max-wall-sec", type=int, default=600, help="hard wall-time cap per reviewer (default 600s)") ap.add_argument("--yes-cost", action="store_true", help="bypass benchmark cost block + OR balance gate (env: ARGUS_YES_COST=1)") ap.add_argument("--skip-balance-check", action="store_true", help="skip the OpenRouter balance pre-flight") + ap.add_argument("--allow-free", action="store_true", help="include free-tier reviewers") + ap.add_argument("--allow-logging", action="store_true", help="include reviewers with privacy: LOGS") args = ap.parse_args() return asyncio.run(_main_async(args)) diff --git a/scripts/dispatch.py b/scripts/dispatch.py index eb1a141..0e7ca8a 100644 --- a/scripts/dispatch.py +++ b/scripts/dispatch.py @@ -94,13 +94,13 @@ async def _main_async(args) -> int: names = [r.strip() for r in args.roster.split(",") if r.strip()] host, _ = detect_host() - # Explicitly-named reviewers count as explicit_custom_only so custom_only - # entries stay usable; disabled/tier/privacy/host_rules policies still apply. + # --roster names are explicit: disabled/custom_only are waived and + # host_rules `add` is not applied; host `skip` and tier/privacy still hold. roster, drops = resolve_roster( cfg, "custom", names, host, allow_free=bool(args.allow_free or defaults.get("allow_free")), allow_logging=bool(args.allow_logging or defaults.get("allow_logging")), - explicit_custom_only=set(names), + explicit=True, ) for n, reason in drops: sys.stderr.write(f"roster: dropped {n} — {reason}\n") @@ -120,9 +120,8 @@ async def _main_async(args) -> int: async def _bounded(name: str) -> dict: async with sem: - spec = cfg["reviewers"].get(name) - if not spec: - return {"name": name, "error": "unknown reviewer"} + # resolve_roster guarantees roster ⊆ registry + spec = cfg["reviewers"][name] # Context-window pre-check ctx = int(spec.get("ctx", 0) or 0) @@ -155,6 +154,7 @@ async def _bounded(name: str) -> dict: summary = { "roster": roster, + "dropped": {n: reason for n, reason in drops}, "prompt_tokens_est": prompt_tokens, "reviewers": { r["name"]: { diff --git a/scripts/estimate_cost.py b/scripts/estimate_cost.py index d3bfd1e..e7a8796 100644 --- a/scripts/estimate_cost.py +++ b/scripts/estimate_cost.py @@ -12,6 +12,7 @@ import argparse import json +import os import sys from pathlib import Path @@ -43,7 +44,8 @@ def main() -> int: unknown = [n for n in roster if n not in cfg["reviewers"]] if unknown: sys.stderr.write( - f"ERROR: unknown reviewer(s): {', '.join(unknown)}. " + f"INVALID ROSTER (not a cost block — --yes-cost will not help): " + f"unknown reviewer(s): {', '.join(unknown)}. " f"Known: {', '.join(sorted(cfg['reviewers']))}\n" ) return 2 @@ -106,11 +108,10 @@ def main() -> int: or (cfg["reviewers"].get(name, {}).get("fallback", {}).get("client") == "openrouter") for name in roster ) - import os as _os - if uses_openrouter and _os.environ.get("OPENROUTER_API_KEY"): + if uses_openrouter and os.environ.get("OPENROUTER_API_KEY"): try: from or_balance import probe - info = probe(_os.environ["OPENROUTER_API_KEY"]) + info = probe(os.environ["OPENROUTER_API_KEY"]) available = info.get("available_usd") safety = float(d.get("or_balance_safety_factor", 2.0)) if available is not None and available < total * safety: From 3cc58fbcc26d2fbdfa0fca5471c77ad26d71f018 Mon Sep 17 00:00:00 2001 From: Jim Stratus Date: Wed, 10 Jun 2026 02:15:07 -0700 Subject: [PATCH 08/14] fix(bench): zero-score unparseable output; no-adapter KeyError; UTC ts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A reviewer exiting 0 with unparseable prose scored F1=1.0 on clean-baseline through the parse-failure path the round-1 gate missed — _dispatch now records parse_error, and both the scoring gate and aggregate_bench's _rescore_run treat it as a failed call. The no-adapter early return gains exit_code so run_data.append no longer KeyErrors (which made misconfigured reviewers vanish from the leaderboard as fatal stubs). benchmarks.ts is now UTC so --since shares one timeline with runs.ts; fixture prompts are built once instead of per reviewer. Confidence: high Scope-risk: narrow Not-tested: pre-existing benchmarks rows keep local-time ts (error bounded by UTC offset) --- scripts/aggregate_bench.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/aggregate_bench.py b/scripts/aggregate_bench.py index 23823ff..fb96361 100644 --- a/scripts/aggregate_bench.py +++ b/scripts/aggregate_bench.py @@ -26,10 +26,10 @@ def _rescore_run(run: dict) -> dict: """Recompute P/R/F1 from tp/fp/fn with the clean-baseline rule (repairs pre-fix data). Clean-baseline rule: tp==fp==fn==0 → P=R=F1=1.0 (reviewer correctly found nothing). - Failed calls (non-zero exit_code or an error, e.g. wall-cap stubs) are + Failed calls (non-zero exit_code, an error, or unparseable output) are zero-scored — tp==fp==fn==0 there means "never ran", not "found nothing". """ - if int(run.get("exit_code", 0) or 0) != 0 or run.get("error"): + if int(run.get("exit_code", 0) or 0) != 0 or run.get("error") or run.get("parse_error"): run["precision"], run["recall"], run["f1"] = 0.0, 0.0, 0.0 return run tp = int(run.get("tp", 0) or 0) From f560d7c339b9f52a2452bc81b8c543d1da530731 Mon Sep 17 00:00:00 2001 From: Jim Stratus Date: Wed, 10 Jun 2026 02:15:07 -0700 Subject: [PATCH 09/14] fix(common): kill process tree on timeout; O(n) extract_json fallback; cache config/template/which MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit run_subprocess now runs children in their own session and kills the whole tree on timeout (killpg on POSIX, taskkill /T /F on Windows) — the root cause behind the disabled gemini reviewer's .cmd-shim hang; verified by a test that confirms the orphaned grandchild dies. extract_json's last-resort fallback is one string-aware pass collecting balanced spans (was O(n²) — measured 19s on 24KB of brace garbage; now bounded at 50 parse attempts). load_config, the prompt template read, and shutil.which are lru_cached (~36ms YAML parse per adapter call removed; no caller mutates the cfg dict). Constraint: load_config returns a shared dict — treat as read-only Confidence: high Scope-risk: moderate Directive: gemini reviewer stays disabled until the tree-kill is verified on Windows Not-tested: taskkill path (Windows); span-capped fallback on >50 decoy spans before the real object --- tests/test_extract_json.py | 18 ++++++++ tests/test_run_subprocess.py | 82 ++++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+) create mode 100644 tests/test_run_subprocess.py diff --git a/tests/test_extract_json.py b/tests/test_extract_json.py index e5c3cfe..0185a52 100644 --- a/tests/test_extract_json.py +++ b/tests/test_extract_json.py @@ -74,6 +74,22 @@ def test_escaped_quote_inside_string_value(): assert "x}" in r["findings"][0]["description"] +def test_inner_object_recovered_when_outer_braces_invalid(): + """Last-resort span scan finds a nested valid object inside brace garbage.""" + text = 'junk { junk {"a": 1} junk }' + assert extract_json(text) == {"a": 1} + + +def test_brace_garbage_returns_none_fast(): + """O(n) span fallback: brace-heavy non-JSON must not take quadratic time.""" + import time + garbage = ("{ x " * 4000) + ("} y " * 4000) + t0 = time.monotonic() + r = extract_json(garbage) + assert time.monotonic() - t0 < 2.0 + assert r is None + + def test_normalize_findings_clamps_unknown_severity(): """Off-enum severities map to medium so no consumer silently drops them.""" raw = [ @@ -97,6 +113,8 @@ def test_normalize_findings_clamps_unknown_severity(): test_nested_objects, test_brace_inside_string_value, test_escaped_quote_inside_string_value, + test_inner_object_recovered_when_outer_braces_invalid, + test_brace_garbage_returns_none_fast, test_normalize_findings_clamps_unknown_severity, ] failed = 0 diff --git a/tests/test_run_subprocess.py b/tests/test_run_subprocess.py new file mode 100644 index 0000000..c1d972a --- /dev/null +++ b/tests/test_run_subprocess.py @@ -0,0 +1,82 @@ +"""Unit tests for scripts/_common.py::run_subprocess — timeout + tree-kill.""" +from __future__ import annotations + +import asyncio +import os +import shutil +import sys +import time +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "scripts")) + +from _common import run_subprocess # noqa: E402 + +HAS_BASH = shutil.which("bash") is not None + + +def test_success_passes_stdin_and_captures_stdout(): + if not HAS_BASH: + return + rc, out, err, dt = asyncio.run(run_subprocess(["bash", "-c", "cat"], "hello", 10)) + assert rc == 0 + assert out == "hello" + + +def test_timeout_returns_124_promptly(): + if not HAS_BASH: + return + t0 = time.monotonic() + rc, out, err, dt = asyncio.run(run_subprocess(["bash", "-c", "sleep 30"], "", 1)) + assert rc == 124 + assert "timeout" in err + assert time.monotonic() - t0 < 10 # killed, not waited out + + +def test_timeout_kills_grandchildren(): + """A shell that spawns a sleeping child must not leave the child running + after timeout — the .cmd-shim hang that got the gemini reviewer disabled.""" + if not HAS_BASH or os.name == "nt": + return + import tempfile + pidfile = tempfile.NamedTemporaryFile(suffix=".pid", delete=False).name + try: + rc, out, err, dt = asyncio.run( + run_subprocess(["bash", "-c", f"sleep 30 & echo $! > {pidfile}; wait"], "", 1) + ) + assert rc == 124 + child_pid = int(Path(pidfile).read_text().strip()) + time.sleep(0.5) # let init reap the reparented child + try: + os.kill(child_pid, 0) + alive = Path(f"/proc/{child_pid}/stat").read_text().split()[2] != "Z" + except (ProcessLookupError, FileNotFoundError): + alive = False + assert not alive, f"grandchild {child_pid} survived the tree kill" + finally: + os.unlink(pidfile) + + +def test_command_not_found_returns_127(): + rc, out, err, dt = asyncio.run(run_subprocess(["argus-no-such-binary-xyz"], "", 5)) + assert rc == 127 + + +if __name__ == "__main__": + import traceback + tests = [ + test_success_passes_stdin_and_captures_stdout, + test_timeout_returns_124_promptly, + test_timeout_kills_grandchildren, + test_command_not_found_returns_127, + ] + failed = 0 + for t in tests: + try: + t() + print(f"PASS {t.__name__}") + except Exception as e: + failed += 1 + print(f"FAIL {t.__name__}: {type(e).__name__}: {e}") + traceback.print_exc() + sys.exit(failed) From c7b940802bee7d6a20f031f2e311adf4ec7e123c Mon Sep 17 00:00:00 2001 From: Jim Stratus Date: Wed, 10 Jun 2026 02:15:07 -0700 Subject: [PATCH 10/14] chore: dead-code sweep; config-driven merge line tolerance; argv[0]-scoped host detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit merge.py: drop unused uuid import and the phantom cfg param on _record_history; replace the max(-key) double negation with min. benchmark dead imports removed in the roster commit. detect_host: NAME_HINTS flattened to a tuple, and the process walk matches argv[0]/argv[1] basenames instead of the full cmdline — a wrapping shell containing "--roster codex" no longer detects as a codex host. merge clustering tolerance is now defaults.merge_line_tolerance / --line-tolerance, matching its sibling knobs. Confidence: high Scope-risk: narrow --- config.yaml | 3 ++- scripts/detect_host.py | 23 +++++++++++------------ scripts/merge.py | 11 ++++++----- 3 files changed, 19 insertions(+), 18 deletions(-) diff --git a/config.yaml b/config.yaml index 9d79643..7bf76fb 100644 --- a/config.yaml +++ b/config.yaml @@ -23,6 +23,7 @@ defaults: allow_logging: false # require --allow-logging to include privacy: LOGS confidence_threshold: 80 # drop findings with effective confidence < this corroboration_boost: 15 # confidence bonus when 2+ reviewers agree on same file:line + merge_line_tolerance: 3 # cluster findings within +/- this many lines (merge.py) ctx_safety_ratio: 0.70 # skip reviewer if prompt_tokens > ctx * this reviewer_timeout_sec: 360 # bumped from 180 on 2026-05-06 after a 4-doc # ~80K-token bundle had 5/7 reviewers timeout. @@ -141,7 +142,7 @@ reviewers: ctx: 2000000 tier: paid privacy: ok - disabled: true # on hold — use gemini-or instead + disabled: true # on hold — run_subprocess now tree-kills on timeout; re-test on Windows before re-enabling (use gemini-or meanwhile) cost_per_m: null primary: { route: gemini-cli } fallback: { route: aichat, client: openrouter, model: "google/gemini-3.1-pro-preview" } diff --git a/scripts/detect_host.py b/scripts/detect_host.py index 2a3798b..256cb23 100644 --- a/scripts/detect_host.py +++ b/scripts/detect_host.py @@ -11,12 +11,7 @@ import sys -NAME_HINTS = { - "claude": ["claude"], - "codex": ["codex"], - "gemini": ["gemini"], - "opencode": ["opencode"], -} +HOSTS = ("claude", "codex", "gemini", "opencode") def detect() -> tuple[str, list[str]]: @@ -51,12 +46,16 @@ def detect() -> tuple[str, list[str]]: p = psutil.Process(os.getppid()) for _ in range(8): name = (p.name() or "").lower() - cmdline = " ".join(p.cmdline()).lower() if p.cmdline() else "" - for host, hints in NAME_HINTS.items(): - for h in hints: - if h in name or h in cmdline: - signals.append(f"proc:{name or cmdline}") - return host, signals + # Match the executable and its script (argv[0]/argv[1] basenames, + # covering `node /path/gemini.js`), never the full cmdline — + # arguments like `--roster codex` in a wrapping shell must not + # read as a codex host. + cmdline = p.cmdline() or [] + heads = " ".join(os.path.basename(a).lower() for a in cmdline[:2]) + for host in HOSTS: + if host in name or host in heads: + signals.append(f"proc:{name or heads}") + return host, signals parent = p.parent() if parent is None: break diff --git a/scripts/merge.py b/scripts/merge.py index 92878e2..abe56b4 100644 --- a/scripts/merge.py +++ b/scripts/merge.py @@ -10,7 +10,6 @@ import hashlib import json import sys -import uuid from collections import defaultdict from datetime import datetime, timezone from pathlib import Path @@ -28,7 +27,7 @@ def _emit_cluster(file_path: str, cluster: list[dict], threshold: int, boost: in effective = min(100, max_conf + boost) if len(reviewers) >= 2 else max_conf if effective < threshold: return None - worst = max(cluster, key=lambda it: -SEVERITY_RANK.get(it.get("severity", "medium"), 3)) + worst = min(cluster, key=lambda it: SEVERITY_RANK.get(it.get("severity", "medium"), 3)) lines = sorted(int(it.get("line", 0) or 0) for it in cluster) anchor_line = lines[len(lines) // 2] # median # Aggregate descriptions (cluster may merge independently-found issues) @@ -204,7 +203,7 @@ def _to_gsd(result: dict, run_dir: Path) -> str: return "\n".join(lines) -def _record_history(result: dict, run_dir: Path, cfg: dict) -> None: +def _record_history(result: dict, run_dir: Path) -> None: """Append the run and its findings to history.db.""" conn = None try: @@ -268,12 +267,14 @@ def main() -> int: ap.add_argument("--run-dir", required=True) ap.add_argument("--threshold", type=int, default=None) ap.add_argument("--boost", type=int, default=None) + ap.add_argument("--line-tolerance", type=int, default=None) ap.add_argument("--output", default="md", choices=["md", "json", "gsd"]) args = ap.parse_args() cfg = load_config() threshold = args.threshold if args.threshold is not None else cfg["defaults"]["confidence_threshold"] boost = args.boost if args.boost is not None else cfg["defaults"]["corroboration_boost"] + tolerance = args.line_tolerance if args.line_tolerance is not None else int(cfg["defaults"].get("merge_line_tolerance", 3)) run_dir = Path(args.run_dir) reviews_dir = run_dir / "reviews" @@ -282,7 +283,7 @@ def main() -> int: return 1 reviewer_results = _load_reviewer_results(reviews_dir) - result = _merge(reviewer_results, threshold, boost) + result = _merge(reviewer_results, threshold, boost, line_tolerance=tolerance) # Always write metrics.json + merged.md; honor --output for stdout (run_dir / "metrics.json").write_text( @@ -303,7 +304,7 @@ def main() -> int: else: print(md) - _record_history(result, run_dir, cfg) + _record_history(result, run_dir) return 0 From 9006faf8671f4f62c06f73fb266d9dad1d0aced2 Mon Sep 17 00:00:00 2001 From: Jim Stratus Date: Wed, 10 Jun 2026 02:15:07 -0700 Subject: [PATCH 11/14] docs: align README/DEVELOPMENT/CLAUDE/SKILL with fixed behavior MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ARG_MAX gotcha marked fixed (all adapters pipe stdin); tree-kill gotcha updated to point at the run_subprocess fix with the Windows re-test caveat; CLAUDE.md no longer claims exact-file:line dedupe or lists ±3 bucketing as a v2 idea (it exists — the doc was instructing future agents to re-implement working code); host-CLI awareness section documents the explicit-roster semantics; SKILL.md exit-code note distinguishes cost BLOCK from invalid roster. Confidence: high Scope-risk: narrow --- CLAUDE.md | 24 +++++++++++++----------- DEVELOPMENT.md | 4 ++-- README.md | 18 +++++++++--------- SKILL.md | 6 ++++-- 4 files changed, 28 insertions(+), 24 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index c1a9c74..4172941 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -119,15 +119,21 @@ py -3.12 "$ARGUS_HOME/scripts/benchmark.py" --runs 3 --profile panel ### Host-CLI awareness `detect_host.py` inspects env + parent process tree. Returns one of -`claude | codex | gemini | opencode | unknown`. `host_rules` in config.yaml -removes the matching reviewer from the roster and (for non-claude hosts) adds -`claude` in. **When running inside Claude Code, the `claude` reviewer is -auto-skipped** — Claude's nested-invocation policy would block it anyway. +`claude | codex | gemini | opencode | unknown`. `resolve_roster` (called by +dispatch.py and benchmark.py) applies `host_rules` from config.yaml: the +matching host's reviewer is always skipped (**inside Claude Code the `claude` +reviewer is auto-skipped** — the nested-invocation policy would block it). +`--roster` names are explicit: disabled/custom_only gates are waived and +host_rules `add` does not inject extra reviewers; profile-based rosters get +the full policy (disabled, tier, privacy, custom_only, host add). ### Confidence filter + corroboration - Drop findings with effective confidence < 80. -- Bonus: +15 points when ≥2 reviewers flag the same `file:line` (capped at 100). -- Merge dedupes by exact `file:line`. Nearby-line corroboration is NOT done today — reviewers often disagree on line numbers by ±3 within the same diff hunk; this under-counts corroboration. Candidate v2 fix: bucket by file + ±3-line range. +- Bonus: +15 points when ≥2 reviewers flag the same issue (capped at 100). +- Merge clusters findings by file + anchor-based ±3-line proximity + (`defaults.merge_line_tolerance`, `--line-tolerance`); the reported line is + the cluster median. Do NOT re-implement "±3 bucketing" — it already exists + in `merge.py:_merge`. ### Cost gates - Review: warn $0.50, hard block $2.00 — overridable with `--yes-cost` @@ -163,9 +169,6 @@ auto-skipped** — Claude's nested-invocation policy would block it anyway. slow expensive frontier when F1 is comparable. ## v2 ideas (captured for later) -- **Nearby-line corroboration in merge.** Currently bucket by exact - `file:line`; reviewers often differ by ±3 within the same hunk. Bucket - by file + ±3-line range to increase corroboration accuracy. - **User-action capture.** After merged.md is shown, collect accepted/rejected per finding into history.db. Use this for usage-driven preference weights. - **`--refresh` command.** Query OpenRouter `/models` and diff against pinned @@ -180,9 +183,8 @@ auto-skipped** — Claude's nested-invocation policy would block it anyway. | MiniMax-M2.7 direct → works (slow, 20s for fixture) | — | — | | Kimi `KIMI_API_KEY` is consumer-only | forced OR primary (`moonshotai/kimi-k2.5`) | need Moonshot Platform dev key for k2.6-preview | | Nous Hermes direct → `NOUSRESEARCH_API_KEY` not in env | OR fallback works | set env var if wanted | -| Gemini CLI slow (60-70s/call) | works, use sparingly for quick reviews | no fix — CLI startup time | +| Gemini CLI disabled (timeout hang) | run_subprocess now tree-kills on timeout | re-test on Windows, then re-enable in config | | OpenCode CLI slow (42+s/call) | works, barely fits 45s verify timeout | set longer timeout in dispatch (already 180s) | -| Merge doesn't apply line-tolerance to corroboration | under-counts corroborated findings | v2: ±3-line bucketing | | Fixtures only cover 4 scenarios | leaderboard is noisy at N=4 | author more fixtures (auth bypass, XSS, null-deref, async races, TOCTOU, etc.) | ## Reviewer recommendations (initial, pre-benchmark) diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index f2eaa03..920ad7d 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -185,7 +185,7 @@ python scripts/estimate_cost.py --roster "glm-5.1,minimax-m2.7,gemini-or,codex" ## Known Gotchas -- **Windows .cmd shim tree-kill**: Node CLIs (gemini, copilot) don't tree-kill children on subprocess timeout. Use `gemini-or` instead. +- **Windows .cmd shim tree-kill**: fixed — `run_subprocess` kills the whole process tree on timeout (killpg on POSIX, `taskkill /T /F` on Windows). gemini-direct stays disabled until re-tested on Windows; `gemini-or` remains the default route. - **OpenRouter reasoning providers**: `z-ai/glm-5.1` and `minimax/minimax-m2.7` may route to providers returning `{content: null}`. Mitigation: aichat patch applies reasoning-exclude + provider-ignore. -- **Argv length on Windows (~32KB)**: gemini-cli and copilot-cli embed prompts as CLI args. Diffs >30KB fail. Use `--files` to scope. +- **Argv length on Windows (~32KB)**: fixed — all CLI adapters pipe the prompt via stdin; no adapter embeds it in argv. - **Full-codebase audit prompt mismatch**: Default prompt optimized for PR review. Use `--overlay audit` for empty-tree→HEAD diffs. diff --git a/README.md b/README.md index e325c4e..4b9826a 100644 --- a/README.md +++ b/README.md @@ -479,13 +479,13 @@ argus/ ``` ┌──────────────────────────────────────────────────────────────────────────┐ -│ Windows .cmd shim tree-kill problem │ +│ Windows .cmd shim tree-kill problem (FIXED — pending Windows re-test) │ │ ───────────────────────────────────── │ -│ Node CLIs (gemini, copilot) don't tree-kill children on subprocess │ -│ timeout. Causes zombie node.exe processes. │ +│ Node CLIs spawn a node child that used to survive subprocess timeout. │ +│ run_subprocess now kills the whole process tree (killpg / taskkill /T). │ │ │ -│ Mitigation: gemini-direct disabled by default; use gemini-or instead │ -│ (same family via OpenRouter). │ +│ gemini-direct stays disabled until re-tested on Windows; gemini-or │ +│ (same family via OpenRouter) remains the default route. │ ├──────────────────────────────────────────────────────────────────────────┤ │ OpenRouter reasoning-provider trap │ │ ────────────────────────────────── │ @@ -495,12 +495,12 @@ argus/ │ Mitigation: aichat patch applies reasoning-exclude + provider-ignore. │ │ config.yaml: aichat_clients.openrouter.patch.chat_completions │ ├──────────────────────────────────────────────────────────────────────────┤ -│ Argv length on Windows (~32KB) │ +│ Argv length on Windows (~32KB) — FIXED │ │ ────────────────────────────── │ -│ gemini-cli and copilot-cli embed the prompt as a command-line arg. │ -│ Diffs > 30KB fail. │ +│ All CLI adapters now pipe the prompt via stdin; no adapter embeds it │ +│ as a command-line arg anymore, so large diffs no longer hit ARG_MAX. │ │ │ -│ Mitigation: use --files to scope, or rely on aichat adapters (stdin). │ +│ (Historical: gemini-cli fixed 2026-05-06, copilot-cli fixed 2026-06.) │ ├──────────────────────────────────────────────────────────────────────────┤ │ Full-codebase audit prompt mismatch │ │ ─────────────────────────────────── │ diff --git a/SKILL.md b/SKILL.md index 3b9015a..e491e3f 100644 --- a/SKILL.md +++ b/SKILL.md @@ -97,8 +97,10 @@ If diff is empty after filtering, abort. python "$ARGUS_HOME/scripts/estimate_cost.py" --roster "$ROSTER" --diff "$RUN_DIR/diff.patch" ``` -Exit codes: 0 OK, 1 WARN (≥ warn), 2 BLOCK (≥ block). -On BLOCK without `--yes-cost`: stop. Report estimate to user. +Exit codes: 0 OK, 1 WARN (≥ warn threshold or low OpenRouter balance), +2 BLOCK (≥ block threshold) **or invalid roster** — check stderr to tell them +apart; `--yes-cost` downgrades a cost BLOCK to WARN but cannot fix an invalid +roster. On BLOCK without `--yes-cost`: stop. Report estimate to user. If `--dry-run`: stop here and print roster + estimate. Do not dispatch. From a36304b4dd4b0a9104dce74d65883ae81631e363 Mon Sep 17 00:00:00 2001 From: Jim Stratus Date: Wed, 10 Jun 2026 06:40:36 -0700 Subject: [PATCH 12/14] =?UTF-8?q?fix(review):=20address=20Copilot=20PR=20c?= =?UTF-8?q?omments=20=E2=80=94=20ts=20suffix=20boundary,=20exception=20stu?= =?UTF-8?q?b=20artifact?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stats.py --since now truncates both comparison sides to the common YYYYMMDDTHHMMSS prefix; a timezone suffix on either the row or the cutoff no longer skews exact-equality boundaries. dispatch.py writes reviews/.json for gather-level exception stubs too, preserving the every-roster-entry-produces-an-artifact contract merge.py depends on. Confidence: high Scope-risk: narrow --- scripts/dispatch.py | 13 +++++++++---- scripts/stats.py | 10 ++++++---- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/scripts/dispatch.py b/scripts/dispatch.py index 0e7ca8a..a10a45c 100644 --- a/scripts/dispatch.py +++ b/scripts/dispatch.py @@ -147,10 +147,15 @@ async def _bounded(name: str) -> dict: results = [] for n, r in zip(roster, gathered): if isinstance(r, BaseException): - results.append({"name": n, "findings": [], "exit_code": 1, - "error": f"{type(r).__name__}: {r}"}) - else: - results.append(r) + r = {"name": n, "findings": [], "exit_code": 1, + "error": f"{type(r).__name__}: {r}"} + # Keep the contract that every roster entry produces a + # reviews/.json — merge.py only reads those files. + try: + (reviews_dir / f"{n}.json").write_text(json.dumps(r, indent=2), encoding="utf-8") + except Exception: + pass + results.append(r) summary = { "roster": roster, diff --git a/scripts/stats.py b/scripts/stats.py index 54f4d41..4eec45e 100644 --- a/scripts/stats.py +++ b/scripts/stats.py @@ -32,10 +32,12 @@ def main() -> int: params: list = [] if args.since: # runs.ts is dashed ISO-8601 ('2026-06-10T12:00:00+00:00') while - # benchmarks.ts is compact ('20260610T120000'); strip '-' and ':' from - # both sides so the lexical compare is valid for either format. - where = "WHERE REPLACE(REPLACE(ts, '-', ''), ':', '') >= ?" - params = [args.since.replace("-", "").replace(":", "")] + # benchmarks.ts is compact ('20260610T120000'); strip '-' and ':' and + # truncate both sides to the common YYYYMMDDTHHMMSS prefix so neither + # a timezone suffix on the row nor one on the cutoff skews the + # lexical compare at exact-equality boundaries. + where = "WHERE SUBSTR(REPLACE(REPLACE(ts, '-', ''), ':', ''), 1, 15) >= ?" + params = [args.since.replace("-", "").replace(":", "")[:15]] # Per-reviewer aggregates from reviewer_runs cur.execute(f""" From cd30312bb89a77ee4562f0e9a9a9a3d942167219 Mon Sep 17 00:00:00 2001 From: Jim Stratus Date: Wed, 10 Jun 2026 06:54:36 -0700 Subject: [PATCH 13/14] docs: accuracy pass + diagrams across all five documents Fixes remaining stale claims: DEVELOPMENT.md and SKILL.md still described exact-file:line dedupe (merge clusters by anchor-based tolerance); the roster table's gemini/copilot/claude notes predated the stdin and tree-kill fixes; --timeout documented the old 180s default; the cost-flow chart drew OR-balance as a block in review mode (it warns); SKILL.md roster resolution was missing the disabled filter and the explicit-roster semantics; CLAUDE.md layout comments showed prompt-as-arg invocations. Adds: subagent-per-reviewer sequence diagram and leaderboard F1 bar chart (README); roster-policy matrix, subprocess-layer notes, benchmark-scoring flowchart, and history.db ER diagram (DEVELOPMENT.md); pytest/CI coverage in DEVELOPMENT.md, CONTRIBUTING.md, and the layout trees; estimate_cost exit-code reference. All 7 mermaid diagrams validated against the renderer. Confidence: high Scope-risk: narrow --- CLAUDE.md | 11 ++-- CONTRIBUTING.md | 9 +++ DEVELOPMENT.md | 148 +++++++++++++++++++++++++++++++++++++++++++++--- README.md | 85 ++++++++++++++++++++++----- SKILL.md | 13 ++++- 5 files changed, 239 insertions(+), 27 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 4172941..be5811a 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -37,11 +37,11 @@ argus/ │ └── adapters/ │ ├── __init__.py # ROUTE_ADAPTERS registry │ ├── aichat.py # universal aichat wrapper (all 8 provider routes) -│ ├── gemini_cli.py # gemini --yolo -p "" +│ ├── gemini_cli.py # gemini --yolo -p "" (prompt via stdin) │ ├── codex_cli.py # codex exec --skip-git-repo-check - (stdin) │ ├── claude_cli.py # claude -p --output-format text --bare (stdin, nested-from-claude blocked) │ ├── opencode_cli.py # opencode run - (stdin) -│ └── copilot_cli.py # copilot -p "" --model X --allow-all-tools +│ └── copilot_cli.py # copilot -p "" --model X (prompt via stdin) ├── fixtures/ # benchmark inputs (diff.patch + ground-truth.json pairs) │ ├── sql-injection/ │ ├── race-refund/ @@ -143,7 +143,10 @@ the full policy (disabled, tier, privacy, custom_only, host add). ### JSON extraction `_common.extract_json` handles: raw JSON, `` reasoning blocks (Qwen3 / DeepSeek-R1 style), fenced code blocks, and embedded objects by -"findings" or "ok" key. Every aichat reviewer we tested returns usable JSON. +"findings" or "ok" key. The balanced-brace scanners track string/escape +state (braces inside descriptions are safe) and the last-resort fallback is +a single O(n) pass capped at 50 parse attempts. Every aichat reviewer we +tested returns usable JSON. ## Benchmark run protocol (learned from first full bench) @@ -184,7 +187,7 @@ the full policy (disabled, tier, privacy, custom_only, host add). | Kimi `KIMI_API_KEY` is consumer-only | forced OR primary (`moonshotai/kimi-k2.5`) | need Moonshot Platform dev key for k2.6-preview | | Nous Hermes direct → `NOUSRESEARCH_API_KEY` not in env | OR fallback works | set env var if wanted | | Gemini CLI disabled (timeout hang) | run_subprocess now tree-kills on timeout | re-test on Windows, then re-enable in config | -| OpenCode CLI slow (42+s/call) | works, barely fits 45s verify timeout | set longer timeout in dispatch (already 180s) | +| OpenCode CLI slow (42+s/call) | works, barely fits 45s verify timeout | dispatch timeout already 360s | | Fixtures only cover 4 scenarios | leaderboard is noisy at N=4 | author more fixtures (auth bypass, XSS, null-deref, async races, TOCTOU, etc.) | ## Reviewer recommendations (initial, pre-benchmark) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f2e3e4d..f462ea1 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -24,11 +24,20 @@ New fixtures are valuable — they sharpen the benchmark signal. - Include `scripts/verify.py --json --roster ` output for reachability issues - Report `aichat --version`, Python version, OS +## Tests + +- `python -m pytest tests/ -q` must pass — CI runs it on every push/PR. + Tests need no network or API keys. +- Bug fixes in `_common.py` (extract_json, run_subprocess, merge scoring) + should come with a regression test alongside the existing ones in `tests/`. + ## Code style - Python 3.12+. Type hints where they clarify; not exhaustive. - No new dependencies without justification. pyyaml + psutil is the current bar. - Shell scripts must work on both Git Bash (Windows) and POSIX. +- Roster policy lives only in `_common.resolve_roster` — don't add inline + roster filters to individual scripts. ## Don't break diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 920ad7d..73d71ab 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -75,6 +75,8 @@ argus/ │ ├── opencode_cli.py │ └── copilot_cli.py ├── fixtures/ # Benchmark inputs (diff.patch + ground-truth.json) +├── tests/ # Unit tests (pytest; run in CI) +├── .github/workflows/ci.yml # GitHub Actions — pytest on push/PR ├── runs/ # Per-invocation artifacts (gitignored) ├── benchmarks/ # Leaderboard outputs (gitignored) └── history.db # SQLite — runs, findings, benchmarks (gitignored) @@ -141,31 +143,157 @@ python scripts/stats.py ## Architecture Notes +### Roster Policy — single source of truth + +All roster policy lives in `_common.resolve_roster` — dispatch.py and +benchmark.py both call it; **never add inline roster filters elsewhere**. +Semantics: + +| | explicit `--roster` names | profile-based roster | +|---|---|---| +| `disabled: true` | kept (explicit naming = intent) | dropped | +| `custom_only: true` | kept | dropped | +| `tier: free` | needs `--allow-free` | needs `--allow-free` | +| `privacy: LOGS` | needs `--allow-logging` | needs `--allow-logging` | +| host_rules `skip` | applied | applied | +| host_rules `add` | **not** applied | applied | +| not in registry | dropped | dropped | + +Drops are written to `dispatch_summary.json` under `"dropped"` and noted on +stderr. `estimate_cost.py` rejects unknown reviewer names outright (exit 2, +labeled `INVALID ROSTER` so it can't be mistaken for a cost block). + ### Host-CLI Awareness -`detect_host.py` inspects env + parent process tree. Returns `claude | codex | gemini | opencode | unknown`. The matching reviewer is removed from the roster; non-claude hosts auto-add `claude`. +`detect_host.py` inspects env markers + the parent process tree. Returns +`claude | codex | gemini | opencode | unknown`. Env markers are specific +variables (e.g. `CLAUDECODE=1`, `CODEX_SANDBOX`) — credential-shaped vars +like `CODEX_API_KEY` don't trigger detection. The process walk (psutil, up +to 8 levels) matches argv[0]/argv[1] basenames only, never full command +lines, so a wrapping shell containing `--roster codex` can't misdetect. ### Confidence Filter + Corroboration -- Findings with effective confidence < 80 are dropped -- +15 confidence bonus (cap 100) when >=2 reviewers flag the same file:line -- Merge dedupes by exact file:line +- Findings with effective confidence < 80 are dropped (`defaults.confidence_threshold`) +- +15 confidence bonus, cap 100, when >=2 reviewers agree (`defaults.corroboration_boost`) +- Merge clusters findings per file by anchor-based line proximity within + `defaults.merge_line_tolerance` (default ±3, `--line-tolerance` to override); + the cluster reports its median line, worst severity, and concatenated + descriptions. See README "Merge logic" for the dual-tolerance walk. ### Cost Gates | Mode | Warn | Hard block | Override | |------|------|-----------|----------| | review | $0.50 | $2.00 | `--yes-cost` | -| benchmark | $10 | $30 | `--yes-cost` | -| OR balance | (auto) | `available < safety × estimate` | `--skip-balance-check` | +| benchmark | $10 | $30 | `--yes-cost` / `ARGUS_YES_COST=1` | +| OR balance | warn in review mode | blocks in benchmark mode | `--skip-balance-check` | + +`estimate_cost.py` exit codes: `0` OK, `1` warn, `2` block or invalid roster +(check stderr). Paid-CLI reviewers (`cost_per_m: null`) count as $0. + +### Subprocess Layer + +`_common.run_subprocess` resolves argv[0] via a cached `shutil.which`, pipes +the prompt on **stdin** (never argv — Windows ARG_MAX), and on timeout kills +the entire process tree: children run in their own session, `os.killpg` on +POSIX, `taskkill /T /F` on Windows. Adapters return a uniform dict: +`{route, cmd, exit_code, stdout, stderr, latency_sec}`. + +`load_config()` is `lru_cache`d and returns a **shared dict — treat it as +read-only**. The prompt template read is cached too. ### JSON Extraction -`_common.extract_json` handles: raw JSON, `` reasoning blocks (Qwen3/DeepSeek-R1), fenced code blocks, and embedded objects by "findings" or "ok" key. +`_common.extract_json` tries, in order: raw parse → strip `` blocks +(Qwen3/DeepSeek-R1) → fenced code blocks → balanced object containing +`"findings"`/`"ok"` → a single O(n) string-aware pass over all balanced +`{...}` spans (max 50 parse attempts). The scanner tracks string/escape +state, so braces inside description values don't truncate the object. +`normalize_findings` clamps off-enum severities to `medium`. + +### Benchmark Scoring + +```mermaid +flowchart TB + A["dispatch reviewer x fixture x run"] --> B{"exit_code == 0?"} + B -->|no| Z["zero-score
P=R=F1=0, fn=len(issues)"] + B -->|yes| C{"JSON parsed?"} + C -->|"no (parse_error)"| Z + C -->|yes| D{"fixture has issues?"} + D -->|"no (clean baseline)"| E{"findings empty?"} + E -->|yes| P["perfect
P=R=F1=1.0"] + E -->|no| Z2["all false positives
P=R=F1=0"] + D -->|yes| F["match by file +
line within tolerance"] + F --> G["tp/fp/fn -> P, R, F1"] + + style Z fill:#ffb6c1,stroke:#c71585 + style Z2 fill:#ffb6c1,stroke:#c71585 + style P fill:#98fb98,stroke:#2e8b57 +``` + +The clean-baseline `1.0` applies **only to successful, parseable runs** — +a reviewer whose call failed or returned prose is zero-scored everywhere, +so broken reviewers can't rank above working ones. `aggregate_bench.py` +applies the same rule when re-scoring per-reviewer JSONs from +parallel-shell runs. Benchmark timestamps are UTC. + +### history.db Schema + +```mermaid +erDiagram + runs ||--o{ reviewer_runs : "run_id" + runs ||--o{ findings : "run_id" + runs { + TEXT run_id PK + TEXT ts "UTC ISO-8601" + TEXT roster + TEXT diff_sha256 + REAL latency_sec + } + reviewer_runs { + TEXT run_id PK,FK + TEXT reviewer PK + TEXT route + INTEGER exit_code + INTEGER fallback_used + REAL latency_sec + INTEGER n_findings + TEXT error + } + findings { + TEXT run_id PK,FK + INTEGER idx PK + TEXT file + INTEGER line + TEXT severity + INTEGER confidence + INTEGER n_reviewers + TEXT reviewers + } + benchmarks { + TEXT ts PK "UTC compact YYYYMMDDTHHMMSS" + TEXT reviewer PK + TEXT fixture PK + INTEGER run_idx PK + REAL precision + REAL recall + REAL f1 + REAL latency_sec + TEXT error + } +``` + +`benchmarks` is keyed independently (no FK to `runs`) — review runs and +benchmark runs are separate timelines. `stats.py --since` normalizes both +timestamp formats to a common `YYYYMMDDTHHMMSS` prefix before comparing. ## Testing ```bash +# Unit tests (no network, no API keys; also run by CI on every push/PR) +python -m pytest tests/ -q + # Verify all routes are reachable python scripts/verify.py --all @@ -177,6 +305,12 @@ python scripts/estimate_cost.py --roster "glm-5.1,minimax-m2.7,gemini-or,codex" --diff <(git diff HEAD) ``` +Unit coverage: `extract_json` edge cases (think-blocks, fences, +braces-in-strings, O(n) garbage handling), merge clustering/corroboration, +benchmark scoring, and `run_subprocess` timeout + process-tree kill. +Always dry-run a benchmark (`--runs 1 --fixtures `) with a new roster +before a full run — it catches provider-config bugs in ~30s instead of 40min. + ## Code Style - Python 3.12+. Type hints where they clarify; not exhaustive. diff --git a/README.md b/README.md index 4b9826a..982df4d 100644 --- a/README.md +++ b/README.md @@ -50,6 +50,30 @@ Argus has two dispatch modes; the calling agent picks one per run: `dispatch.py --roster a,b,c,d` once. Argus's internal `defaults.max_parallel: 4` (config.yaml) still applies. +```mermaid +sequenceDiagram + participant Agent as calling agent + participant S1 as subagent 1..4 (wave) + participant D as dispatch.py + participant R as reviewer CLI / API + participant M as merge.py + Agent->>Agent: resolve roster + estimate cost + par one subagent per reviewer (max 4) + Agent->>S1: dispatch reviewer N + S1->>D: dispatch.py --roster NAME --run-dir RUN_DIR + D->>D: resolve_roster (policy + host rules) + D->>R: prompt via stdin + R-->>D: JSON findings + D->>D: write reviews/NAME.json + D-->>S1: dispatch_summary.json + end + Agent->>M: merge.py --run-dir RUN_DIR (exactly once) + M-->>Agent: merged.md + metrics.json + history.db row +``` + +Every roster entry produces a `reviews/.json` — timeouts, crashes, and +parse failures included — so `merge.py` always sees the full picture. + See [`SKILL.md` step 6](SKILL.md) for the canonical execution recipe. ## Why @@ -70,9 +94,9 @@ flowchart LR B --> C["roster resolution"] C --> D{"host detection"} D -->|claude| E1["skip claude"] - D -->|codex| E2["skip codex, add claude"] - D -->|gemini| E3["skip gemini, add claude"] - D -->|opencode| E4["skip opencode, add claude"] + D -->|codex| E2["skip codex (+claude for profiles)"] + D -->|gemini| E3["skip gemini (+claude for profiles)"] + D -->|opencode| E4["skip opencode (+claude for profiles)"] D -->|unknown| E5["no change"] E1 --> F E2 --> F @@ -109,12 +133,12 @@ flowchart LR | `grok-4.20` | aichat → OR `x-ai/grok-4.20` | 2M ctx, pricey | | `deepseek-v3.2` | aichat → OR `deepseek/deepseek-v3.2` | cheap + fast | | `gemini-or` | aichat → OR `google/gemini-2.5-flash` | 2s/call, best value | -| `gemini` | `gemini` CLI (paid sub) | disabled on Windows (.cmd tree-kill issue) | +| `gemini` | `gemini` CLI (paid sub) | disabled pending Windows re-test of the tree-kill fix | | `codex` | `codex` CLI (paid sub) | GPT-5.x, thorough, slow | -| `claude` | `claude` CLI (paid sub) | auto-added when host ≠ claude | +| `claude` | `claude` CLI (paid sub) | auto-added to **profile** rosters when host ≠ claude | | `opencode` | `opencode` CLI (paid sub) | top performer, slow cold start | | `hermes-4.3` | aichat → Nous (fallback OR) | custom-only | -| `copilot-gpt5` | GitHub `copilot` CLI | **disabled** — harness returns prose not JSON | +| `copilot-gpt5` | GitHub `copilot` CLI | **disabled** — returned prose under the old prompt-as-arg invocation; re-test with the new stdin invocation | ## Profiles @@ -148,7 +172,21 @@ flowchart LR | 10 | `hermes-4.3` | 0.551 | 0.646 | 0.653 | 13 | | 11 | `kimi-k2.6` | 0.505 | 0.729 | 0.575 | 83 | -Your numbers will differ. Run `--benchmark` on your fixtures. +```mermaid +xychart-beta + title "F1 by reviewer (4 fixtures x 3 runs)" + x-axis ["opencode", "qwen-3.6", "glm-5.1", "gemini-or", "minimax", "mimo-v2", "codex", "deepseek", "grok-4.20", "hermes", "kimi-k2.6"] + y-axis "F1" 0 --> 1 + bar [0.811, 0.761, 0.697, 0.681, 0.674, 0.652, 0.581, 0.572, 0.557, 0.551, 0.505] +``` + +Speed is a separate axis — `gemini-or` and `grok-4.20` answer in ~2s while +`kimi-k2.6` takes ~83s for a *lower* F1; cost/latency/quality trade-offs are +yours to pick per profile. + +Your numbers will differ. Run `--benchmark` on your fixtures. Failed or +unparseable reviewer calls are zero-scored — a broken reviewer can't climb +the board by "finding nothing" on the clean-baseline control. --- @@ -246,11 +284,12 @@ python scripts/merge.py --run-dir "$RUN_DIR" | `--custom LIST` / `--models LIST` | one-off roster | | `--pr URL` / `--files GLOB` / `-` | diff source | | `--overlay {security,deep,audit}` | prompt overlay | -| `--timeout N` | override 180s per-reviewer timeout | -| `--yes-cost` / `ARGUS_YES_COST=1` | bypass cost + OR balance blocks | +| `--timeout N` | override the 360s per-reviewer timeout (`defaults.reviewer_timeout_sec`) | +| `--yes-cost` / `ARGUS_YES_COST=1` | downgrade a cost block to a warning | | `--skip-balance-check` | skip OR balance pre-flight | | `--allow-free` | include free-tier reviewers | | `--allow-logging` | include reviewers that log prompts | +| `--line-tolerance N` | merge clustering tolerance (`defaults.merge_line_tolerance`, default 3) | | `--output {md,json,gsd}` | output format (`gsd` → `REVIEW.md` for `gsd-code-review-fix`) | | `--save-as NAME` | persist `--custom` roster as profile | @@ -288,6 +327,8 @@ flowchart TB **Why dual tolerance.** A single forward-walking check causes chain drift — findings at lines 10, 13, 16, 19 would all collapse into one cluster (each within ±3 of the previous) even though lines 10 and 19 are 9 apart. Anchor-check alone handles this, but can reject near-duplicates that happen to land just over the anchor tolerance. Both checks together: tight when cluster stays compact, rejects drift when it doesn't. +The tolerance is configurable: `defaults.merge_line_tolerance` in config.yaml, or `--line-tolerance` per run. The reported line for a cluster is the **median** of its members; the worst severity in the cluster wins; descriptions from each reviewer are concatenated, highest confidence first. + --- ## Cost control @@ -298,20 +339,27 @@ Two independent gates, both enforceable: flowchart LR A["Roster + diff"] --> B["Estimate tokens x rates"] B --> C{"est >= threshold?
review: $0.50 / $2
bench: $10 / $30"} - C -->|at or above block| X1["BLOCK
override: --yes-cost"] + C -->|at or above block| X1["BLOCK exit 2
override: --yes-cost"] C -->|below block| D{"any OR reviewer
in roster?"} D -->|no| F["dispatch"] D -->|yes| E["probe OR balance
auth/key + credits"] E --> G{"available >=
safety x est?"} - G -->|no| X2["BLOCK
override: --yes-cost
or --skip-balance-check"] + G -->|"no (review mode)"| W["WARN exit 1
top up before dispatching"] + G -->|"no (benchmark mode)"| X2["BLOCK
override: --yes-cost"] G -->|yes| F + W --> F F --> Z["run"] style X1 fill:#ffb6c1,stroke:#c71585 style X2 fill:#ffb6c1,stroke:#c71585 + style W fill:#ffe4b5,stroke:#b8860b style Z fill:#98fb98,stroke:#2e8b57 ``` +`estimate_cost.py` exit codes: `0` OK · `1` warn (soft threshold or low OR +balance) · `2` block **or invalid roster** (unknown reviewer names fail fast — +check stderr; `--yes-cost` overrides a cost block only). + | Mode | Warn | Hard block | Override | |---|---|---|---| | review | $0.50 | $2.00 | `--yes-cost` / `ARGUS_YES_COST=1` | @@ -332,7 +380,9 @@ Paid-CLI reviewers (Gemini, Codex, Claude, OpenCode, Copilot) incur no tracked c | opencode | `opencode` | `claude` | | unknown | — | — | -Rule: never ask the host CLI to review its own invocation. Always ensure Claude is in the mix when Claude isn't the host. Detection via env vars + parent-process walk (psutil, up to 8 levels). +Rule: never ask the host CLI to review its own invocation — the `skip` column always applies. The `add` column applies to **profile-based rosters only**: an explicit `--roster` list is treated as intent and never silently gains (or keeps `disabled:`/`custom_only:` blocks against) reviewers you named yourself. Anything dropped is recorded in `dispatch_summary.json` under `"dropped"`. + +Detection: specific env markers per host (e.g. `CLAUDECODE=1`; a stray `CODEX_API_KEY` in your shell profile won't trigger it), then a parent-process walk (psutil, up to 8 levels) matching executable names only — never full command lines. --- @@ -364,7 +414,7 @@ Produces: - `benchmarks/.json` — full machine-readable data - rows in `history.db:benchmarks` — longitudinal comparison -The aggregator re-scores from `tp/fp/fn` per run, correctly handles clean-baseline (P=R=F1=1.0 when reviewer correctly finds nothing), and produces the unified leaderboard. +The aggregator re-scores from `tp/fp/fn` per run, correctly handles clean-baseline (P=R=F1=1.0 when a **successful** reviewer correctly finds nothing), zero-scores failed/timed-out/unparseable calls, and produces the unified leaderboard. --- @@ -416,7 +466,7 @@ Seeded fixtures: ## Quality mechanics (summary) -1. **Strict-schema JSON** — reviewers return `{findings: [{file, line, severity, category, description, confidence}]}`. Extractor tolerates fenced blocks, `` prefixes, stray prose. +1. **Strict-schema JSON** — reviewers return `{findings: [{file, line, severity, category, description, confidence}]}`. Extractor tolerates fenced blocks, `` prefixes, stray prose, and braces inside string values; the last-resort scan is a single O(n) pass with bounded parse attempts, so garbage output can't stall a run. Off-enum severities are clamped to `medium` rather than dropped. 2. **Context-window pre-check** — skip reviewer if prompt > 70% of their ctx (reviewer's `skip_reason` logged; other reviewers continue). 3. **Confidence threshold** — drop findings with effective confidence < 80. 4. **Anchor-based clustering + corroboration boost** — +15 confidence (cap 100) when ≥ 2 reviewers agree within ±3 lines on the same file. @@ -424,6 +474,7 @@ Seeded fixtures: 6. **OpenRouter reasoning-exclude** — `patch.chat_completions` applies `reasoning: {exclude: true, max_tokens: 2000}` + `provider: {ignore: [io.net, together.ai]}` to avoid providers that return `content: null` with reasoning-only. 7. **Per-reviewer incremental writes** — in benchmark mode, reviewer JSONs land as each reviewer completes. Tailable during runs. 8. **Fallback routing** — every reviewer has an optional fallback (typically OR); sum of primary + fallback latency is reported so you see real wall time. +9. **Failure isolation** — one broken reviewer never kills the run: per-reviewer exception handling, `gather(return_exceptions=True)`, and a guaranteed `reviews/.json` artifact for every roster entry. On timeout the entire subprocess tree is killed (`killpg` / `taskkill /T /F`) — no orphaned node processes. --- @@ -468,6 +519,12 @@ argus/ │ ├── race-refund/ │ ├── secrets-leak/ │ └── clean-baseline/ +├── tests/ # unit tests (pytest; run in CI) +│ ├── test_extract_json.py +│ ├── test_merge.py +│ ├── test_run_subprocess.py +│ └── test_score.py +├── .github/workflows/ci.yml # GitHub Actions — pytest on push/PR ├── runs/ # per-invocation artifacts (gitignored) ├── benchmarks/ # leaderboard outputs (gitignored) └── history.db # SQLite — runs, findings, benchmarks (gitignored) diff --git a/SKILL.md b/SKILL.md index e491e3f..1493303 100644 --- a/SKILL.md +++ b/SKILL.md @@ -69,8 +69,11 @@ Apply `host_rules[$HOST]` from config.yaml: remove `skip` list, append `add` lis 1. Load config.yaml 2. Base list from `--profile` / `--custom` / `--models` / default profile -3. Apply host rules (step 2 above) +3. Apply host rules (step 2 above). For explicitly-named rosters + (`--custom` / `--models`) apply only the `skip` list — never inject + `add` reviewers the user didn't ask for. 4. Apply filters: + - drop reviewers with `disabled: true` unless explicitly named - drop reviewers with `tier: free` unless `--allow-free` - drop reviewers with `privacy: LOGS` unless `--allow-logging` - drop reviewers with `custom_only: true` unless explicitly named @@ -78,6 +81,11 @@ Apply `host_rules[$HOST]` from config.yaml: remove `skip` list, append `add` lis If roster is empty, abort with message and list disabled reasons. +Defense in depth: `dispatch.py` and `benchmark.py` enforce this same policy +internally via `_common.resolve_roster` (drops recorded in +`dispatch_summary.json` under `"dropped"`), and `estimate_cost.py` rejects +unknown reviewer names — so a skipped filter here fails safe downstream. + ## 4. Gather scope → diff Produce a single diff.patch in the new run directory: @@ -158,7 +166,8 @@ run `merge.py` exactly once in the main session (step 7). python "$ARGUS_HOME/scripts/merge.py" --run-dir "$RUN_DIR" ``` -Applies confidence threshold, corroboration boost, dedupes by file:line, +Applies the confidence threshold, corroboration boost, and anchor-based +±3-line clustering (`defaults.merge_line_tolerance` / `--line-tolerance`), writes `merged.md` and `metrics.json`. Also appends a row to `history.db`. ## 8. Present From 3fb38ae55729e36f397b7c5e6485ad6e594030a6 Mon Sep 17 00:00:00 2001 From: Jim Stratus Date: Wed, 10 Jun 2026 07:04:10 -0700 Subject: [PATCH 14/14] fix(review): require a findings list before treating parsed JSON as a valid review extract_json can recover an arbitrary inner object from prose, so a parsed dict without a findings list is still a failed review: benchmark mode now sets parse_error (zero-scored instead of F1=1.0 on clean-baseline) and dispatch records parse_error + raw_preview instead of silently reporting zero findings. A legitimate {"findings": []} still counts as success. Confidence: high Scope-risk: narrow --- scripts/benchmark.py | 6 ++++-- scripts/dispatch.py | 7 ++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/scripts/benchmark.py b/scripts/benchmark.py index dfb3d32..624b842 100644 --- a/scripts/benchmark.py +++ b/scripts/benchmark.py @@ -144,8 +144,10 @@ async def _dispatch(name: str, spec: dict, prompt: str, timeout: int) -> dict: parse_error = False if r["exit_code"] == 0: parsed = extract_json(r["stdout"]) - if isinstance(parsed, dict): - findings = normalize_findings(parsed.get("findings", [])) + # Require the schema's findings list — extract_json can recover an + # arbitrary inner object from prose, which is still a failed review. + if isinstance(parsed, dict) and isinstance(parsed.get("findings"), list): + findings = normalize_findings(parsed["findings"]) else: parse_error = True total_latency = primary_latency + fallback_latency diff --git a/scripts/dispatch.py b/scripts/dispatch.py index a10a45c..a3b0c75 100644 --- a/scripts/dispatch.py +++ b/scripts/dispatch.py @@ -75,13 +75,14 @@ async def _dispatch_one(name: str, spec: dict, prompt: str, timeout: int) -> dic return result parsed = extract_json(stdout) - if parsed is None: + # Require the schema's findings list — extract_json can recover an + # arbitrary inner object from prose, which is still a failed review. + if not isinstance(parsed, dict) or not isinstance(parsed.get("findings"), list): result["parse_error"] = True result["raw_preview"] = stdout[:2000] return result - findings_raw = parsed.get("findings") if isinstance(parsed, dict) else None - result["findings"] = normalize_findings(findings_raw or []) + result["findings"] = normalize_findings(parsed["findings"]) return result