stylusnexus · evemcgivern · Jun 17, 2026 · Jun 16, 2026 · Jun 17, 2026 · Jun 17, 2026
@@ -45,6 +45,10 @@ skills/scan/lib/detect.sh tool <name> <cwd>    # resolve an analyzer binary (exi
 skills/scan/lib/detect.sh issues <terms>       # search tracker (correlation/dedup; read; exit 3 if no gh)
 skills/scan/lib/detect.sh labels               # list repo labels (for label/priority proposal; exit 3 if no gh)
 skills/scan/lib/detect.sh issues-create <title> <body-file> [labels]   # file an issue (write; exit 3 if no gh)
+
+# Maintainer eval loop (#15) — measure, don't regress. Both commands are model-free themselves;
+# a model enters only through the runner that eval-run invokes:
+skills/scan/lib/detect.sh eval <corpus-dir> <findings-file>   # grader: score a findings file (P/R/tp/fp/fn)
+scripts/eval-run <lang> [--runs N] [--split seen|held-out|all] [--update-baseline]   # orchestrate: scan fixtures via a runner, then grade
 ```
 
 Run a single test by description: `bats tests/detect.bats -f "triage: ranks"`.
@@ -62,11 +66,17 @@ the model. Do not push reasoning into `detect.sh`, and do not re-implement in pr
 what a `detect.sh` subcommand already provides.
 
 **Self-improvement is measured, not learned.** `tests/eval/<lang>/` is a labeled
-fixture corpus; `detect.sh eval <corpus-dir> <findings-file>` is a **model-free**
-grader (precision/recall/tp/fp/fn). Improvement happens only via human-reviewed PRs
-that add fixtures/checks and must not regress the eval — there is deliberately **no
-runtime learning store** (that would be the P4 prompt-injection surface). The grader
-+ corpus are CODEOWNERS-protected so a PR can't silently weaken them. See issue #15.
+fixture corpus; eval has two model-free layers. `detect.sh eval <corpus-dir>
+<findings-file>` is the **grader** (precision/recall/tp/fp/fn) — it scores a findings
+file, it does not run the scan. `detect.sh eval-run <lang>` (wrapper: `scripts/eval-run`)
+is the **orchestrator** — itself model-free, it scans each fixture via a *swappable*
+runner (`DEFECT_SCAN_EVAL_RUNNER` → `tests/eval/runners/{claude,codex}.sh`, the only
+place a model enters), accumulates findings, then grades the split once with `eval`
+and compares the aggregated mean/stddev against the per-lang `baseline.{seen,held-out}.txt`.
+Improvement happens only via human-reviewed PRs that add fixtures/checks and must not
+regress the baseline — there is deliberately **no runtime learning store** (that would
+be the P4 prompt-injection surface). The grader + corpus are CODEOWNERS-protected so a
+PR can't silently weaken them. See issue #15.
 
 **Five stages** (`SKILL.md` is the orchestrator): detect → triage → tool pass →
 reasoning pass → report (→ fix). `--depth N` (default 20) caps how many triaged

@@ -1,6 +1,6 @@
 precision_floor=0.80
 recall_floor=0.50
-precision_baseline=0.80
-recall_baseline=0.50
+precision_baseline=1.00
+recall_baseline=1.00
 noise_band=0.10
 overfit_band=0.15
@@ -1,6 +1,7 @@
 import java.io.*;
-class BugPathTraversal {
-  File open(String name) {
-    return new File("/data/" + name);   // user-controlled name -> traversal
-  }
+import java.nio.file.*;
+public class BugPathTraversal {
+    byte[] load(String name) throws IOException {
+        return Files.readAllBytes(Paths.get("/data/" + name));  // untrusted name -> realized path-traversal read
+    }
 }
@@ -1 +1 @@
-4:cat#3
+5:cat#3
@@ -0,0 +1,6 @@
+public class BugSwallowedParse {
+    int port(String s) {
+        try { return Integer.parseInt(s); }
+        catch (NumberFormatException e) { return 0; }   // swallowed: invalid input silently becomes 0
+    }
+}
@@ -0,0 +1 @@
+4:cat#2
@@ -0,0 +1,5 @@
+public class CleanReportedParse {
+    int port(String s) {
+        return Integer.parseInt(s);   // NumberFormatException propagates to caller; nothing swallowed
+    }
+}
@@ -0,0 +1,8 @@
+import java.io.*;
+import java.nio.file.*;
+public class CleanSafePath {
+    byte[] load(String name) throws IOException {
+        if (!name.matches("[a-z0-9]+")) throw new IllegalArgumentException("bad name");  // strict whitelist
+        return Files.readAllBytes(Paths.get("/data").resolve(name));   // no separators possible -> no traversal
+    }
+}
@@ -15,15 +15,25 @@ if [ -n "$labels" ]; then
 else
   labelinstr="category = cat#1..cat#5 or a language-specific label"
 fi
+# The labels above are opaque IDs (cat#1..5). The headless run is sandboxed to the temp
+# work dir, so the model CANNOT read the skill's baseline-categories.md to learn what they
+# mean — without the definitions it thrashes on denied reads (→ missing EVAL block, PARTIAL)
+# and guesses categories (cat#2-vs-cat#3 flips). Inject the definitions inline from the dev
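+# repo so the run is self-contained. See issue #68.
+# NOTE(review): awk's stderr is discarded below, so a missing or renamed
+# baseline-categories.md fails silently (typically leaving the legend empty) while the prompt
+# still tells the model not to read that file — consider failing loudly when $legend is empty.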
+legend="$(awk '/^## [0-9]+\./ { n=$2; sub(/\./,"",n); t=$0; sub(/^## [0-9]+\. /,"",t); sub(/  .*/,"",t); printf "cat#%s = %s; ", n, t }' "$(dirname "$detect")/../baseline-categories.md" 2>/dev/null)"
 work="$(mktemp -d)"; trap 'rm -rf "$work"' EXIT
 cp "$fixture" "$work/"                        # SOURCE only — never the .expected sidecar
 cd "$work"
 # Read-only: deny mutating tools so a runner can never edit the repo under test.
 "$cc" -p "Run /defect-scan:scan $(basename "$fixture") --lang $lang.
+The category definitions are provided inline below, so do NOT read baseline-categories.md,
+eval-mode.md, or any other skill file, and skip the git/correlation/tool-resolution stages —
+reason directly about the single file in your current directory.
+Category definitions: $legend
 After the normal report, append EXACTLY ONE machine block for the grader:
 a line \"<<<EVAL\", then one line per finding as \"<path>:<line>:<category>\"
 (path = the file's basename; line = integer). $labelinstr
 Then a line \"EVAL>>>\". If you find nothing, emit the two sentinel lines with
-nothing between. Do not omit the block." \
+nothing between. Always emit the block; never omit it." \
   --permission-mode plan \
   --disallowedTools "Edit,Write,NotebookEdit"
@@ -15,16 +15,24 @@ if [ -n "$labels" ]; then
 else
   labelinstr="category = cat#1..cat#5 or a language-specific label"
 fi
+# Opaque label IDs (cat#1..5) need their definitions inline so the model labels by meaning,
+# not by guessing the numbers — keeps this runner self-contained and parallel with
+# claude.sh (divergence between runners is a bug). See issue #68.
+legend="$(awk '/^## [0-9]+\./ { n=$2; sub(/\./,"",n); t=$0; sub(/^## [0-9]+\. /,"",t); sub(/  .*/,"",t); printf "cat#%s = %s; ", n, t }' "$(dirname "$detect")/../baseline-categories.md" 2>/dev/null)"
 work="$(mktemp -d)"; trap 'rm -rf "$work"' EXIT
 cp "$fixture" "$work/"                       # SOURCE only — never the .expected sidecar
 prompt="$(mktemp)"
 {
   echo "Run /defect-scan:scan on the file in this directory with --lang $lang."
+  echo "The category definitions are provided inline below, so do NOT read baseline-categories.md,"
+  echo "eval-mode.md, or any other skill file, and skip the git/correlation/tool-resolution stages —"
+  echo "reason directly about the single file in this directory."
+  echo "Category definitions: $legend"
   echo "After the normal report, append EXACTLY ONE machine block for the grader:"
   echo 'a line "<<<EVAL", then one line per finding as "<path>:<line>:<category>"'
   echo "(path = the file's basename; line = integer). $labelinstr"
   echo "Then a line \"EVAL>>>\". If you find nothing, emit the two sentinel lines with"
-  echo "nothing between. Do not omit the block."
+  echo "nothing between. Always emit the block; never omit it."
 } > "$prompt"
 cd "$work"
 "$cx" exec --sandbox read-only --skip-git-repo-check -o /dev/stdout - < "$prompt"