From b456dbca33c4cd8981913d7e1cc1f6d60f0d6830 Mon Sep 17 00:00:00 2001
From: Eve McGivern <evemcgivern@gmail.com>
Date: Tue, 16 Jun 2026 06:51:10 -0500
Subject: [PATCH 1/2] fix(eval): runners inject the language's valid label set
 (eval-categories) (#15)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The eval-mode prompt named 'a language-specific label' without enumerating the
valid ones, so for custom-label languages the model invented synonyms (panic->
unwrap-panic, quoting->SC2086, coerce->norway-problem) that the exact-match grader
scored as FP+FN — artificially depressing rust/shell/yaml to 0.67 and java held-out
to 0.50 (a FALSE overfit flag). Runners now run 'detect.sh eval-categories <lang>'
and inject the exact valid set. Verified: rust unwrap now emits '3:panic' (was
'unwrap-panic'), an exact match.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 tests/detect.bats            |  4 ++++
 tests/eval/runners/claude.sh | 16 +++++++++++++---
 tests/eval/runners/codex.sh  | 16 +++++++++++++---
 3 files changed, 30 insertions(+), 6 deletions(-)
diff --git a/tests/detect.bats b/tests/detect.bats
index 561fc4e..ffd31ff 100644
--- a/tests/detect.bats
+++ b/tests/detect.bats
@@ -1386,4 +1386,8 @@ EOF
   grep -q "<<<EVAL" "$root/tests/eval/runners/codex.sh"
   ! grep -q "Follow eval-mode.md" "$root/tests/eval/runners/claude.sh"
   ! grep -q "Follow eval-mode.md" "$root/tests/eval/runners/codex.sh"
+  # each runner must inject the language's valid label set (eval-categories) so the
+  # model emits grader-matchable labels instead of synonyms (panic vs unwrap-panic)
+  grep -q "eval-categories" "$root/tests/eval/runners/claude.sh"
+  grep -q "eval-categories" "$root/tests/eval/runners/codex.sh"
 }
diff --git a/tests/eval/runners/claude.sh b/tests/eval/runners/claude.sh
index 09f58d4..0963f69 100755
--- a/tests/eval/runners/claude.sh
+++ b/tests/eval/runners/claude.sh
@@ -5,6 +5,16 @@
 set -eu
 fixture="${1:?claude.sh: need fixture path}"; lang="${2:?claude.sh: need lang}"
 cc="${DEFECT_SCAN_CLAUDE:-claude}"
+# Resolve the engine + this language's valid label set BEFORE cd-ing into the temp dir.
+# Telling the model the exact label vocabulary stops it inventing synonyms (panic vs
+# unwrap-panic) that the exact-match grader can't score. eval-categories is read-only.
+detect="$(CDPATH= cd "$(dirname "$0")/../../.." && pwd)/skills/scan/lib/detect.sh"
+labels="$(sh "$detect" eval-categories "$lang" 2>/dev/null | tr '\n' ' ')"
+if [ -n "$labels" ]; then
+  labelinstr="Use ONLY these category labels (exact strings, nothing else): $labels"
+else
+  labelinstr="category = cat#1..cat#5 or a language-specific label"
+fi
 work="$(mktemp -d)"; trap 'rm -rf "$work"' EXIT
 cp "$fixture" "$work/"                        # SOURCE only — never the .expected sidecar
 cd "$work"
@@ -12,8 +22,8 @@ cd "$work"
 "$cc" -p "Run /defect-scan:scan $(basename "$fixture") --lang $lang.
 After the normal report, append EXACTLY ONE machine block for the grader:
 a line \"<<<EVAL\", then one line per finding as \"<path>:<line>:<category>\"
-(path = the file's basename; line = integer; category = cat#1..cat#5 or a
-language-specific label), then a line \"EVAL>>>\". If you find nothing, emit the
-two sentinel lines with nothing between. Do not omit the block." \
+(path = the file's basename; line = integer). $labelinstr
+Then a line \"EVAL>>>\". If you find nothing, emit the two sentinel lines with
+nothing between. Do not omit the block." \
   --permission-mode plan \
   --disallowedTools "Edit,Write,NotebookEdit"
diff --git a/tests/eval/runners/codex.sh b/tests/eval/runners/codex.sh
index fc42b12..cb81d84 100755
--- a/tests/eval/runners/codex.sh
+++ b/tests/eval/runners/codex.sh
@@ -5,6 +5,16 @@
 set -eu
 fixture="${1:?codex.sh: need fixture path}"; lang="${2:?codex.sh: need lang}"
 cx="${DEFECT_SCAN_CODEX:-codex}"
+# Resolve the engine + this language's valid label set BEFORE cd-ing into the temp dir.
+# Telling the model the exact label vocabulary stops it inventing synonyms (panic vs
+# unwrap-panic) that the exact-match grader can't score. eval-categories is read-only.
+detect="$(CDPATH= cd "$(dirname "$0")/../../.." && pwd)/skills/scan/lib/detect.sh"
+labels="$(sh "$detect" eval-categories "$lang" 2>/dev/null | tr '\n' ' ')"
+if [ -n "$labels" ]; then
+  labelinstr="Use ONLY these category labels (exact strings, nothing else): $labels"
+else
+  labelinstr="category = cat#1..cat#5 or a language-specific label"
+fi
 work="$(mktemp -d)"; trap 'rm -rf "$work"' EXIT
 cp "$fixture" "$work/"                       # SOURCE only — never the .expected sidecar
 prompt="$(mktemp)"
@@ -12,9 +22,9 @@ prompt="$(mktemp)"
   echo "Run /defect-scan:scan on the file in this directory with --lang $lang."
   echo "After the normal report, append EXACTLY ONE machine block for the grader:"
   echo 'a line "<<<EVAL", then one line per finding as "<path>:<line>:<category>"'
-  echo "(path = the file's basename; line = integer; category = cat#1..cat#5 or a"
-  echo "language-specific label), then a line \"EVAL>>>\". If you find nothing, emit the"
-  echo "two sentinel lines with nothing between. Do not omit the block."
+  echo "(path = the file's basename; line = integer). $labelinstr"
+  echo "Then a line \"EVAL>>>\". If you find nothing, emit the two sentinel lines with"
+  echo "nothing between. Do not omit the block."
 } > "$prompt"
 cd "$work"
 "$cx" exec --sandbox read-only --skip-git-repo-check -o /dev/stdout - < "$prompt"

From 6d047df2bb1b3a2095b53087bdc8f5af980d0ea3 Mon Sep 17 00:00:00 2001
From: Eve McGivern <evemcgivern@gmail.com>
Date: Tue, 16 Jun 2026 08:24:31 -0500
Subject: [PATCH 2/2] chore(eval): record measured seen baselines for
 rust/shell/yaml/swift (#15)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Re-measured with the label-fixed runner: all four now 1.00/1.00 (±0.00). rust/shell/
yaml were the 0.67 synonym-label artifacts (now resolved by injecting eval-categories);
swift's earlier PARTIAL transient cleared. Completes measured SEEN baselines for all 13
languages. java held-out (0.25, persistent overfit FLAG) intentionally NOT committed —
fixture-quality issue tracked separately; it keeps its conservative placeholder.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 tests/eval/rust/baseline.seen.txt  | 4 ++--
 tests/eval/shell/baseline.seen.txt | 4 ++--
 tests/eval/swift/baseline.seen.txt | 4 ++--
 tests/eval/yaml/baseline.seen.txt  | 4 ++--
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/tests/eval/rust/baseline.seen.txt b/tests/eval/rust/baseline.seen.txt
index 02ca7d6..76dc77b 100644
--- a/tests/eval/rust/baseline.seen.txt
+++ b/tests/eval/rust/baseline.seen.txt
@@ -1,6 +1,6 @@
 precision_floor=0.80
 recall_floor=0.50
-precision_baseline=0.80
-recall_baseline=0.50
+precision_baseline=1.00
+recall_baseline=1.00
 noise_band=0.10
 overfit_band=0.15
diff --git a/tests/eval/shell/baseline.seen.txt b/tests/eval/shell/baseline.seen.txt
index 02ca7d6..76dc77b 100644
--- a/tests/eval/shell/baseline.seen.txt
+++ b/tests/eval/shell/baseline.seen.txt
@@ -1,6 +1,6 @@
 precision_floor=0.80
 recall_floor=0.50
-precision_baseline=0.80
-recall_baseline=0.50
+precision_baseline=1.00
+recall_baseline=1.00
 noise_band=0.10
 overfit_band=0.15
diff --git a/tests/eval/swift/baseline.seen.txt b/tests/eval/swift/baseline.seen.txt
index 02ca7d6..76dc77b 100644
--- a/tests/eval/swift/baseline.seen.txt
+++ b/tests/eval/swift/baseline.seen.txt
@@ -1,6 +1,6 @@
 precision_floor=0.80
 recall_floor=0.50
-precision_baseline=0.80
-recall_baseline=0.50
+precision_baseline=1.00
+recall_baseline=1.00
 noise_band=0.10
 overfit_band=0.15
diff --git a/tests/eval/yaml/baseline.seen.txt b/tests/eval/yaml/baseline.seen.txt
index 02ca7d6..76dc77b 100644
--- a/tests/eval/yaml/baseline.seen.txt
+++ b/tests/eval/yaml/baseline.seen.txt
@@ -1,6 +1,6 @@
 precision_floor=0.80
 recall_floor=0.50
-precision_baseline=0.80
-recall_baseline=0.50
+precision_baseline=1.00
+recall_baseline=1.00
 noise_band=0.10
 overfit_band=0.15