stylusnexus · evemcgivern · Jun 16, 2026 · Jun 16, 2026 · Jun 16, 2026
@@ -1386,4 +1386,8 @@ EOF
   grep -q "<<<EVAL" "$root/tests/eval/runners/codex.sh"
   ! grep -q "Follow eval-mode.md" "$root/tests/eval/runners/claude.sh"
   ! grep -q "Follow eval-mode.md" "$root/tests/eval/runners/codex.sh"
+  # each runner must inject the language's valid label set (eval-categories) so the
+  # model emits grader-matchable labels instead of synonyms (panic vs unwrap-panic)
+  grep -q "eval-categories" "$root/tests/eval/runners/claude.sh"
+  grep -q "eval-categories" "$root/tests/eval/runners/codex.sh"
 }
@@ -5,15 +5,25 @@
 set -eu
 fixture="${1:?claude.sh: need fixture path}"; lang="${2:?claude.sh: need lang}"
 cc="${DEFECT_SCAN_CLAUDE:-claude}"
+# Resolve detect.sh to an absolute path and fetch this language's valid labels BEFORE cd-ing
+# into the temp dir (a relative $0 would not resolve from there). An exact label vocabulary stops
+# the model inventing synonyms (panic vs unwrap-panic) the exact-match grader can't score. eval-categories is read-only.
+detect="$(CDPATH= cd "$(dirname "$0")/../../.." && pwd)/skills/scan/lib/detect.sh"
+labels="$(sh "$detect" eval-categories "$lang" 2>/dev/null | tr '\n' ' ')"
+if [ -n "$labels" ]; then
+  labelinstr="Use ONLY these category labels (exact strings, nothing else): $labels"
+else
+  labelinstr="category = cat#1..cat#5 or a language-specific label"
+fi
 work="$(mktemp -d)"; trap 'rm -rf "$work"' EXIT
 cp "$fixture" "$work/"                        # SOURCE only — never the .expected sidecar
 cd "$work"
 # Read-only: deny mutating tools so a runner can never edit the repo under test.
 "$cc" -p "Run /defect-scan:scan $(basename "$fixture") --lang $lang.
 After the normal report, append EXACTLY ONE machine block for the grader:
 a line \"<<<EVAL\", then one line per finding as \"<path>:<line>:<category>\"
-(path = the file's basename; line = integer; category = cat#1..cat#5 or a
-language-specific label), then a line \"EVAL>>>\". If you find nothing, emit the
-two sentinel lines with nothing between. Do not omit the block." \
+(path = the file's basename; line = integer). $labelinstr
+Then a line \"EVAL>>>\". If you find nothing, emit the two sentinel lines with
+nothing between. Do not omit the block." \
   --permission-mode plan \
   --disallowedTools "Edit,Write,NotebookEdit"
@@ -5,16 +5,26 @@
 set -eu
 fixture="${1:?codex.sh: need fixture path}"; lang="${2:?codex.sh: need lang}"
 cx="${DEFECT_SCAN_CODEX:-codex}"
+# Resolve detect.sh to an absolute path and fetch this language's valid labels BEFORE cd-ing
+# into the temp dir (a relative $0 would not resolve from there). An exact label vocabulary stops
+# the model inventing synonyms (panic vs unwrap-panic) the exact-match grader can't score. eval-categories is read-only.
+detect="$(CDPATH= cd "$(dirname "$0")/../../.." && pwd)/skills/scan/lib/detect.sh"
+labels="$(sh "$detect" eval-categories "$lang" 2>/dev/null | tr '\n' ' ')"
+if [ -n "$labels" ]; then
+  labelinstr="Use ONLY these category labels (exact strings, nothing else): $labels"
+else
+  labelinstr="category = cat#1..cat#5 or a language-specific label"
+fi
 work="$(mktemp -d)"; trap 'rm -rf "$work"' EXIT
 cp "$fixture" "$work/"                       # SOURCE only — never the .expected sidecar
-prompt="$(mktemp)"
+prompt="$(mktemp)"; trap 'rm -rf "$work" "$prompt"' EXIT   # prompt lives outside $work: remove it on exit too
 {
   echo "Run /defect-scan:scan on the file in this directory with --lang $lang."
   echo "After the normal report, append EXACTLY ONE machine block for the grader:"
   echo 'a line "<<<EVAL", then one line per finding as "<path>:<line>:<category>"'
-  echo "(path = the file's basename; line = integer; category = cat#1..cat#5 or a"
-  echo "language-specific label), then a line \"EVAL>>>\". If you find nothing, emit the"
-  echo "two sentinel lines with nothing between. Do not omit the block."
+  echo "(path = the file's basename; line = integer). $labelinstr"
+  echo "Then a line \"EVAL>>>\". If you find nothing, emit the two sentinel lines with"
+  echo "nothing between. Do not omit the block."
 } > "$prompt"
 cd "$work"
 "$cx" exec --sandbox read-only --skip-git-repo-check -o /dev/stdout - < "$prompt"

@@ -1,6 +1,6 @@
 precision_floor=0.80
 recall_floor=0.50
-precision_baseline=0.80
-recall_baseline=0.50
+precision_baseline=1.00
+recall_baseline=1.00
 noise_band=0.10
 overfit_band=0.15
@@ -1,6 +1,6 @@
 precision_floor=0.80
 recall_floor=0.50
-precision_baseline=0.80
-recall_baseline=0.50
+precision_baseline=1.00
+recall_baseline=1.00
 noise_band=0.10
 overfit_band=0.15
@@ -1,6 +1,6 @@
 precision_floor=0.80
 recall_floor=0.50
-precision_baseline=0.80
-recall_baseline=0.50
+precision_baseline=1.00
+recall_baseline=1.00
 noise_band=0.10
 overfit_band=0.15
@@ -1,6 +1,6 @@
 precision_floor=0.80
 recall_floor=0.50
-precision_baseline=0.80
-recall_baseline=0.50
+precision_baseline=1.00
+recall_baseline=1.00
 noise_band=0.10
 overfit_band=0.15