diff --git a/tests/detect.bats b/tests/detect.bats
index 561fc4e..ffd31ff 100644
--- a/tests/detect.bats
+++ b/tests/detect.bats
@@ -1386,4 +1386,8 @@ EOF
   grep -q "<<<EVAL" "$root/tests/eval/runners/codex.sh"
   ! grep -q "Follow eval-mode.md" "$root/tests/eval/runners/claude.sh"
   ! grep -q "Follow eval-mode.md" "$root/tests/eval/runners/codex.sh"
+  # each runner must CALL eval-categories for "$lang" (the bare word also sits in its
+  # comments) so labels stay grader-matchable instead of synonyms (panic vs unwrap-panic)
+  grep -qF 'eval-categories "$lang"' "$root/tests/eval/runners/claude.sh"
+  grep -qF 'eval-categories "$lang"' "$root/tests/eval/runners/codex.sh"
 }
diff --git a/tests/eval/runners/claude.sh b/tests/eval/runners/claude.sh
index 09f58d4..0963f69 100755
--- a/tests/eval/runners/claude.sh
+++ b/tests/eval/runners/claude.sh
@@ -5,6 +5,16 @@
 set -eu
 fixture="${1:?claude.sh: need fixture path}"; lang="${2:?claude.sh: need lang}"
 cc="${DEFECT_SCAN_CLAUDE:-claude}"
+# Resolve detect.sh relative to this script BEFORE cd-ing into the temp dir, then fetch
+# $lang's valid label set (eval-categories is read-only). Telling the model the exact
+# vocabulary stops it inventing synonyms (panic vs unwrap-panic) the grader can't score.
+detect="$(CDPATH= cd "$(dirname "$0")/../../.." && pwd)/skills/scan/lib/detect.sh"
+# stderr is discarded; awk joins the non-blank lines with single spaces, so a failed or
+# blank-only listing leaves $labels empty instead of producing an empty "Use ONLY" list.
+labels="$(sh "$detect" eval-categories "$lang" 2>/dev/null | awk 'NF { printf "%s%s", s, $0; s = " " }')"
+# Generic wording is the fallback whenever no label list is available.
+labelinstr="category = cat#1..cat#5 or a language-specific label"
+[ -z "$labels" ] || labelinstr="Use ONLY these category labels (exact strings, nothing else): $labels"
 work="$(mktemp -d)"; trap 'rm -rf "$work"' EXIT
 cp "$fixture" "$work/"                        # SOURCE only — never the .expected sidecar
 cd "$work"
@@ -12,8 +22,8 @@ cd "$work"
 "$cc" -p "Run /defect-scan:scan $(basename "$fixture") --lang $lang.
 After the normal report, append EXACTLY ONE machine block for the grader:
 a line \"<<<EVAL\", then one line per finding as \"<path>:<line>:<category>\"
-(path = the file's basename; line = integer; category = cat#1..cat#5 or a
-language-specific label), then a line \"EVAL>>>\". If you find nothing, emit the
-two sentinel lines with nothing between. Do not omit the block." \
+(path = the file's basename; line = integer). $labelinstr
+Then a line \"EVAL>>>\". If you find nothing, emit the two sentinel lines with
+nothing between. Do not omit the block." \
   --permission-mode plan \
   --disallowedTools "Edit,Write,NotebookEdit"
diff --git a/tests/eval/runners/codex.sh b/tests/eval/runners/codex.sh
index fc42b12..cb81d84 100755
--- a/tests/eval/runners/codex.sh
+++ b/tests/eval/runners/codex.sh
@@ -5,6 +5,16 @@
 set -eu
 fixture="${1:?codex.sh: need fixture path}"; lang="${2:?codex.sh: need lang}"
 cx="${DEFECT_SCAN_CODEX:-codex}"
+# Resolve detect.sh relative to this script BEFORE cd-ing into the temp dir, then fetch
+# $lang's valid label set (eval-categories is read-only). Telling the model the exact
+# vocabulary stops it inventing synonyms (panic vs unwrap-panic) the grader can't score.
+detect="$(CDPATH= cd "$(dirname "$0")/../../.." && pwd)/skills/scan/lib/detect.sh"
+# stderr is discarded; awk joins the non-blank lines with single spaces, so a failed or
+# blank-only listing leaves $labels empty instead of producing an empty "Use ONLY" list.
+labels="$(sh "$detect" eval-categories "$lang" 2>/dev/null | awk 'NF { printf "%s%s", s, $0; s = " " }')"
+# Generic wording is the fallback whenever no label list is available.
+labelinstr="category = cat#1..cat#5 or a language-specific label"
+[ -z "$labels" ] || labelinstr="Use ONLY these category labels (exact strings, nothing else): $labels"
 work="$(mktemp -d)"; trap 'rm -rf "$work"' EXIT
 cp "$fixture" "$work/"                       # SOURCE only — never the .expected sidecar
 prompt="$(mktemp)"
@@ -12,9 +22,9 @@ prompt="$(mktemp)"
   echo "Run /defect-scan:scan on the file in this directory with --lang $lang."
   echo "After the normal report, append EXACTLY ONE machine block for the grader:"
   echo 'a line "<<<EVAL", then one line per finding as "<path>:<line>:<category>"'
-  echo "(path = the file's basename; line = integer; category = cat#1..cat#5 or a"
-  echo "language-specific label), then a line \"EVAL>>>\". If you find nothing, emit the"
-  echo "two sentinel lines with nothing between. Do not omit the block."
+  echo "(path = the file's basename; line = integer). $labelinstr"
+  echo "Then a line \"EVAL>>>\". If you find nothing, emit the two sentinel lines with"
+  echo "nothing between. Do not omit the block."
 } > "$prompt"
 cd "$work"
 "$cx" exec --sandbox read-only --skip-git-repo-check -o /dev/stdout - < "$prompt"
diff --git a/tests/eval/rust/baseline.seen.txt b/tests/eval/rust/baseline.seen.txt
index 02ca7d6..76dc77b 100644
--- a/tests/eval/rust/baseline.seen.txt
+++ b/tests/eval/rust/baseline.seen.txt
@@ -1,6 +1,6 @@
 precision_floor=0.80
 recall_floor=0.50
-precision_baseline=0.80
-recall_baseline=0.50
+precision_baseline=1.00
+recall_baseline=1.00
 noise_band=0.10
 overfit_band=0.15
diff --git a/tests/eval/shell/baseline.seen.txt b/tests/eval/shell/baseline.seen.txt
index 02ca7d6..76dc77b 100644
--- a/tests/eval/shell/baseline.seen.txt
+++ b/tests/eval/shell/baseline.seen.txt
@@ -1,6 +1,6 @@
 precision_floor=0.80
 recall_floor=0.50
-precision_baseline=0.80
-recall_baseline=0.50
+precision_baseline=1.00
+recall_baseline=1.00
 noise_band=0.10
 overfit_band=0.15
diff --git a/tests/eval/swift/baseline.seen.txt b/tests/eval/swift/baseline.seen.txt
index 02ca7d6..76dc77b 100644
--- a/tests/eval/swift/baseline.seen.txt
+++ b/tests/eval/swift/baseline.seen.txt
@@ -1,6 +1,6 @@
 precision_floor=0.80
 recall_floor=0.50
-precision_baseline=0.80
-recall_baseline=0.50
+precision_baseline=1.00
+recall_baseline=1.00
 noise_band=0.10
 overfit_band=0.15
diff --git a/tests/eval/yaml/baseline.seen.txt b/tests/eval/yaml/baseline.seen.txt
index 02ca7d6..76dc77b 100644
--- a/tests/eval/yaml/baseline.seen.txt
+++ b/tests/eval/yaml/baseline.seen.txt
@@ -1,6 +1,6 @@
 precision_floor=0.80
 recall_floor=0.50
-precision_baseline=0.80
-recall_baseline=0.50
+precision_baseline=1.00
+recall_baseline=1.00
 noise_band=0.10
 overfit_band=0.15