From b456dbca33c4cd8981913d7e1cc1f6d60f0d6830 Mon Sep 17 00:00:00 2001 From: Eve McGivern Date: Tue, 16 Jun 2026 06:51:10 -0500 Subject: [PATCH 1/2] fix(eval): runners inject the language's valid label set (eval-categories) (#15) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The eval-mode prompt named 'a language-specific label' without enumerating the valid ones, so for custom-label languages the model invented synonyms (panic->unwrap-panic, quoting->SC2086, coerce->norway-problem) that the exact-match grader scored as FP+FN — artificially depressing rust/shell/yaml to 0.67 and java held-out to 0.50 (a FALSE overfit flag). Runners now run 'detect.sh eval-categories <lang>' and inject the exact valid set. Verified: rust unwrap now emits '3:panic' (was 'unwrap-panic'), an exact match. Co-Authored-By: Claude Opus 4.8 --- tests/detect.bats | 4 ++++ tests/eval/runners/claude.sh | 16 +++++++++++++--- tests/eval/runners/codex.sh | 16 +++++++++++++--- 3 files changed, 30 insertions(+), 6 deletions(-) diff --git a/tests/detect.bats b/tests/detect.bats index 561fc4e..ffd31ff 100644 --- a/tests/detect.bats +++ b/tests/detect.bats @@ -1386,4 +1386,8 @@ EOF grep -q "<</dev/null | tr '\n' ' ')" +if [ -n "$labels" ]; then + labelinstr="Use ONLY these category labels (exact strings, nothing else): $labels" +else + labelinstr="category = cat#1..cat#5 or a language-specific label" +fi work="$(mktemp -d)"; trap 'rm -rf "$work"' EXIT cp "$fixture" "$work/" # SOURCE only — never the .expected sidecar cd "$work" @@ -12,8 +22,8 @@ cd "$work" "$cc" -p "Run /defect-scan:scan $(basename "$fixture") --lang $lang. After the normal report, append EXACTLY ONE machine block for the grader: a line \"<<::\" -(path = the file's basename; line = integer; category = cat#1..cat#5 or a -language-specific label), then a line \"EVAL>>>\". If you find nothing, emit the -two sentinel lines with nothing between. Do not omit the block." 
\ +(path = the file's basename; line = integer). $labelinstr +Then a line \"EVAL>>>\". If you find nothing, emit the two sentinel lines with +nothing between. Do not omit the block." \ --permission-mode plan \ --disallowedTools "Edit,Write,NotebookEdit" diff --git a/tests/eval/runners/codex.sh b/tests/eval/runners/codex.sh index fc42b12..cb81d84 100755 --- a/tests/eval/runners/codex.sh +++ b/tests/eval/runners/codex.sh @@ -5,6 +5,16 @@ set -eu fixture="${1:?codex.sh: need fixture path}"; lang="${2:?codex.sh: need lang}" cx="${DEFECT_SCAN_CODEX:-codex}" +# Resolve the engine + this language's valid label set BEFORE cd-ing into the temp dir. +# Telling the model the exact label vocabulary stops it inventing synonyms (panic vs +# unwrap-panic) that the exact-match grader can't score. eval-categories is read-only. +detect="$(CDPATH= cd "$(dirname "$0")/../../.." && pwd)/skills/scan/lib/detect.sh" +labels="$(sh "$detect" eval-categories "$lang" 2>/dev/null | tr '\n' ' ')" +if [ -n "$labels" ]; then + labelinstr="Use ONLY these category labels (exact strings, nothing else): $labels" +else + labelinstr="category = cat#1..cat#5 or a language-specific label" +fi work="$(mktemp -d)"; trap 'rm -rf "$work"' EXIT cp "$fixture" "$work/" # SOURCE only — never the .expected sidecar prompt="$(mktemp)" @@ -12,9 +22,9 @@ prompt="$(mktemp)" echo "Run /defect-scan:scan on the file in this directory with --lang $lang." echo "After the normal report, append EXACTLY ONE machine block for the grader:" echo 'a line "<<::"' - echo "(path = the file's basename; line = integer; category = cat#1..cat#5 or a" - echo "language-specific label), then a line \"EVAL>>>\". If you find nothing, emit the" - echo "two sentinel lines with nothing between. Do not omit the block." + echo "(path = the file's basename; line = integer). $labelinstr" + echo "Then a line \"EVAL>>>\". If you find nothing, emit the two sentinel lines with" + echo "nothing between. Do not omit the block." 
} > "$prompt" cd "$work" "$cx" exec --sandbox read-only --skip-git-repo-check -o /dev/stdout - < "$prompt" From 6d047df2bb1b3a2095b53087bdc8f5af980d0ea3 Mon Sep 17 00:00:00 2001 From: Eve McGivern Date: Tue, 16 Jun 2026 08:24:31 -0500 Subject: [PATCH 2/2] chore(eval): record measured seen baselines for rust/shell/yaml/swift (#15) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Re-measured with the label-fixed runner: all four now 1.00/1.00 (±0.00). rust/shell/yaml were the 0.67 synonym-label artifacts (now resolved by injecting eval-categories); swift's earlier PARTIAL transient cleared. Completes measured SEEN baselines for all 13 languages. java held-out (0.25, persistent overfit FLAG) intentionally NOT committed — fixture-quality issue tracked separately; it keeps its conservative placeholder. Co-Authored-By: Claude Opus 4.8 --- tests/eval/rust/baseline.seen.txt | 4 ++-- tests/eval/shell/baseline.seen.txt | 4 ++-- tests/eval/swift/baseline.seen.txt | 4 ++-- tests/eval/yaml/baseline.seen.txt | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/eval/rust/baseline.seen.txt b/tests/eval/rust/baseline.seen.txt index 02ca7d6..76dc77b 100644 --- a/tests/eval/rust/baseline.seen.txt +++ b/tests/eval/rust/baseline.seen.txt @@ -1,6 +1,6 @@ precision_floor=0.80 recall_floor=0.50 -precision_baseline=0.80 -recall_baseline=0.50 +precision_baseline=1.00 +recall_baseline=1.00 noise_band=0.10 overfit_band=0.15 diff --git a/tests/eval/shell/baseline.seen.txt b/tests/eval/shell/baseline.seen.txt index 02ca7d6..76dc77b 100644 --- a/tests/eval/shell/baseline.seen.txt +++ b/tests/eval/shell/baseline.seen.txt @@ -1,6 +1,6 @@ precision_floor=0.80 recall_floor=0.50 -precision_baseline=0.80 -recall_baseline=0.50 +precision_baseline=1.00 +recall_baseline=1.00 noise_band=0.10 overfit_band=0.15 diff --git a/tests/eval/swift/baseline.seen.txt b/tests/eval/swift/baseline.seen.txt index 02ca7d6..76dc77b 100644 --- 
a/tests/eval/swift/baseline.seen.txt +++ b/tests/eval/swift/baseline.seen.txt @@ -1,6 +1,6 @@ precision_floor=0.80 recall_floor=0.50 -precision_baseline=0.80 -recall_baseline=0.50 +precision_baseline=1.00 +recall_baseline=1.00 noise_band=0.10 overfit_band=0.15 diff --git a/tests/eval/yaml/baseline.seen.txt b/tests/eval/yaml/baseline.seen.txt index 02ca7d6..76dc77b 100644 --- a/tests/eval/yaml/baseline.seen.txt +++ b/tests/eval/yaml/baseline.seen.txt @@ -1,6 +1,6 @@ precision_floor=0.80 recall_floor=0.50 -precision_baseline=0.80 -recall_baseline=0.50 +precision_baseline=1.00 +recall_baseline=1.00 noise_band=0.10 overfit_band=0.15