diff --git a/tests/detect.bats b/tests/detect.bats index 561fc4e..ffd31ff 100644 --- a/tests/detect.bats +++ b/tests/detect.bats @@ -1386,4 +1386,8 @@ EOF grep -q "<</dev/null | tr '\n' ' ')" +if [ -n "$labels" ]; then + labelinstr="Use ONLY these category labels (exact strings, nothing else): $labels" +else + labelinstr="category = cat#1..cat#5 or a language-specific label" +fi work="$(mktemp -d)"; trap 'rm -rf "$work"' EXIT cp "$fixture" "$work/" # SOURCE only — never the .expected sidecar cd "$work" @@ -12,8 +22,8 @@ cd "$work" "$cc" -p "Run /defect-scan:scan $(basename "$fixture") --lang $lang. After the normal report, append EXACTLY ONE machine block for the grader: a line \"<<::\" -(path = the file's basename; line = integer; category = cat#1..cat#5 or a -language-specific label), then a line \"EVAL>>>\". If you find nothing, emit the -two sentinel lines with nothing between. Do not omit the block." \ +(path = the file's basename; line = integer). $labelinstr +Then a line \"EVAL>>>\". If you find nothing, emit the two sentinel lines with +nothing between. Do not omit the block." \ --permission-mode plan \ --disallowedTools "Edit,Write,NotebookEdit" diff --git a/tests/eval/runners/codex.sh b/tests/eval/runners/codex.sh index fc42b12..cb81d84 100755 --- a/tests/eval/runners/codex.sh +++ b/tests/eval/runners/codex.sh @@ -5,6 +5,16 @@ set -eu fixture="${1:?codex.sh: need fixture path}"; lang="${2:?codex.sh: need lang}" cx="${DEFECT_SCAN_CODEX:-codex}" +# Resolve the engine + this language's valid label set BEFORE cd-ing into the temp dir. +# Telling the model the exact label vocabulary stops it inventing synonyms (panic vs +# unwrap-panic) that the exact-match grader can't score. eval-categories is read-only. +detect="$(CDPATH= cd "$(dirname "$0")/../../.." && pwd)/skills/scan/lib/detect.sh" +labels="$(sh "$detect" eval-categories "$lang" 2>/dev/null | tr '\n' ' ')" +if [ -n "$labels" ]; then + labelinstr="Use ONLY these category labels (exact strings, nothing else): $labels" +else + labelinstr="category = cat#1..cat#5 or a language-specific label" +fi work="$(mktemp -d)"; trap 'rm -rf "$work"' EXIT cp "$fixture" "$work/" # SOURCE only — never the .expected sidecar prompt="$(mktemp)" @@ -12,9 +22,9 @@ prompt="$(mktemp)" echo "Run /defect-scan:scan on the file in this directory with --lang $lang." echo "After the normal report, append EXACTLY ONE machine block for the grader:" echo 'a line "<<::"' - echo "(path = the file's basename; line = integer; category = cat#1..cat#5 or a" - echo "language-specific label), then a line \"EVAL>>>\". If you find nothing, emit the" - echo "two sentinel lines with nothing between. Do not omit the block." + echo "(path = the file's basename; line = integer). $labelinstr" + echo "Then a line \"EVAL>>>\". If you find nothing, emit the two sentinel lines with" + echo "nothing between. Do not omit the block." } > "$prompt" cd "$work" "$cx" exec --sandbox read-only --skip-git-repo-check -o /dev/stdout - < "$prompt" diff --git a/tests/eval/rust/baseline.seen.txt b/tests/eval/rust/baseline.seen.txt index 02ca7d6..76dc77b 100644 --- a/tests/eval/rust/baseline.seen.txt +++ b/tests/eval/rust/baseline.seen.txt @@ -1,6 +1,6 @@ precision_floor=0.80 recall_floor=0.50 -precision_baseline=0.80 -recall_baseline=0.50 +precision_baseline=1.00 +recall_baseline=1.00 noise_band=0.10 overfit_band=0.15 diff --git a/tests/eval/shell/baseline.seen.txt b/tests/eval/shell/baseline.seen.txt index 02ca7d6..76dc77b 100644 --- a/tests/eval/shell/baseline.seen.txt +++ b/tests/eval/shell/baseline.seen.txt @@ -1,6 +1,6 @@ precision_floor=0.80 recall_floor=0.50 -precision_baseline=0.80 -recall_baseline=0.50 +precision_baseline=1.00 +recall_baseline=1.00 noise_band=0.10 overfit_band=0.15 diff --git a/tests/eval/swift/baseline.seen.txt b/tests/eval/swift/baseline.seen.txt index 02ca7d6..76dc77b 100644 --- a/tests/eval/swift/baseline.seen.txt +++ b/tests/eval/swift/baseline.seen.txt @@ -1,6 +1,6 @@ precision_floor=0.80 recall_floor=0.50 -precision_baseline=0.80 -recall_baseline=0.50 +precision_baseline=1.00 +recall_baseline=1.00 noise_band=0.10 overfit_band=0.15 diff --git a/tests/eval/yaml/baseline.seen.txt b/tests/eval/yaml/baseline.seen.txt index 02ca7d6..76dc77b 100644 --- a/tests/eval/yaml/baseline.seen.txt +++ b/tests/eval/yaml/baseline.seen.txt @@ -1,6 +1,6 @@ precision_floor=0.80 recall_floor=0.50 -precision_baseline=0.80 -recall_baseline=0.50 +precision_baseline=1.00 +recall_baseline=1.00 noise_band=0.10 overfit_band=0.15