Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions tests/detect.bats
Original file line number Diff line number Diff line change
Expand Up @@ -1386,4 +1386,8 @@ EOF
grep -q "<<<EVAL" "$root/tests/eval/runners/codex.sh"
! grep -q "Follow eval-mode.md" "$root/tests/eval/runners/claude.sh"
! grep -q "Follow eval-mode.md" "$root/tests/eval/runners/codex.sh"
# each runner must inject the language's valid label set (eval-categories) so the
# model emits grader-matchable labels instead of synonyms (panic vs unwrap-panic)
grep -q "eval-categories" "$root/tests/eval/runners/claude.sh"
grep -q "eval-categories" "$root/tests/eval/runners/codex.sh"
}
16 changes: 13 additions & 3 deletions tests/eval/runners/claude.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,25 @@
set -eu
fixture="${1:?claude.sh: need fixture path}"; lang="${2:?claude.sh: need lang}"
cc="${DEFECT_SCAN_CLAUDE:-claude}"
# Locate the scan engine relative to this script and ask it (read-only
# eval-categories query) for the label vocabulary of $lang. This must happen
# before the cd into the scratch directory further down: a relative $0 would
# no longer resolve from there. Handing the model the exact label strings keeps
# it from emitting synonyms (panic vs unwrap-panic) that the exact-match grader
# cannot score.
repo_root="$(CDPATH= cd "$(dirname "$0")/../../.." && pwd)"
detect="$repo_root/skills/scan/lib/detect.sh"
labels="$(sh "$detect" eval-categories "$lang" 2>/dev/null | tr '\n' ' ')"
# Nothing came back (engine missing, failed, or silent): use the generic wording.
case "$labels" in
  '') labelinstr="category = cat#1..cat#5 or a language-specific label" ;;
  *) labelinstr="Use ONLY these category labels (exact strings, nothing else): $labels" ;;
esac
# Fresh scratch directory for this run; the EXIT trap deletes it when the
# script exits.
work="$(mktemp -d)"
trap 'rm -rf "$work"' EXIT
# Copy in the fixture SOURCE only — never the .expected sidecar — and run
# everything that follows from inside the scratch directory.
cp "$fixture" "$work/"
cd "$work"
# Read-only: deny mutating tools so a runner can never edit the repo under test.
"$cc" -p "Run /defect-scan:scan $(basename "$fixture") --lang $lang.
After the normal report, append EXACTLY ONE machine block for the grader:
a line \"<<<EVAL\", then one line per finding as \"<path>:<line>:<category>\"
(path = the file's basename; line = integer; category = cat#1..cat#5 or a
language-specific label), then a line \"EVAL>>>\". If you find nothing, emit the
two sentinel lines with nothing between. Do not omit the block." \
(path = the file's basename; line = integer). $labelinstr
Then a line \"EVAL>>>\". If you find nothing, emit the two sentinel lines with
nothing between. Do not omit the block." \
--permission-mode plan \
--disallowedTools "Edit,Write,NotebookEdit"
16 changes: 13 additions & 3 deletions tests/eval/runners/codex.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,26 @@
set -eu
fixture="${1:?codex.sh: need fixture path}"; lang="${2:?codex.sh: need lang}"
cx="${DEFECT_SCAN_CODEX:-codex}"
# Locate the scan engine relative to this script and ask it (read-only
# eval-categories query) for the label vocabulary of $lang. This must happen
# before the cd into the scratch directory further down: a relative $0 would
# no longer resolve from there. Handing the model the exact label strings keeps
# it from emitting synonyms (panic vs unwrap-panic) that the exact-match grader
# cannot score.
repo_root="$(CDPATH= cd "$(dirname "$0")/../../.." && pwd)"
detect="$repo_root/skills/scan/lib/detect.sh"
labels="$(sh "$detect" eval-categories "$lang" 2>/dev/null | tr '\n' ' ')"
# Nothing came back (engine missing, failed, or silent): use the generic wording.
case "$labels" in
  '') labelinstr="category = cat#1..cat#5 or a language-specific label" ;;
  *) labelinstr="Use ONLY these category labels (exact strings, nothing else): $labels" ;;
esac
# Scratch state for this run:
#   $work   - fresh directory that receives the copied fixture
#   $prompt - temp file holding the prompt text; mktemp creates it outside $work
# Both are removed on exit. Previously only $work was cleaned up, so every run
# leaked one prompt temp file.
work="$(mktemp -d)"
prompt=""
cleanup() {
  rm -rf -- "$work"
  # $prompt stays empty if the script dies before the prompt file is created.
  [ -z "$prompt" ] || rm -f -- "$prompt"
}
trap cleanup EXIT
cp "$fixture" "$work/" # SOURCE only — never the .expected sidecar
prompt="$(mktemp)"
{
echo "Run /defect-scan:scan on the file in this directory with --lang $lang."
echo "After the normal report, append EXACTLY ONE machine block for the grader:"
echo 'a line "<<<EVAL", then one line per finding as "<path>:<line>:<category>"'
echo "(path = the file's basename; line = integer; category = cat#1..cat#5 or a"
echo "language-specific label), then a line \"EVAL>>>\". If you find nothing, emit the"
echo "two sentinel lines with nothing between. Do not omit the block."
echo "(path = the file's basename; line = integer). $labelinstr"
echo "Then a line \"EVAL>>>\". If you find nothing, emit the two sentinel lines with"
echo "nothing between. Do not omit the block."
} > "$prompt"
# Run from inside the scratch directory, which holds only the copied fixture,
# so "the file in this directory" in the prompt refers to that fixture. The
# prompt text is supplied on stdin from the $prompt temp file.
# NOTE(review): the flag meanings (read-only sandbox, no git-repo requirement,
# "-o /dev/stdout" writing the result to stdout, "-" = take the prompt from
# stdin) are inferred from their names — confirm against the codex CLI docs.
cd "$work"
"$cx" exec --sandbox read-only --skip-git-repo-check -o /dev/stdout - < "$prompt"
Expand Down
4 changes: 2 additions & 2 deletions tests/eval/rust/baseline.seen.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
precision_floor=0.80
recall_floor=0.50
precision_baseline=0.80
recall_baseline=0.50
precision_baseline=1.00
recall_baseline=1.00
noise_band=0.10
overfit_band=0.15
4 changes: 2 additions & 2 deletions tests/eval/shell/baseline.seen.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
precision_floor=0.80
recall_floor=0.50
precision_baseline=0.80
recall_baseline=0.50
precision_baseline=1.00
recall_baseline=1.00
noise_band=0.10
overfit_band=0.15
4 changes: 2 additions & 2 deletions tests/eval/swift/baseline.seen.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
precision_floor=0.80
recall_floor=0.50
precision_baseline=0.80
recall_baseline=0.50
precision_baseline=1.00
recall_baseline=1.00
noise_band=0.10
overfit_band=0.15
4 changes: 2 additions & 2 deletions tests/eval/yaml/baseline.seen.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
precision_floor=0.80
recall_floor=0.50
precision_baseline=0.80
recall_baseline=0.50
precision_baseline=1.00
recall_baseline=1.00
noise_band=0.10
overfit_band=0.15
Loading