From 5fdf012acbf7f3e3713b6c6693a8c1c6e90a5e48 Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Sat, 13 Jun 2026 11:19:16 -0600 Subject: [PATCH] =?UTF-8?q?feat(evals):=20adopt=20thulr=200.1.3=20?= =?UTF-8?q?=E2=80=94=20duel,=20review=20calibration,=20pareto,=20named=20c?= =?UTF-8?q?riteria?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - eval:compare --pairwise runs thulr's position-swapped `duel` (relative win-rate judging; flips = judge position bias) over one self-contained trace per arm, replacing the hand-rolled in-process pairwise judge - human-review calibration: `eval:review` records SME verdicts; `eval` auto-folds them into `calibrate --reviews` (judge-vs-human TPR/TNR) - `eval:pareto` ranks failure modes across stored traces (free, no judge calls) - multi-dimension named criteria (thulr.criteria.) on the review cases (evidence_quality, impact_explanation) for score headroom; opt-in --score-guardrail= - eval:compare: configurable --timeout (subject + judge agents) and comma-separated --filter; judge.mjs no longer hardcodes a 120s judge cap - broaden the session-cache existence-check scorer (fixes a confirmed false negative where the judge was right and the regex missed the phrasing) - bridge: duelArgs/paretoArgs/reviewArgs/formatDuelSummary + tests; shared arg parser; 0.1.2 -> 0.1.3 docs/stamps evals/ is not packaged (.npmignore), so this is dev-tooling only — no version bump. 
Co-Authored-By: Claude Opus 4.8 --- CHANGELOG.md | 10 +++ evals/README.md | 80 ++++++++++++++++--- evals/args.mjs | 27 +++++++ evals/cases.mjs | 20 ++++- evals/compare.mjs | 152 +++++++++++++++++++++++++------------ evals/judge.mjs | 47 +----------- evals/pareto.mjs | 45 +++++++++++ evals/review.mjs | 56 ++++++++++++++ evals/run.mjs | 31 ++++++-- evals/thulr.mjs | 144 +++++++++++++++++++++++++++++++++-- package.json | 2 + tests/eval-args.test.ts | 23 ++++++ tests/thulr-bridge.test.ts | 83 +++++++++++++++++++- 13 files changed, 600 insertions(+), 120 deletions(-) create mode 100644 evals/args.mjs create mode 100644 evals/pareto.mjs create mode 100644 evals/review.mjs create mode 100644 tests/eval-args.test.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index d5cf69e..2f722fe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,16 @@ that must agree are `package.json`, `PI_FLOWS_VERSION` in summary now prints thulr's numeric score, pass-rate, and efficiency deltas from `thulr gate --json` before the human gate report, and `--noise-band=` makes guardrail tolerance explicit. +- Evals: adopt thulr 0.1.3. `npm run eval:compare -- --pairwise` now runs thulr's + calibrated, position-swapped **`duel`** (relative win-rate judging, flips reported + as judge position bias) over one self-contained trace per arm, replacing the + harness's hand-rolled in-process pairwise judge. `npm run eval:review` records + human SME verdicts and `npm run eval` folds them into calibration as a second + ground-truth axis (`--reviews`; judge-vs-human TPR/TNR), auto-discovering + `.thulr/reviews/.reviews.json`. `npm run eval:pareto` ranks failure modes + across stored traces (which failure on which prompt/config version to fix first). + Calibration also surfaces thulr 0.1.3's judge-trust gate: a judge blind in either + direction downgrades a clean gate PASS to WARN. 
- Vote/orchestrate quality: same-agent/model voters now receive complementary stances so ballots are not identical prompt replays, and orchestrate workers now see the overall goal/contract alongside their assigned subtask before diff --git a/evals/README.md b/evals/README.md index 5e63ad6..e7f5a2c 100644 --- a/evals/README.md +++ b/evals/README.md @@ -38,6 +38,7 @@ npm run eval -- --judge-model=anthropic/claude-opus-4-8 # thulr judge model (d npm run eval -- --judge-bin=/path/to/judge-wrapper # override thulr's judge command npm run eval -- --samples=3 # judge each case 3×: majority verdict, mean score, flake warnings (3× judge spend) npm run eval -- --eval-set=.thulr/eval-sets/release.json # overlay promoted criteria / guardrail authority +npm run eval -- --reviews=.thulr/reviews/thulr-trace.reviews.json # fold human SME verdicts into calibration (judge-vs-human TPR/TNR) npm run eval -- --efficiency-guardrail=cost_usd --efficiency-guardrail=tokens # fail on spend/token regressions npm run eval -- --noise-band=0.10 # regression tolerance for score/pass-rate/efficiency guardrails (default 0.05) npm run eval -- --cap=1.00 # per-case USD ceiling on flow delegations (default 0.50) @@ -76,7 +77,7 @@ verifies the binary, workspace, store, and that thulr's judge binary `pi` resolv 3. `thulr label-failures --trace ` applies thulr's failure-mode ontology and writes labels for calibration/triage. 4. `thulr judge --trace ` grades each case's answer against its inline - `criterion` → an EvalRun. thulr (0.1.2) reads everything from the trace — no + `criterion` → an EvalRun. thulr (0.1.3) reads everything from the trace — no separate cases-manifest or labels files. 
With `--samples=N` each case is judged N times and aggregated (majority verdict, ties fail safe; mean score) — the EvalRun's `score_stddev` then reports **judge noise** instead of cross-case @@ -86,6 +87,12 @@ verifies the binary, workspace, store, and that thulr's judge binary `pi` resolv — how well the judge's verdicts track the inline deterministic labels, with failure labels included in the report. (An uncalibrated judge can silently certify regressions; this is the calibration the old single-judge setup lacked.) + Record human SME verdicts with `npm run eval:review` and the harness folds them + in as a second ground-truth axis (`--reviews`; judge-vs-human TPR/TNR) — see + [Human review & failure triage](#human-review--failure-triage). thulr 0.1.3 also + queues every judge/ground-truth disagreement onto `thulr queue` and feeds this + calibration into the gate: a judge blind in either direction (TPR or TNR 0% over + labeled cases) downgrades a clean PASS to WARN with the dimension named. 6. Before gating, pi-flows writes `.thulr/runs/candidate.gate.json`, which is the judged EvalRun with calibration canaries filtered out and summaries recomputed. 
`thulr gate` compares that gate candidate to @@ -224,7 +231,7 @@ and the same cross-model judge: ```bash npm run eval:compare # all cases, both arms -npm run eval:compare -- --pairwise # add order-controlled pairwise judging (the sensitive metric) +npm run eval:compare -- --pairwise # add thulr's relative duel (the sensitive metric) npm run eval:compare -- --filter=vote # scope to keep cost down (runs both arms per case) npm run eval:compare -- --write=evals/compare.json npm run eval:compare -- --dry-run # wiring smoke, no model @@ -235,14 +242,50 @@ PI_FLOWS_TRACE_FILE=/tmp/ab.jsonl npm run eval:compare -- --pairwise --write=eva npm run trace:report -- /tmp/ab.jsonl ``` -`eval:compare` keeps its own **order-controlled pairwise** judge (run twice with -positions swapped, scored a win only when both orderings agree, told *not* to -reward length) — the sensitive head-to-head metric for small gaps that thulr's -absolute per-dimension scoring can't resolve. A few objective checks are -pi-flows-only by construction (route dispatch, the same-model vote warning); plain -pi can't satisfy them, so read those as *capabilities flows adds*, not plain losses. -Give a case a `baselinePrompt` when its flow params encode goal info outside `task` -(e.g. a return contract) so the plain arm is graded on the same goal. +With `--pairwise` the harness emits one self-contained trace per arm and shells out +to **`thulr duel`** (0.1.3) — thulr's calibrated relative judge. It pairs the arms +by case id, judges each shared case **twice with the answers swapped**, and counts a +win only when both orderings agree; opposite preferences are a **flip** (judge +position bias), reported as judge noise and excluded from the win rate. This is the +sensitive head-to-head metric for small gaps that thulr's absolute per-dimension +scoring can't resolve — and it replaces the harness's old in-process pairwise judge. 
+The duel spends two judge calls per eligible case (both arms must have reached the +model) and persists a `thulr.duel_report.v1` at `.thulr/runs/compare-duel.json`. A +few objective checks are pi-flows-only by construction (route dispatch, the +same-model vote warning); plain pi can't satisfy them, so read those as *capabilities +flows adds*, not plain losses. Give a case a `baselinePrompt` when its flow params +encode goal info outside `task` (e.g. a return contract) so the plain arm is graded +on the same goal. + +## Human review & failure triage + +Two free (no judge tokens) thulr 0.1.3 workflows close the loop on judged runs. + +**Record human verdicts** so calibration measures the judge against a person, not +only the deterministic labels: + +```bash +npm run eval:review -- --list # reviewed / unreviewed case ids for the last trace +npm run eval:review -- --case single-answer-quality-judged --verdict pass +npm run eval:review -- --case route-classifies-bug-to-recon --verdict fail \ + --failure-mode routing.wrong_agent --note "should have gone to recon" +``` + +Verdicts land in `.thulr/reviews/thulr-trace.reviews.json` — the path the next +`npm run eval` auto-discovers — so a recorded verdict needs no flag on the next run. +`calibrate` then reports a **human** section (judge-vs-human TPR/TNR), and human +verdicts take precedence over auto labels for the cases they cover. Point at an +explicit set with `npm run eval -- --reviews=<path>`. 
+ +**Rank failure modes across every stored trace** — which failure on which prompt or +config version to fix first, joining deterministic labels, human reviews, and stored +EvalRun scores: + +```bash +npm run eval:pareto # rank by prompt version over evals/thulr-trace.jsonl +npm run eval:pareto -- --by=config-version # split by subject config instead +npm run eval:pareto -- --limit=10 # top N rows +``` ## Experiments: champion/challenger (and the optimizer) @@ -302,6 +345,9 @@ Append to `cases.mjs`: params: { agent: "recon", task: "…" }, // the flow tool input cwd: "/optional/working/dir", criterion: "One strict, literal statement a correct answer must satisfy.", // graded by thulr's judge + namedCriteria: { // optional: extra judge dimensions (0.1.3) + evidence_quality: "Each claim cites the specific code it refers to.", + }, score(result, ctx) { // objective, deterministic check const ok = /expected/.test(result.content[0].text); return { pass: ok, score: ok ? 1 : 0, notes: "…" }; @@ -316,6 +362,15 @@ single literal statement of what a correct answer must say; thulr grades the ans text against it on a different vendor than the subject. Always provide a `mock` so `--dry-run` can exercise the runner — and the artifact emission — offline. +**Named criteria (`namedCriteria`)** add thulr 0.1.3 multi-dimension judging: each +`{ dimension: "criterion text" }` entry is emitted as `thulr.criteria.<dimension>` +on the graded span and judged into **its own dimension** alongside the required +`criterion` — with its own pass-rate, score delta, and calibration. Use them for +*orthogonal* quality axes (e.g. `evidence_quality`, `impact_explanation`) so a +near-saturated case still produces a gradient. Dimension names must be non-empty, +whitespace-free, and not `criterion`. They are observed by default; gate one with +`--score-guardrail=<dimension>` once it looks stable. 
+ ### Hard cases (`hard: true`) For **score-tracked** cases — ones that intentionally land mid-scale so a better @@ -327,7 +382,10 @@ the run to be green — only a regression in their mean score blocks. Keep the ` a *complete* answer so `--dry-run` stays green. See `review-finds-all-webhook-defects` (4 defects) and `review-finds-session-cache-defects` (3 defects) — multi-defect code reviews where a typical pass misses the subtler ones (signature verification, TTL -validation), so a sharper prompt has room to climb. +validation), so a sharper prompt has room to climb. Both also carry `namedCriteria` +(`evidence_quality`, `impact_explanation`) so the judge grades *how well* each defect +is explained, not just whether all were found — extra headroom on cases that would +otherwise saturate at "found them all." A *frontier* subject model exhausts these small fixtures (it finds every defect), so the score pins at 1.0 with no headroom. Rather than pin a different model per case, diff --git a/evals/args.mjs b/evals/args.mjs new file mode 100644 index 0000000..9bea569 --- /dev/null +++ b/evals/args.mjs @@ -0,0 +1,27 @@ +// Tiny argv parser for the eval CLI wrappers (review.mjs, pareto.mjs). Accepts both +// `--name value` (the style thulr's own CLI uses) and `--name=value` (the style the +// rest of the harness uses), plus bare boolean flags (`--list`, `--json`). Returns a +// plain object keyed by flag name. A token that itself starts with `--` is never +// consumed as a value, so a bare flag immediately before another flag stays boolean. +// Repeated flags keep the last value; positionals are ignored. 
+export function parseArgs(argv) { + const opts = {}; + for (let i = 0; i < argv.length; i++) { + const a = argv[i]; + if (!a.startsWith("--")) continue; + const eq = a.indexOf("="); + if (eq !== -1) { + opts[a.slice(2, eq)] = a.slice(eq + 1); + continue; + } + const name = a.slice(2); + const next = argv[i + 1]; + if (next !== undefined && !next.startsWith("--")) { + opts[name] = next; + i += 1; + } else { + opts[name] = true; + } + } + return opts; +} diff --git a/evals/cases.mjs b/evals/cases.mjs index 3a47126..eed5839 100644 --- a/evals/cases.mjs +++ b/evals/cases.mjs @@ -148,6 +148,13 @@ export const CASES = [ cwd: fixturesRepo, baselinePrompt: "Review billing-webhook.js for ALL production-correctness defects, not just the most obvious one. Name each distinct defect and why it matters.", criterion: "The review identifies ALL FOUR distinct defects: (1) recordPayment references `ledger`, which is never declared/initialized, so every call throws a ReferenceError (500); (2) no idempotency/deduplication, so a duplicate or retried delivery double-counts the payment; (3) no verification of the webhook's signature/authenticity, so a forged request is accepted as a real payment; (4) no input validation or error handling, so a malformed `req.body.data.object` throws unhandled and 500s. Fewer than four is incomplete.", + // Orthogonal quality dimensions (0.1.3 multi-dimension judging) — graded + // alongside the completeness `criterion` above so a review that names defects + // but explains them shallowly scores lower here, giving the suite headroom. + namedCriteria: { + evidence_quality: "Every defect named is pinned to the specific code that causes it (e.g. `recordPayment`/the undeclared `ledger`, `req.body.data.object`, the missing signature check) rather than described in vague or generic terms.", + impact_explanation: "Every defect states its concrete production impact (e.g. 
a ReferenceError 500 on every call, double-counted payments on a retried webhook delivery, a forged request accepted as a real payment, an unhandled 500 on a malformed body), not merely that something is wrong.", + }, score(r) { const body = text(r); const ledger = /ledger/i.test(body) && /(never (declared|defined|initiali)|undeclared|undefined|referenceerror|not (declared|defined|initiali))/i.test(body); @@ -166,9 +173,20 @@ export const CASES = [ cwd: fixturesRepo, baselinePrompt: "Review session-cache.js for ALL correctness and reliability defects, not just the most obvious one. Name each distinct defect and why it matters.", criterion: "The review identifies ALL THREE distinct defects: (1) getSession reads `entry.expiresAt` without checking the id exists, so an unknown/missing id dereferences `undefined` and throws a TypeError; (2) expired entries are never evicted (getSession returns null but leaves them), so the store grows unbounded — a memory leak; (3) ttlSeconds is never validated, so a missing, NaN, or negative TTL produces a broken/garbage expiry. Fewer than three is incomplete.", + // Orthogonal quality dimensions (0.1.3 multi-dimension judging) — graded + // alongside the completeness `criterion` above so a shallow-but-complete + // review scores lower here, giving the suite headroom. + namedCriteria: { + evidence_quality: "Every defect named is pinned to the specific code that causes it (e.g. `getSession` dereferencing `entry.expiresAt`, the never-evicted `sessions` entries, the unvalidated `ttlSeconds`) rather than described in vague or generic terms.", + impact_explanation: "Every defect states its concrete impact (e.g. 
a TypeError when the id is unknown, unbounded memory growth from expired entries never being evicted, a broken expiry from a missing/NaN/negative TTL), not merely that something is wrong.", + }, score(r) { const body = text(r); - const existence = /(entry|session|id)[^.]{0,40}(undefined|missing|absent|does(n'?t| not) exist|not (found|present|exist)|no[^.]{0,10}(existence|null|presence) check)|throws?[^.]{0,30}(unknown|missing|absent|undefined|no .{0,8}(id|session|entry))|typeerror|crash[^.]{0,20}(missing|unknown|absent|undefined)/i.test(body); + // Broadened after a real false negative: the model wrote "without a miss + // guard / unknown id throws / cache miss can crash", none of which the old + // pattern matched. Match the concept (a missing/unknown id or cache miss + // dereferences/throws, or a missing existence guard), not one phrasing. + const existence = /\btypeerror\b|(unknown|missing|absent|non-?existent|invalid|unrecogni[sz]ed)[^.]{0,30}\b(id|key|entry|session|lookup)\b|\bcache[- ]?miss\b|(entry|session|getsession)[^.]{0,40}(undefined|null|throw|crash|deref|not[^.]{0,8}(exist|found|present))|(no|missing|without|lacks?|add|needs?)[^.]{0,25}(existence|presence|null|miss|nil)?[- ]?(guard|check)|\bmiss[- ]?guard\b/i.test(body); const leak = /memory leak|never (evict|delet|remov|clean|free|purg)|unbounded|grow[^.]{0,16}(forever|unbounded|indefinit|without bound)|not[^.]{0,8}(evict|delet|remov|clean|purg)|\bleak/i.test(body); const ttl = /ttlseconds|\bttl\b/i.test(body) && /validat|negativ|\bnan\b|invalid|unchecked|non-numeric|immortal|never expir/i.test(body); const found = [existence && "no-existence-check", leak && "memory-leak", ttl && "no-ttl-validation"].filter(Boolean); diff --git a/evals/compare.mjs b/evals/compare.mjs index 81415b8..8cfc9eb 100644 --- a/evals/compare.mjs +++ b/evals/compare.mjs @@ -7,8 +7,9 @@ // Then reports per-case and aggregate deltas. 
// // npm run eval:compare # all cases, both arms, on your pi default -// npm run eval:compare -- --pairwise # add order-controlled pairwise judging (the sensitive metric) -// npm run eval:compare -- --filter=vote # scope to keep cost down +// npm run eval:compare -- --pairwise # add thulr's relative duel (the sensitive metric) +// npm run eval:compare -- --filter=vote,route # scope to a comma-separated set of name substrings +// npm run eval:compare -- --timeout=300000 # per-agent ms cap (default 120000) for heavy review/evaluate cases // npm run eval:compare -- --model=openai-codex/gpt-5.5 --judge-model=anthropic/claude-sonnet-4-6 // npm run eval:compare -- --write=evals/compare.json // npm run eval:compare -- --dry-run # wiring smoke (canned results, no model) @@ -17,19 +18,22 @@ // the flows arm (diagnose WHY an arm scored as it did) — the flow tool honors that // env var, no flag needed. // -// Absolute judge scores cluster and can't resolve small gaps; --pairwise shows the -// judge both answers and asks which is better (positions swapped to cancel order -// bias), which is the sensitive, fair head-to-head. Some objective checks are -// pi-flows-only by construction (route dispatch, the same-model vote warning) and -// plain pi cannot satisfy them — that gap IS the point for those cases. +// Absolute judge scores cluster and can't resolve small gaps. With --pairwise the +// harness emits one self-contained trace per arm and shells out to `thulr duel`, +// thulr's calibrated relative judge: it pairs the arms by case id, judges each case +// twice with the answers swapped to cancel order bias, and counts a win only when +// both orderings agree (a flip is reported as judge position bias, not a win). This +// replaces the old in-process pairwise judge. Some objective checks are pi-flows-only +// by construction (route dispatch, the same-model vote warning) and plain pi cannot +// satisfy them — that gap IS the point for those cases. 
import { execFileSync } from "node:child_process"; -import { existsSync, writeFileSync } from "node:fs"; +import { existsSync, mkdirSync, writeFileSync } from "node:fs"; import { join, resolve } from "node:path"; import { runPlainPi } from "./baseline-pi.mjs"; import { CASES } from "./cases.mjs"; -import { judgePairwise } from "./judge.mjs"; import { answerText, caseCwd, flowTool, scoreArm, DEFAULT_EVAL_MODEL } from "./lib.mjs"; import { injectModel } from "./model-injection.mjs"; +import * as thulr from "./thulr.mjs"; const dotenvPath = join(process.cwd(), ".env"); if (existsSync(dotenvPath)) { @@ -48,10 +52,29 @@ const useAgentModels = ["agent", "default", ""].includes(model); const subjectModel = useAgentModels ? undefined : model; const judgeModel = flag("judge-model", null) ?? process.env.PI_FLOWS_JUDGE_MODEL ?? "anthropic/claude-haiku-4-5"; const capUsd = Number(flag("cap", "1.00")); +// Per-agent timeout (ms) for BOTH the subject arms and the cross-model judge. Default +// 120s; raise it for the heavy review/evaluate cases whose arms (or the judge grading +// a long answer) legitimately run minutes — a too-low cap surfaces as ⚠ child timeout +// and drops the case from the duel. Override: --timeout=300000 or PI_FLOWS_TIMEOUT_MS. +const timeoutMs = Number(flag("timeout", process.env.PI_FLOWS_TIMEOUT_MS ?? "120000")); const dryRun = args.includes("--dry-run"); const pairwise = args.includes("--pairwise"); +// --filter is a comma-separated set of name substrings; a case matches if it contains ANY term. const filter = flag("filter", ""); +const filterTerms = filter.split(",").map((t) => t.trim()).filter(Boolean); const writeArtifact = flag("write", ""); +// Same judge-bin resolution as the main harness: the committed wrapper keeps +// extension-provided model providers available to thulr's judge (and duel) calls. +const defaultJudgeBin = "scripts/thulr-judge-pi.sh"; +const configuredJudgeBin = flag("judge-bin", null) ?? process.env.THULR_JUDGE_BIN ?? 
null; +const judgeBin = configuredJudgeBin ?? (existsSync(resolve(process.cwd(), defaultJudgeBin)) ? defaultJudgeBin : null); + +// Regenerated duel artifacts (under the gitignored .thulr/ store): one trace per arm +// plus the persisted thulr.duel_report.v1. +const RUNS_DIR = resolve(process.cwd(), ".thulr/runs"); +const FLOWS_TRACE = join(RUNS_DIR, "compare-flows.jsonl"); +const PLAIN_TRACE = join(RUNS_DIR, "compare-plain.jsonl"); +const DUEL_OUT = join(RUNS_DIR, "compare-duel.json"); function preflight() { if (dryRun) return true; @@ -72,7 +95,7 @@ async function runArm(kind, testCase, flow, signal) { const cwd = caseCwd(testCase, { dryRun }); const flowCtx = { cwd, hasUI: false, ui: { confirm: async () => true, notify: () => undefined } }; const ctx = { flow, model: subjectModel, dryRun, flowCtx }; - const judgeCtx = { flow, model: judgeModel, dryRun, flowCtx, maxCostUsd: capUsd }; + const judgeCtx = { flow, model: judgeModel, dryRun, flowCtx, maxCostUsd: capUsd, timeoutMs }; const startedAt = Date.now(); let result; @@ -80,7 +103,7 @@ async function runArm(kind, testCase, flow, signal) { if (dryRun) { result = testCase.mock; } else if (kind === "flows") { - const params = { ...(useAgentModels ? structuredClone(testCase.params) : injectModel(testCase.params, model)), traceLabel: testCase.name, maxCostUsd: testCase.params.maxCostUsd ?? capUsd, timeoutMs: testCase.params.timeoutMs ?? 120000 }; + const params = { ...(useAgentModels ? structuredClone(testCase.params) : injectModel(testCase.params, model)), traceLabel: testCase.name, maxCostUsd: testCase.params.maxCostUsd ?? capUsd, timeoutMs: testCase.params.timeoutMs ?? timeoutMs }; try { result = await flow.execute(`cmp:flows:${testCase.name}`, params, signal, undefined, flowCtx); } catch (error) { @@ -90,7 +113,7 @@ async function runArm(kind, testCase, flow, signal) { // Plain arm: the raw task a user would type to pi with no flows installed. const task = testCase.baselinePrompt ?? 
testCase.params.task; try { - result = await runPlainPi({ task, cwd, model: subjectModel, timeoutMs: 120000, signal }); + result = await runPlainPi({ task, cwd, model: subjectModel, timeoutMs, signal }); } catch (error) { thrown = error; } @@ -100,19 +123,21 @@ async function runArm(kind, testCase, flow, signal) { return { ...arm, durationMs: Date.now() - startedAt, answer: answerText(result) }; } -// Order-bias-controlled pairwise verdict: judge twice with the answers swapped, and -// only call a winner when BOTH orderings agree — otherwise it's a tie (a flip means -// the judge is keying on position, not content). -async function pairwiseVerdict(judgeCtx, { criteria, flowsAnswer, plainAnswer }) { - const c1 = await judgePairwise(judgeCtx, { criteria, answerA: flowsAnswer, answerB: plainAnswer }); - const c2 = await judgePairwise(judgeCtx, { criteria, answerA: plainAnswer, answerB: flowsAnswer }); - const cost = (c1.cost ?? 0) + (c2.cost ?? 0); - const infra = c1.infra ?? c2.infra ?? null; - if (infra) return { winner: "error", v1: "error", v2: "error", cost, infra }; - const v1 = c1.winner === "TIE" ? "tie" : c1.winner === "A" ? "flows" : "plain"; // flows was A - const v2 = c2.winner === "TIE" ? "tie" : c2.winner === "A" ? "plain" : "flows"; // flows was B - const winner = v1 === "flows" && v2 === "flows" ? "flows" : v1 === "plain" && v2 === "plain" ? "plain" : "tie"; - return { winner, v1, v2, cost, infra: null }; +// Emit a self-contained thulr trace for one arm: one case span per eligible row, +// carrying the same case id and criterion in both arms so `thulr duel` can pair +// them. `pick` selects the arm (flows/plain) whose answer this trace grades. 
+function emitArmTrace(file, eligibleRows, pick) { + thulr.startTrace(file); + for (const r of eligibleRows) { + thulr.appendCaseSpans(file, { + name: r.name, + answer: pick(r).answer, + criterion: r.criterion, + task: r.task, + model: subjectModel, + endMs: Date.now(), + }); + } } function armLine(label, arm) { @@ -122,10 +147,18 @@ function armLine(label, arm) { const pickArm = (a) => ({ judgePass: a.judged.pass, judgeScore: a.judged.score, objPass: a.objective.pass, objScore: a.objective.score, cost: a.cost, durationMs: a.durationMs, infra: a.reachedModel ?? null, answer: (a.answer ?? "").slice(0, 1000) }); +const duelLabel = (outcome) => ( + outcome === "wina" ? "▲ flows" + : outcome === "winb" ? "▼ plain" + : outcome === "flip" ? "⚠ flip (judge position bias)" + : outcome === "tie" ? "= tie" + : "— (not judged)" +); + async function main() { if (!preflight()) process.exit(2); - const selected = CASES.filter((c) => !filter || c.name.includes(filter)); + const selected = CASES.filter((c) => filterTerms.length === 0 || filterTerms.some((t) => c.name.includes(t))); if (selected.length === 0) { console.error(`No cases match --filter=${filter}. Available: ${CASES.map((c) => c.name).join(", ")}`); process.exit(2); @@ -134,21 +167,17 @@ async function main() { const flow = flowTool(); const signal = new AbortController().signal; const trace = process.env.PI_FLOWS_TRACE_FILE ? ` · trace ${process.env.PI_FLOWS_TRACE_FILE}` : ""; - console.log(`pi-flows A/B (flows vs plain pi) · subject ${useAgentModels ? "(agent frontmatter)" : model} · judge ${dryRun ? "(skipped)" : judgeModel}${pairwise ? " +pairwise" : ""} · cap $${capUsd.toFixed(2)}/case${trace}${dryRun ? " · DRY RUN" : ""}\n`); + console.log(`pi-flows A/B (flows vs plain pi) · subject ${useAgentModels ? "(agent frontmatter)" : model} · judge ${dryRun ? "(skipped)" : judgeModel}${pairwise ? " +duel" : ""} · cap $${capUsd.toFixed(2)}/case · timeout ${Math.round(timeoutMs / 1000)}s/agent${trace}${dryRun ? 
" · DRY RUN" : ""}\n`); const rows = []; - let pairwiseCost = 0; for (const testCase of selected) { const flows = await runArm("flows", testCase, flow, signal); const plain = await runArm("plain", testCase, flow, signal); - let pv = null; - if (pairwise && !flows.reachedModel && !plain.reachedModel && testCase.criterion) { - const judgeCtx = { flow, model: judgeModel, dryRun, flowCtx: { cwd: process.cwd(), hasUI: false, ui: { confirm: async () => true, notify: () => undefined } }, maxCostUsd: capUsd }; - pv = await pairwiseVerdict(judgeCtx, { criteria: testCase.criterion, flowsAnswer: flows.answer, plainAnswer: plain.answer }); - pairwiseCost += pv.cost ?? 0; - } - rows.push({ name: testCase.name, flows, plain, pv }); + // Eligible for the duel only when BOTH arms reached the model (an infra miss + // has no real answer to compare) and the case states a criterion to judge. + const duelEligible = !flows.reachedModel && !plain.reachedModel && Boolean(testCase.criterion); + rows.push({ name: testCase.name, criterion: testCase.criterion, task: testCase.baselinePrompt ?? testCase.params.task, flows, plain, duelEligible, outcome: null }); const dj = (flows.judged.score ?? 0) - (plain.judged.score ?? 0); const arrow = dj > 0.001 ? "▲ flows" : dj < -0.001 ? "▼ plain" : "= tie"; @@ -156,13 +185,33 @@ async function main() { console.log(armLine("flows", flows)); console.log(armLine("plain", plain)); console.log(` judge Δ ${fixed(dj)} ${arrow}`); - if (pv) { - const label = pv.winner === "flows" ? "▲ flows" : pv.winner === "plain" ? "▼ plain" : pv.winner === "error" ? `⚠ ${pv.infra}` : "= tie"; - console.log(` pairwise ${label} (swap: ${pv.v1}, ${pv.v2})`); - } console.log(""); } + // Relative head-to-head via thulr's calibrated, position-swapped duel. One + // `thulr duel` over the two arm traces does every swap-controlled comparison at + // once (paired by case id), replacing the harness's old in-process pairwise judge. 
+ let duelReport = null; + if (pairwise && !dryRun) { + const eligible = rows.filter((r) => r.duelEligible); + if (!thulr.available()) { + console.log("⚠ --pairwise needs the `thulr` CLI for the duel (relative judging); it was not found on PATH.\n Install it (e.g. `cargo install thulr`) or drop --pairwise. The absolute deltas above still stand.\n"); + } else if (eligible.length === 0) { + console.log("⚠ --pairwise: no case had both arms reach the model, so there is nothing to duel.\n"); + } else { + mkdirSync(RUNS_DIR, { recursive: true }); + emitArmTrace(FLOWS_TRACE, eligible, (r) => r.flows); + emitArmTrace(PLAIN_TRACE, eligible, (r) => r.plain); + try { + duelReport = thulr.duel({ traceA: FLOWS_TRACE, traceB: PLAIN_TRACE, labelA: "flows", labelB: "plain", model: judgeModel, out: DUEL_OUT, judgeBin, json: true }); + const outcomeByCase = new Map((duelReport.cases ?? []).map((c) => [c.case_id, c.outcome])); + for (const r of rows) r.outcome = outcomeByCase.get(r.name) ?? null; + } catch (error) { + console.log(`⚠ thulr duel failed: ${error?.message ?? error}\n`); + } + } + } + const fJudge = mean(rows.map((r) => r.flows.judged.score ?? 0)); const pJudge = mean(rows.map((r) => r.plain.judged.score ?? 0)); const fCrit = rows.filter((r) => r.flows.judged.pass).length; @@ -175,20 +224,29 @@ async function main() { const losses = rows.filter((r) => (r.plain.judged.score ?? 0) - (r.flows.judged.score ?? 0) > 0.001).length; console.log(`Summary over ${rows.length} case${rows.length === 1 ? "" : "s"}`); - if (pairwise) { - const pw = (w) => rows.filter((r) => r.pv?.winner === w).length; - console.log(` pairwise flows wins ${pw("flows")} · plain wins ${pw("plain")} · ties ${pw("tie")}${pw("error") ? 
` · errors ${pw("error")}` : ""} (order-controlled — the sensitive metric)`); - } console.log(` abs judge pass flows ${fCrit}/${rows.length} (${pct(fCrit, rows.length)}) plain ${pCrit}/${rows.length} (${pct(pCrit, rows.length)})`); - console.log(` abs mean judge flows ${fJudge.toFixed(2)} plain ${pJudge.toFixed(2)} lift ${fixed(fJudge - pJudge)} (low resolution — read pairwise instead)`); + console.log(` abs mean judge flows ${fJudge.toFixed(2)} plain ${pJudge.toFixed(2)} lift ${fixed(fJudge - pJudge)} (low resolution — read the duel instead)`); console.log(` abs per-case flows wins ${wins} · plain wins ${losses} · ties ${rows.length - wins - losses}`); - console.log(` cost flows $${fCost.toFixed(4)} plain $${pCost.toFixed(4)} (${pCost > 0 ? `${(fCost / pCost).toFixed(1)}× more` : "n/a"})${pairwise ? ` · pairwise judging $${pairwiseCost.toFixed(4)}` : ""}`); + console.log(` cost flows $${fCost.toFixed(4)} plain $${pCost.toFixed(4)} (${pCost > 0 ? `${(fCost / pCost).toFixed(1)}× more` : "n/a"})`); console.log(` wall-clock flows ${fSec.toFixed(0)}s plain ${pSec.toFixed(0)}s`); - console.log("\nNote: pairwise (same criterion, cross-model judge, told not to reward length) is the fair head-to-head. 
Some objective checks are pi-flows-only (route dispatch, same-model vote warning); plain pi cannot satisfy them by design, so read those as capabilities flows adds, not plain losses."); + + if (pairwise && duelReport) { + console.log("\nPairwise duel (thulr · order-controlled relative judging — the sensitive metric)"); + for (const r of rows) { + if (!r.duelEligible) { + console.log(` ${r.name.padEnd(34)} — (skipped: an arm did not reach the model)`); + continue; + } + console.log(` ${r.name.padEnd(34)} ${duelLabel(r.outcome)}`); + } + for (const line of thulr.formatDuelSummary(duelReport)) console.log(` ${line}`); + } + + console.log("\nNote: the thulr duel (same criterion, cross-model judge, positions swapped, told not to reward length) is the fair head-to-head; a flip means the judge keyed on position, not content. Some objective checks are pi-flows-only (route dispatch, same-model vote warning); plain pi cannot satisfy them by design, so read those as capabilities flows adds, not plain losses."); if (writeArtifact && !dryRun) { const out = resolve(process.cwd(), writeArtifact); - writeFileSync(out, `${JSON.stringify({ model: useAgentModels ? "agent" : model, judgeModel, capUsd, pairwise, rows: rows.map((r) => ({ name: r.name, pairwise: r.pv?.winner ?? null, pairwiseSwap: r.pv ? [r.pv.v1, r.pv.v2] : null, flows: pickArm(r.flows), plain: pickArm(r.plain) })) }, null, 2)}\n`, "utf8"); + writeFileSync(out, `${JSON.stringify({ model: useAgentModels ? "agent" : model, judgeModel, capUsd, pairwise, duel: duelReport?.summary ?? 
null, rows: rows.map((r) => ({ name: r.name, duel: r.outcome, flows: pickArm(r.flows), plain: pickArm(r.plain) })) }, null, 2)}\n`, "utf8"); console.log(`\nWrote comparison: ${out}`); } diff --git a/evals/judge.mjs b/evals/judge.mjs index 5d3ae4e..7557a8c 100644 --- a/evals/judge.mjs +++ b/evals/judge.mjs @@ -45,7 +45,7 @@ export async function judge(ctx, { criteria, answer }) { const result = await ctx.flow.execute( "eval:judge", - { agent: "redteam", task, model: ctx.model, tools: "none", maxCostUsd: Math.min(ctx.maxCostUsd ?? 0.1, 0.1), timeoutMs: 120000 }, + { agent: "redteam", task, model: ctx.model, tools: "none", maxCostUsd: Math.min(ctx.maxCostUsd ?? 0.1, 0.1), timeoutMs: ctx.timeoutMs ?? 120000 }, new AbortController().signal, undefined, ctx.flowCtx, @@ -66,48 +66,3 @@ export async function judge(ctx, { criteria, answer }) { return { pass, score: Number.isFinite(score) ? score : pass ? 1 : 0, reasoning: out.replace(/\s+/g, " ").slice(0, 200), cost, infra: null }; } - -// Pairwise preference judge for the flows-vs-plain A/B. Shows the judge BOTH answers -// to the same task and asks which better meets the criterion — far more sensitive to -// small differences than independent absolute scoring, and explicitly told NOT to -// reward length (the compact-summary bias we want to rule out). Position bias is -// handled by the caller, which runs this twice with the answers swapped. Same -// tool-less `redteam` call on the cross-vendor judge model as judge(). -export async function judgePairwise(ctx, { criteria, answerA, answerB }) { - if (ctx.dryRun) return { winner: "TIE", reasoning: "(dry-run: judge skipped)", cost: 0, infra: null }; - - const task = [ - "Two answers — A and B — respond to the SAME task. Pick the one that better satisfies the criterion below. If they meet it equally well, answer TIE.", - "Judge content and correctness only. 
Do NOT reward greater length: a concise correct answer is not worse than a longer one that conveys the same thing.", - `Criterion: ${criteria}`, - "", - "--- ANSWER A ---", - answerA, - "--- END ANSWER A ---", - "", - "--- ANSWER B ---", - answerB, - "--- END ANSWER B ---", - "", - "Reply with exactly two lines and nothing else:", - "WINNER: A or B or TIE", - "REASON: ", - ].join("\n"); - - const result = await ctx.flow.execute( - "eval:pairwise", - { agent: "redteam", task, model: ctx.model, tools: "none", maxCostUsd: Math.min(ctx.maxCostUsd ?? 0.1, 0.1), timeoutMs: 120000 }, - new AbortController().signal, - undefined, - ctx.flowCtx, - ); - - const infra = judgeInfraError(result); - const cost = judgeCost(result); - if (infra) return { winner: "ERROR", reasoning: infra, cost, infra }; - - const out = result?.content?.[0]?.text ?? ""; - const match = out.match(/WINNER:\s*(A|B|TIE)/i); - const winner = match ? match[1].toUpperCase() : "TIE"; - return { winner, reasoning: out.replace(/\s+/g, " ").slice(0, 200), cost, infra: null }; -} diff --git a/evals/pareto.mjs b/evals/pareto.mjs new file mode 100644 index 0000000..fc678f9 --- /dev/null +++ b/evals/pareto.mjs @@ -0,0 +1,45 @@ +// FREE corpus-wide failure-mode ranking: which failure on which prompt/config +// version to fix first, joining deterministic labels, human reviews, and stored +// EvalRun scores. Thin wrapper over `thulr pareto` (no judge calls, no tokens). +// Reads the regenerated eval trace by default. +// +// npm run eval:pareto # rank by prompt version over evals/thulr-trace.jsonl +// npm run eval:pareto -- --by config-version # split by config (subject model) instead +// npm run eval:pareto -- --traces .thulr/traces # scan a directory of traces recursively +// npm run eval:pareto -- --limit 10 # show only the top N rows +// npm run eval:pareto -- --json # machine-readable +// +// Flags accept either `--name value` (as thulr's own CLI does) or `--name=value`. 
+import { existsSync } from "node:fs"; +import { resolve } from "node:path"; +import { parseArgs } from "./args.mjs"; +import * as thulr from "./thulr.mjs"; + +const opts = parseArgs(process.argv.slice(2)); +const tracesArg = opts.traces ?? "evals/thulr-trace.jsonl"; +const traces = resolve(process.cwd(), tracesArg); +const by = opts.by ?? "prompt-version"; +const limitOpt = opts.limit; +const json = Boolean(opts.json); + +if (!thulr.available()) { + console.error("✗ `thulr` was not found on PATH.\n Install it (e.g. `cargo install thulr`) — `thulr pareto` ranks failure modes across stored traces."); + process.exit(2); +} +if (!existsSync(traces)) { + console.error(`✗ Traces path not found: ${tracesArg}\n Run \`npm run eval\` first to produce evals/thulr-trace.jsonl, or pass --traces .`); + process.exit(2); +} +if (!["prompt-version", "config-version"].includes(by)) { + console.error(`✗ --by must be one of prompt-version | config-version, got '${by}'.`); + process.exit(2); +} + +try { + const limit = limitOpt === undefined ? undefined : Number(limitOpt); + const result = thulr.pareto({ traces, by, limit, json }); + process.stdout.write(json ? `${JSON.stringify(result, null, 2)}\n` : result); +} catch (error) { + console.error(`thulr pareto failed: ${error?.message ?? error}`); + process.exit(1); +} diff --git a/evals/review.mjs b/evals/review.mjs new file mode 100644 index 0000000..311cd5e --- /dev/null +++ b/evals/review.mjs @@ -0,0 +1,56 @@ +// Record (or list) a human SME review verdict for an eval case. The verdict set is +// folded into thulr's calibration on the next `npm run eval` as judge-vs-human +// ground truth (TPR/TNR) on top of the deterministic-label axis. Thin wrapper over +// `thulr review` that defaults the trace to evals/thulr-trace.jsonl, so the verdict +// lands at the path the main harness auto-discovers. 
+// +// npm run eval:review -- --list # reviewed / unreviewed case ids +// npm run eval:review -- --case route-classifies-bug-to-recon --verdict pass +// npm run eval:review -- --case single-answer-quality-judged --verdict fail --failure-mode final_answer.incomplete --note "missed the TTL bug" +// npm run eval:review -- --case vote-reaches-known-consensus --verdict unsure --reviewer justin +// npm run eval:review -- --trace evals/thulr-trace.jsonl --case x --verdict pass # explicit trace +// +// Flags accept either `--name value` (as thulr's own CLI does) or `--name=value`. +import { existsSync } from "node:fs"; +import { resolve } from "node:path"; +import { parseArgs } from "./args.mjs"; +import * as thulr from "./thulr.mjs"; + +const opts = parseArgs(process.argv.slice(2)); +const traceArg = opts.trace ?? "evals/thulr-trace.jsonl"; +const trace = resolve(process.cwd(), traceArg); +const list = Boolean(opts.list); +const caseId = opts.case ?? null; +const verdict = opts.verdict ?? null; +const failureMode = opts["failure-mode"] ?? null; +const note = opts.note ?? null; +const reviewer = opts.reviewer ?? null; + +const usage = "Record a verdict: npm run eval:review -- --case --verdict [--failure-mode ] [--note ] [--reviewer ]\nList state: npm run eval:review -- --list"; + +if (!thulr.available()) { + console.error("✗ `thulr` was not found on PATH.\n Install it (e.g. 
`cargo install thulr`) — `thulr review` records the SME verdicts that calibration reads."); + process.exit(2); +} +if (!existsSync(trace)) { + console.error(`✗ Trace not found: ${traceArg}\n Run \`npm run eval\` first to produce it, or pass --trace .`); + process.exit(2); +} +if (!list) { + if (!caseId || !verdict) { + console.error(usage); + process.exit(2); + } + if (!["pass", "fail", "unsure"].includes(verdict)) { + console.error(`✗ --verdict must be one of pass | fail | unsure, got '${verdict}'.\n\n${usage}`); + process.exit(2); + } +} + +try { + process.stdout.write(thulr.review({ trace, list, caseId, verdict, failureMode, note, reviewer })); + if (!list) console.log("\n✓ Recorded. The next `npm run eval` folds this into calibration (judge-vs-human TPR/TNR)."); +} catch (error) { + console.error(`thulr review failed: ${error?.message ?? error}`); + process.exit(1); +} diff --git a/evals/run.mjs b/evals/run.mjs index ddd781b..c6d5ce0 100644 --- a/evals/run.mjs +++ b/evals/run.mjs @@ -14,7 +14,9 @@ // npm run eval -- --judge-bin=/path/to/judge-wrapper # override thulr's judge command // npm run eval -- --samples=3 # judge each case 3x: majority verdict, mean score, judge-noise stddev + flake warnings // npm run eval -- --eval-set=.thulr/eval-sets/smoke.json # overlay promoted criteria/authority metadata +// npm run eval -- --reviews=.thulr/reviews/thulr-trace.reviews.json # fold human SME verdicts into calibration (judge-vs-human TPR/TNR) // npm run eval -- --efficiency-guardrail=cost_usd --efficiency-guardrail=tokens # fail on spend/size regressions +// npm run eval -- --score-guardrail=evidence_quality # also gate a named-criteria dimension's score (criterion is always gated) // npm run eval -- --noise-band=0.10 # judge/efficiency regression tolerance (default 0.05) // npm run eval -- --write-baseline # promote this run to evals/thulr-baseline.json (the gate baseline) // npm run eval -- --compare-baseline=evals/thulr-baseline.json # gate against a specific 
baseline @@ -40,7 +42,7 @@ // The harness emits ONE self-contained trace (evals/thulr-trace.jsonl) — each case's // answer, criterion, objective label, task text, expected behavior, failure labels, // config/prompt version, and cost/token telemetry inline — -// and shells out to the `thulr` CLI (0.1.2) for judge -> calibrate -> gate -> +// and shells out to the `thulr` CLI (0.1.3) for judge -> calibrate -> gate -> // baseline. thulr reads everything from the trace, so there are no separate // cases-manifest or labels files. // @@ -50,7 +52,7 @@ // (thulr run-experiment / optimize) owns judging and selection. import { execFileSync } from "node:child_process"; import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs"; -import { dirname, join, resolve } from "node:path"; +import { basename, dirname, join, resolve } from "node:path"; import { CALIBRATION_CASES, CASES } from "./cases.mjs"; import { caseCwd, flowTool, scoreObjective, subjectModelName, sumTokens, DEFAULT_EVAL_MODEL } from "./lib.mjs"; import { injectModel } from "./model-injection.mjs"; @@ -96,6 +98,11 @@ const evalSet = flag("eval-set", null); const redaction = flag("redaction", null); const rate = Number(flag("rate", "0")); const efficiencyGuardrails = flags("efficiency-guardrail"); +// Opt-in per-dimension SCORE guardrails beyond the always-on `criterion`: name a +// thulr.criteria.<dimension> (e.g. --score-guardrail=evidence_quality) to fail the +// gate when that named dimension's mean score regresses. Off by default so a new +// dimension is observed for a few runs before it can block. +const extraScoreGuardrails = flags("score-guardrail"); const noiseBand = Number(flag("noise-band", "0.05")); // Emit-the-trace-and-stop: the command-template mode `thulr run-experiment` and // `thulr optimize` drive ("the template MUST emit a structured JSONL trace to {out}"). @@ -126,6 +133,14 @@ const compareFlag = flag("compare-baseline", null); const gateBaseline = compareFlag ? 
p(compareFlag) : existsSync(BASELINE_DEFAULT) ? BASELINE_DEFAULT : null; const writeBaseline = has("write-baseline") ? p(flag("write-baseline", "") || BASELINE_DEFAULT) : null; const LABELS = p(".thulr/runs/candidate.labels.json"); +// Human-review calibration (thulr review): an SME verdict set folded into +// `thulr calibrate` as judge-vs-human ground truth (TPR/TNR) on top of the +// deterministic-label axis. Defaults to the path `thulr review` writes for this +// trace, so recording a verdict (`npm run eval:review -- --case --verdict +// `) is enough — the next `npm run eval` picks it up with no flag. +const reviewsFlag = flag("reviews", null); +const reviewsDefault = p(`.thulr/reviews/${basename(TRACE).replace(/\.jsonl$/, "")}.reviews.json`); +const reviews = reviewsFlag ? p(reviewsFlag) : existsSync(reviewsDefault) ? reviewsDefault : null; function preflight() { if (dryRun) return true; @@ -208,6 +223,7 @@ async function main() { name: testCase.name, answer, criterion: testCase.criterion, + namedCriteria: testCase.namedCriteria, label: !!objective.pass, endMs: endedAt, model: subjectModelName(result, useAgentModels ? "agent-frontmatter" : model), @@ -312,15 +328,20 @@ async function main() { if (crit) console.log(` judge noise across ${samples} samples: score stddev ±${(crit.score_stddev ?? 0).toFixed(3)}`); } - // Calibration: how well the judge's verdicts track the deterministic labels. + // Calibration: how well the judge's verdicts track the deterministic labels + // (and human SME verdicts too, when a review set is present — judge-vs-human + // TPR/TNR). thulr 0.1.3 also queues every judge/ground-truth disagreement onto + // the triage queue (`thulr queue`) and feeds this calibration into the gate: + // a judge blind in either direction downgrades a clean PASS to WARN. 
console.log(""); - process.stdout.write(thulr.calibrate(CANDIDATE, { labels: LABELS })); + process.stdout.write(thulr.calibrate(CANDIDATE, { labels: LABELS, reviews })); + if (reviews) console.log(`folded human review verdicts from ${rel(reviews)} into calibration (judge-vs-human TPR/TNR above).`); if (calibrationSummaries.length) { console.log(`release gate excludes ${calibrationSummaries.length} calibration canar${calibrationSummaries.length === 1 ? "y" : "ies"} from pass-rate comparison; full judged run remains ${rel(CANDIDATE)}.`); } if (gateBaseline) { - const gateOptions = { baseline: gateBaseline, candidate: gateCandidate, guardrails: ["criterion"], scoreGuardrails: ["criterion"], efficiencyGuardrails, noiseBand, redaction }; + const gateOptions = { baseline: gateBaseline, candidate: gateCandidate, guardrails: ["criterion"], scoreGuardrails: [...new Set(["criterion", ...extraScoreGuardrails])], efficiencyGuardrails, noiseBand, redaction }; try { const gateJson = thulr.gate({ ...gateOptions, json: true }); const deltaLines = thulr.formatGateScoreSummary(gateJson.report); diff --git a/evals/thulr.mjs b/evals/thulr.mjs index 3486ec5..dd120b4 100644 --- a/evals/thulr.mjs +++ b/evals/thulr.mjs @@ -3,10 +3,10 @@ // shells out to thulr for the judge -> calibrate -> gate -> baseline pipeline, // replacing the harness's old in-process LLM judge + hand-rolled baseline compare. // -// thulr (0.1.2 contract, docs/trace-contract.md in the thulr repo) ingests a -// SELF-CONTAINED trace. Each case's criterion and its deterministic objective -// label travel INLINE in the span attributes, established from thulr's -// openinference_trace adapter: +// thulr (0.1.3 contract — additive over 0.1.2, docs/trace-contract.md in the +// thulr repo) ingests a SELF-CONTAINED trace. 
Each case's criterion and its +// deterministic objective label travel INLINE in the span attributes, +// established from thulr's openinference_trace adapter: // thulr.case_id – the case identifier (thulr groups spans by this) // thulr.criterion – the one literal criterion the judge grades against // thulr.deterministic_label – the objective pass/fail (boolean) for calibration @@ -36,10 +36,17 @@ const spanId = () => randomUUID().replace(/-/g, ""); * inspect/label workflows while preserving the "latest output.value wins" rule. * Returned (not written) so the caller controls the file. * - * @param {{name: string, answer: string, criterion: string, label?: boolean, endMs: number, model?: string, task?: string, expectedBehavior?: string, failureModes?: string[], costUsd?: number, tokensTotal?: number, promptVersion?: string, configVersion?: string}} input + * `namedCriteria` adds thulr 0.1.3 multi-dimension judging: each `{ dimension: + * "criterion text" }` entry is emitted as a `thulr.criteria.<dimension>` attribute + * on the graded span and judged into its own dimension alongside the required + * `thulr.criterion` — per-dimension pass-rate, score delta, and calibration. The + * dimension name must be non-empty, whitespace-free, and not `criterion`, or thulr + * fails ingestion naming the offending key. 
+ * + * @param {{name: string, answer: string, criterion: string, label?: boolean, endMs: number, model?: string, task?: string, expectedBehavior?: string, failureModes?: string[], costUsd?: number, tokensTotal?: number, promptVersion?: string, configVersion?: string, namedCriteria?: Record<string, string>}} input * @returns {object[]} */ -export function traceSpansForCase({ name, answer, criterion, label, endMs, model, task, expectedBehavior, failureModes, costUsd, tokensTotal, promptVersion, configVersion }) { +export function traceSpansForCase({ name, answer, criterion, label, endMs, model, task, expectedBehavior, failureModes, costUsd, tokensTotal, promptVersion, configVersion, namedCriteria }) { const traceId = spanId(); const rootSpanId = spanId(); const answerSpanId = spanId(); @@ -60,6 +67,11 @@ export function traceSpansForCase({ name, answer, criterion, label, endMs, model const finalAttributes = { ...commonAttributes, "output.value": answer }; if (costUsd !== undefined) finalAttributes["thulr.cost_usd"] = costUsd; if (tokensTotal !== undefined) finalAttributes["llm.token_count.total"] = tokensTotal; + // Multi-dimension judging: each non-empty named criterion rides the graded span + // as thulr.criteria.<dimension>; thulr grades each into its own dimension. + for (const [dimension, value] of Object.entries(namedCriteria ?? {})) { + if (value) finalAttributes[`thulr.criteria.${dimension}`] = value; + } return [ { trace_id: traceId, @@ -165,6 +177,72 @@ export function calibrateArgs({ evalRun, labels, reviews }) { return args; } +/** + * Build the `thulr duel` argv. Pairwise, position-swapped RELATIVE judging — the + * sensitive head-to-head metric that absolute per-dimension scoring can't resolve + * when both arms pass at ~1.0. thulr pairs the two arm traces by `thulr.case_id`, + * judges each shared case twice with the arms swapped, and counts a win only when + * both orderings agree; opposite preferences are a `flip` (judge position bias) + * and are excluded from the win rate. 
This replaces the harness's old hand-rolled + * pairwise judge. Spends two judge-model calls per shared case. + * + * @param {{traceA: string, traceB: string, labelA?: string, labelB?: string, model?: string, out?: string, evalSet?: string, concurrency?: number, judgeBin?: string, json?: boolean}} input + * @returns {string[]} + */ +export function duelArgs({ traceA, traceB, labelA, labelB, model, out, evalSet, concurrency, judgeBin, json }) { + const args = ["duel"]; + if (json) args.push("--json"); + if (labelA) args.push("--label-a", labelA); + if (labelB) args.push("--label-b", labelB); + if (model) args.push("--model", model); + if (out) args.push("--out", out); + if (evalSet) args.push("--eval-set", evalSet); + if (concurrency) args.push("--concurrency", String(concurrency)); + if (judgeBin) args.push("--judge-bin", judgeBin); + args.push(traceA, traceB); + return args; +} + +/** + * Build the `thulr pareto` argv. FREE corpus-wide failure-mode ranking: which + * failure on which prompt/config version to fix first, joining deterministic + * labels, human reviews, and stored EvalRun scores. No judge calls. + * + * @param {{traces?: string, by?: "prompt-version" | "config-version", limit?: number, json?: boolean}} input + * @returns {string[]} + */ +export function paretoArgs({ traces, by, limit, json } = {}) { + const args = ["pareto"]; + if (json) args.push("--json"); + if (traces) args.push("--traces", traces); + if (by) args.push("--by", by); + if (limit !== undefined) args.push("--limit", String(limit)); + return args; +} + +/** + * Build the `thulr review` argv. Records (or `--list`s) a human SME verdict for + * one trace case into a `thulr.review_set.v1` artifact; `calibrate --reviews` + * then measures judge-vs-human TPR/TNR on top of the deterministic-label axis. + * One verdict per invocation. 
+ * + * @param {{trace: string, out?: string, list?: boolean, caseId?: string, verdict?: "pass" | "fail" | "unsure", failureMode?: string, note?: string, reviewer?: string, json?: boolean}} input + * @returns {string[]} + */ +export function reviewArgs({ trace, out, list, caseId, verdict, failureMode, note, reviewer, json }) { + const args = ["review"]; + if (json) args.push("--json"); + args.push("--trace", trace); + if (list) args.push("--list"); + if (caseId) args.push("--case", caseId); + if (verdict) args.push("--verdict", verdict); + if (failureMode) args.push("--failure-mode", failureMode); + if (note) args.push("--note", note); + if (reviewer) args.push("--reviewer", reviewer); + if (out) args.push("--out", out); + return args; +} + const fixed = (value, digits) => Number.isFinite(value) ? value.toFixed(digits) : "n/a"; const signedFixed = (value, digits) => Number.isFinite(value) ? `${value >= 0 ? "+" : ""}${value.toFixed(digits)}` : "n/a"; const pct = (value) => Number.isFinite(value) ? `${(value * 100).toFixed(1)}%` : "n/a"; @@ -213,6 +291,33 @@ export function formatGateScoreSummary(report) { return lines; } +/** + * Summarize a `thulr.duel_report.v1` as the lines pi-flows should print: the + * head-to-head win counts, the win rate over swap-consistent cases, the margin + * for arm A, and any position-bias flips (judge noise) or skipped cases. Arm A is + * the flows arm and arm B is plain — the harness passes `--label-a`/`--label-b`, + * so labels travel in the report. Pure (string in / lines out) for unit testing. + * + * @param {object | string | null | undefined} report + * @returns {string[]} + */ +export function formatDuelSummary(report) { + const parsed = typeof report === "string" ? JSON.parse(report) : report; + const s = parsed?.summary; + if (!s) return []; + const aLabel = parsed.a?.label ?? "A"; + const bLabel = parsed.b?.label ?? "B"; + const lines = [ + `${aLabel} wins ${s.a_wins} · ${bLabel} wins ${s.b_wins} · ties ${s.ties}${s.flips ? 
` · flips ${s.flips}` : ""} (decided ${s.decided})`, + `win rate: ${aLabel} ${pct(s.win_rate_a)} · ${bLabel} ${pct(s.win_rate_b)} · margin ${aLabel} ${signedFixed(s.margin_a, 3)}`, + ]; + const skipped = parsed.skipped ?? []; + if (skipped.length) { + lines.push(`skipped ${skipped.length}: ${skipped.map((x) => (Array.isArray(x) ? `${x[0]} (${x[1]})` : String(x))).join(", ")}`); + } + return lines; +} + function summarizeCases(cases) { const dimensions = new Map(); for (const c of cases) { @@ -345,6 +450,33 @@ export function calibrate(evalRun, options = {}) { return run(calibrateArgs({ evalRun, ...options })).stdout; } +/** + * Pairwise-duel two arm traces head-to-head (`thulr duel`). Spends judge tokens + * (two calls per shared case). With `json: true` returns the parsed + * `thulr.duel_report.v1`; otherwise the human-readable report text. The `--out` + * artifact is also persisted when `out` is given. + */ +export function duel(options) { + const { stdout } = run(duelArgs(options)); + return options.json ? JSON.parse(stdout) : stdout; +} + +/** Rank failure modes across stored traces (`thulr pareto`). Free — no judge calls. */ +export function pareto(options = {}) { + const { stdout } = run(paretoArgs(options)); + return options.json ? JSON.parse(stdout) : stdout; +} + +/** + * Record (or `--list`) a human review verdict for a trace case (`thulr review`). + * Free. Writes a `thulr.review_set.v1` artifact that `calibrate --reviews` + * consumes as judge-vs-human ground truth. + */ +export function review(options) { + const { stdout } = run(reviewArgs(options)); + return options.json ? JSON.parse(stdout) : stdout; +} + /** * Gate a candidate EvalRun against a baseline EvalRun. Free. 
Returns the exit * code (10 = FAIL), whether it blocks, and the report — human-readable by diff --git a/package.json b/package.json index 947dbab..2346622 100644 --- a/package.json +++ b/package.json @@ -68,6 +68,8 @@ "validate:agents": "node scripts/validate-agents.mjs", "eval": "node --import tsx evals/run.mjs", "eval:compare": "node --import tsx evals/compare.mjs", + "eval:review": "node evals/review.mjs", + "eval:pareto": "node evals/pareto.mjs", "trace:report": "node scripts/trace-report.mjs" } } diff --git a/tests/eval-args.test.ts b/tests/eval-args.test.ts new file mode 100644 index 0000000..48405b3 --- /dev/null +++ b/tests/eval-args.test.ts @@ -0,0 +1,23 @@ +import { test } from "node:test"; +import assert from "node:assert/strict"; +import { parseArgs } from "../evals/args.mjs"; + +// The eval CLI wrappers (review.mjs, pareto.mjs) accept both `--name value` (the +// style thulr's own CLI uses) and `--name=value` (the harness style), plus bare +// boolean flags. A regression here silently drops a documented flag. +test("parseArgs accepts space-separated and =-separated flags and bare booleans", () => { + assert.deepEqual( + parseArgs(["--case", "route-x", "--verdict=fail", "--note", "missed the TTL bug", "--list"]), + { case: "route-x", verdict: "fail", note: "missed the TTL bug", list: true }, + ); +}); + +// A bare flag immediately before another flag must stay boolean, not swallow it. +test("parseArgs treats a flag followed by another flag as boolean", () => { + assert.deepEqual(parseArgs(["--list", "--json"]), { list: true, json: true }); +}); + +// Positionals are ignored; a repeated flag keeps the last value. 
+test("parseArgs ignores positionals and keeps the last value for repeats", () => { + assert.deepEqual(parseArgs(["pos", "--by", "prompt-version", "--by", "config-version"]), { by: "config-version" }); +}); diff --git a/tests/thulr-bridge.test.ts b/tests/thulr-bridge.test.ts index 457594f..132d041 100644 --- a/tests/thulr-bridge.test.ts +++ b/tests/thulr-bridge.test.ts @@ -1,6 +1,6 @@ import { test } from "node:test"; import assert from "node:assert/strict"; -import { traceSpansForCase, gateBlocks, gateArgs, judgeArgs, calibrateArgs, inspectTraceArgs, labelFailuresArgs, formatGateScoreSummary, gateCandidateForEvalRun } from "../evals/thulr.mjs"; +import { traceSpansForCase, gateBlocks, gateArgs, judgeArgs, calibrateArgs, inspectTraceArgs, labelFailuresArgs, formatGateScoreSummary, gateCandidateForEvalRun, duelArgs, paretoArgs, reviewArgs, formatDuelSummary } from "../evals/thulr.mjs"; // thulr ingests a SELF-CONTAINED trace: each case's criterion and its // deterministic (objective) label travel INLINE in the span attributes — no more @@ -111,7 +111,7 @@ test("gateArgs passes both the pass-rate and mean-score guardrails", () => { ]); }); -// `--format junit` (thulr 0.1.2) replaces the terminal report on stdout with a +// `--format junit` (thulr 0.1.2+) replaces the terminal report on stdout with a // JUnit XML testsuite for CI test ingestion; exit codes are unchanged. test("gateArgs renders the CI-native JUnit format when asked", () => { const args = gateArgs({ baseline: "b.json", candidate: "c.json", format: "junit" }); @@ -171,7 +171,7 @@ test("gateCandidateForEvalRun excludes calibration canaries and recomputes summa assert.equal(gateRun.summary[0].score_mean, 0.75); }); -// Judge repeat-sampling (thulr 0.1.2): `--samples N` judges each case N times and +// Judge repeat-sampling (thulr 0.1.2+): `--samples N` judges each case N times and // aggregates (majority verdict, mean score). N=1 is the default — no flag emitted, // byte-identical to single-sample judging. 
test("judgeArgs passes --samples only when repeat-sampling is on", () => { @@ -185,8 +185,83 @@ test("judgeArgs passes --samples only when repeat-sampling is on", () => { ); }); -test("trace inspection, failure labels, and calibration args match thulr 0.1.2", () => { +test("trace inspection, failure labels, and calibration args match thulr 0.1.3", () => { assert.deepEqual(inspectTraceArgs({ trace: "t.jsonl" }), ["inspect-trace", "--trace", "t.jsonl", "--json"]); assert.deepEqual(labelFailuresArgs({ trace: "t.jsonl", out: "labels.json" }), ["label-failures", "--trace", "t.jsonl", "--out", "labels.json"]); assert.deepEqual(calibrateArgs({ evalRun: "run.json", labels: "labels.json", reviews: "reviews.json" }), ["calibrate", "--labels", "labels.json", "--reviews", "reviews.json", "run.json"]); }); + +// `thulr duel` (0.1.3) is the relative, position-swapped head-to-head the A/B uses +// instead of a hand-rolled pairwise judge. Traces are the trailing positionals; +// --judge-bin rides the same wrapper as judge so extension providers stay available. +test("duelArgs builds the pairwise duel argv; traces are the trailing positionals", () => { + assert.deepEqual( + duelArgs({ traceA: "a.jsonl", traceB: "b.jsonl", labelA: "flows", labelB: "plain", model: "anthropic/claude-haiku-4-5", out: "duel.json", concurrency: 4, judgeBin: "scripts/thulr-judge-pi.sh", json: true }), + ["duel", "--json", "--label-a", "flows", "--label-b", "plain", "--model", "anthropic/claude-haiku-4-5", "--out", "duel.json", "--concurrency", "4", "--judge-bin", "scripts/thulr-judge-pi.sh", "a.jsonl", "b.jsonl"], + ); + assert.deepEqual(duelArgs({ traceA: "a.jsonl", traceB: "b.jsonl" }), ["duel", "a.jsonl", "b.jsonl"]); +}); + +// `thulr pareto` (0.1.3) ranks failure modes across stored traces — free, no judge calls. 
+test("paretoArgs builds the failure-mode ranking argv", () => { + assert.deepEqual( + paretoArgs({ traces: "evals/thulr-trace.jsonl", by: "config-version", limit: 10, json: true }), + ["pareto", "--json", "--traces", "evals/thulr-trace.jsonl", "--by", "config-version", "--limit", "10"], + ); + assert.deepEqual(paretoArgs(), ["pareto"]); +}); + +// `thulr review` (0.1.3) records one human SME verdict per invocation, or --lists state. +test("reviewArgs records one verdict and lists state", () => { + assert.deepEqual( + reviewArgs({ trace: "t.jsonl", caseId: "route-x", verdict: "fail", failureMode: "tool.error", note: "missed it", reviewer: "justin" }), + ["review", "--trace", "t.jsonl", "--case", "route-x", "--verdict", "fail", "--failure-mode", "tool.error", "--note", "missed it", "--reviewer", "justin"], + ); + assert.deepEqual(reviewArgs({ trace: "t.jsonl", list: true, json: true }), ["review", "--json", "--trace", "t.jsonl", "--list"]); +}); + +// The duel report (thulr.duel_report.v1): arm A is flows, arm B is plain. Lead with +// win counts, the win rate over swap-consistent cases, the A-margin, then flips +// (position bias) and any skipped cases. 
+test("formatDuelSummary leads with win counts, win rate, margin, flips, and skips", () => { + const lines = formatDuelSummary({ + a: { label: "flows" }, + b: { label: "plain" }, + summary: { a_wins: 3, b_wins: 1, ties: 1, flips: 1, decided: 5, win_rate_a: 0.6, win_rate_b: 0.2, margin_a: 0.4 }, + skipped: [["lonely-case", "only_in_a"]], + }); + assert.deepEqual(lines, [ + "flows wins 3 · plain wins 1 · ties 1 · flips 1 (decided 5)", + "win rate: flows 60.0% · plain 20.0% · margin flows +0.400", + "skipped 1: lonely-case (only_in_a)", + ]); +}); + +test("formatDuelSummary returns nothing without a summary", () => { + assert.deepEqual(formatDuelSummary(null), []); + assert.deepEqual(formatDuelSummary({}), []); +}); + +// thulr 0.1.3 multi-dimension judging: namedCriteria ride the graded span as +// thulr.criteria.<dimension>, judged into their own dimensions alongside criterion. +// Empty values are dropped; the root (context) span does not carry them. +test("traceSpansForCase emits named criteria on the graded span only", () => { + const spans = traceSpansForCase({ + name: "c", + answer: "a", + criterion: "primary", + endMs: 1, + namedCriteria: { evidence_quality: "cites the specific code", impact_explanation: "states the production impact", blank: "" }, + }); + const root = spans[0]; + const graded = spans.find((s) => s.attributes["output.value"] !== undefined); + assert.equal(graded.attributes["thulr.criteria.evidence_quality"], "cites the specific code"); + assert.equal(graded.attributes["thulr.criteria.impact_explanation"], "states the production impact"); + assert.equal("thulr.criteria.blank" in graded.attributes, false, "empty dimension values are skipped"); + assert.equal("thulr.criteria.evidence_quality" in root.attributes, false, "named criteria belong to the graded span"); +}); + +test("traceSpansForCase omits the named-criteria attributes when none are given", () => { + const graded = traceSpansForCase({ name: "c", answer: "a", criterion: "x", endMs: 1 }).find((s) 
=> s.attributes["output.value"] !== undefined); + assert.equal(Object.keys(graded.attributes).some((k) => k.startsWith("thulr.criteria.")), false); +});