From 5fdf012acbf7f3e3713b6c6693a8c1c6e90a5e48 Mon Sep 17 00:00:00 2001
From: Justin Ramos <justinramos@outlook.com>
Date: Sat, 13 Jun 2026 11:19:16 -0600
Subject: [PATCH] =?UTF-8?q?feat(evals):=20adopt=20thulr=200.1.3=20?=
 =?UTF-8?q?=E2=80=94=20duel,=20review=20calibration,=20pareto,=20named=20c?=
 =?UTF-8?q?riteria?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- eval:compare --pairwise runs thulr's position-swapped `duel` (relative win-rate
  judging; flips = judge position bias) over one self-contained trace per arm,
  replacing the hand-rolled in-process pairwise judge
- human-review calibration: `eval:review` records SME verdicts; `eval` auto-folds
  them into `calibrate --reviews` (judge-vs-human TPR/TNR)
- `eval:pareto` ranks failure modes across stored traces (free, no judge calls)
- multi-dimension named criteria (thulr.criteria.<dim>) on the review cases
  (evidence_quality, impact_explanation) for score headroom; opt-in
  --score-guardrail=<dim>
- eval:compare: configurable --timeout (subject + judge agents) and
  comma-separated --filter; judge.mjs no longer hardcodes a 120s judge cap
- broaden the session-cache existence-check scorer (fixes a confirmed false
  negative where the judge was right and the regex missed the phrasing)
- bridge: duelArgs/paretoArgs/reviewArgs/formatDuelSummary + tests; shared arg
  parser; 0.1.2 -> 0.1.3 docs/stamps

evals/ is not packaged (.npmignore), so this is dev-tooling only — no version bump.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 CHANGELOG.md               |  10 +++
 evals/README.md            |  80 ++++++++++++++++---
 evals/args.mjs             |  27 +++++++
 evals/cases.mjs            |  20 ++++-
 evals/compare.mjs          | 152 +++++++++++++++++++++++++------------
 evals/judge.mjs            |  47 +-----------
 evals/pareto.mjs           |  45 +++++++++++
 evals/review.mjs           |  56 ++++++++++++++
 evals/run.mjs              |  31 ++++++--
 evals/thulr.mjs            | 144 +++++++++++++++++++++++++++++++++--
 package.json               |   2 +
 tests/eval-args.test.ts    |  23 ++++++
 tests/thulr-bridge.test.ts |  83 +++++++++++++++++++-
 13 files changed, 600 insertions(+), 120 deletions(-)
 create mode 100644 evals/args.mjs
 create mode 100644 evals/pareto.mjs
 create mode 100644 evals/review.mjs
 create mode 100644 tests/eval-args.test.ts
diff --git a/CHANGELOG.md b/CHANGELOG.md
index d5cf69e..2f722fe 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -29,6 +29,16 @@ that must agree are `package.json`, `PI_FLOWS_VERSION` in
   summary now prints thulr's numeric score, pass-rate, and efficiency deltas from
   `thulr gate --json` before the human gate report, and `--noise-band=<n>` makes
   guardrail tolerance explicit.
+- Evals: adopt thulr 0.1.3. `npm run eval:compare -- --pairwise` now runs thulr's
+  calibrated, position-swapped **`duel`** (relative win-rate judging, flips reported
+  as judge position bias) over one self-contained trace per arm, replacing the
+  harness's hand-rolled in-process pairwise judge. `npm run eval:review` records
+  human SME verdicts and `npm run eval` folds them into calibration as a second
+  ground-truth axis (`--reviews`; judge-vs-human TPR/TNR), auto-discovering
+  `.thulr/reviews/<trace>.reviews.json`. `npm run eval:pareto` ranks failure modes
+  across stored traces (which failure on which prompt/config version to fix first).
+  Calibration also surfaces thulr 0.1.3's judge-trust gate: a judge blind in either
+  direction downgrades a clean gate PASS to WARN.
 - Vote/orchestrate quality: same-agent/model voters now receive complementary
   stances so ballots are not identical prompt replays, and orchestrate workers
   now see the overall goal/contract alongside their assigned subtask before
diff --git a/evals/README.md b/evals/README.md
index 5e63ad6..e7f5a2c 100644
--- a/evals/README.md
+++ b/evals/README.md
@@ -38,6 +38,7 @@ npm run eval -- --judge-model=anthropic/claude-opus-4-8   # thulr judge model (d
 npm run eval -- --judge-bin=/path/to/judge-wrapper   # override thulr's judge command
 npm run eval -- --samples=3        # judge each case 3×: majority verdict, mean score, flake warnings (3× judge spend)
 npm run eval -- --eval-set=.thulr/eval-sets/release.json   # overlay promoted criteria / guardrail authority
+npm run eval -- --reviews=.thulr/reviews/thulr-trace.reviews.json   # fold human SME verdicts into calibration (judge-vs-human TPR/TNR)
 npm run eval -- --efficiency-guardrail=cost_usd --efficiency-guardrail=tokens   # fail on spend/token regressions
 npm run eval -- --noise-band=0.10  # regression tolerance for score/pass-rate/efficiency guardrails (default 0.05)
 npm run eval -- --cap=1.00         # per-case USD ceiling on flow delegations (default 0.50)
@@ -76,7 +77,7 @@ verifies the binary, workspace, store, and that thulr's judge binary `pi` resolv
 3. `thulr label-failures --trace <file>` applies thulr's failure-mode ontology
    and writes labels for calibration/triage.
 4. `thulr judge --trace <file>` grades each case's answer against its inline
-   `criterion` → an EvalRun. thulr (0.1.2) reads everything from the trace — no
+   `criterion` → an EvalRun. thulr (0.1.3) reads everything from the trace — no
    separate cases-manifest or labels files. With `--samples=N` each case is judged
    N times and aggregated (majority verdict, ties fail safe; mean score) — the
    EvalRun's `score_stddev` then reports **judge noise** instead of cross-case
@@ -86,6 +87,12 @@ verifies the binary, workspace, store, and that thulr's judge binary `pi` resolv
    — how well the judge's verdicts track the inline deterministic labels, with
    failure labels included in the report. (An uncalibrated judge can silently
    certify regressions; this is the calibration the old single-judge setup lacked.)
+   Record human SME verdicts with `npm run eval:review` and the harness folds them
+   in as a second ground-truth axis (`--reviews`; judge-vs-human TPR/TNR) — see
+   [Human review & failure triage](#human-review--failure-triage). thulr 0.1.3 also
+   queues every judge/ground-truth disagreement onto `thulr queue` and feeds this
+   calibration into the gate: a judge blind in either direction (TPR or TNR 0% over
+   labeled cases) downgrades a clean PASS to WARN with the dimension named.
 6. Before gating, pi-flows writes `.thulr/runs/candidate.gate.json`, which is the
    judged EvalRun with calibration canaries filtered out and summaries
    recomputed. `thulr gate` compares that gate candidate to
@@ -224,7 +231,7 @@ and the same cross-model judge:
 
 ```bash
 npm run eval:compare                    # all cases, both arms
-npm run eval:compare -- --pairwise      # add order-controlled pairwise judging (the sensitive metric)
+npm run eval:compare -- --pairwise      # add thulr's relative duel (the sensitive metric)
 npm run eval:compare -- --filter=vote   # scope to keep cost down (runs both arms per case)
 npm run eval:compare -- --write=evals/compare.json
 npm run eval:compare -- --dry-run       # wiring smoke, no model
@@ -235,14 +242,50 @@ PI_FLOWS_TRACE_FILE=/tmp/ab.jsonl npm run eval:compare -- --pairwise --write=eva
 npm run trace:report -- /tmp/ab.jsonl
 ```
 
-`eval:compare` keeps its own **order-controlled pairwise** judge (run twice with
-positions swapped, scored a win only when both orderings agree, told *not* to
-reward length) — the sensitive head-to-head metric for small gaps that thulr's
-absolute per-dimension scoring can't resolve. A few objective checks are
-pi-flows-only by construction (route dispatch, the same-model vote warning); plain
-pi can't satisfy them, so read those as *capabilities flows adds*, not plain losses.
-Give a case a `baselinePrompt` when its flow params encode goal info outside `task`
-(e.g. a return contract) so the plain arm is graded on the same goal.
+With `--pairwise` the harness emits one self-contained trace per arm and shells out
+to **`thulr duel`** (0.1.3) — thulr's calibrated relative judge. It pairs the arms
+by case id, judges each shared case **twice with the answers swapped**, and counts a
+win only when both orderings agree; opposite preferences are a **flip** (judge
+position bias), reported as judge noise and excluded from the win rate. This is the
+sensitive head-to-head metric for small gaps that thulr's absolute per-dimension
+scoring can't resolve — and it replaces the harness's old in-process pairwise judge.
+The duel spends two judge calls per eligible case (both arms must have reached the model and the case must
+state a `criterion`) and persists a `thulr.duel_report.v1` at `.thulr/runs/compare-duel.json`. A
+few objective checks are pi-flows-only by construction (route dispatch, the
+same-model vote warning); plain pi can't satisfy them, so read those as *capabilities
+flows adds*, not plain losses. Give a case a `baselinePrompt` when its flow params
+encode goal info outside `task` (e.g. a return contract) so the plain arm is graded
+on the same goal.
+
+## Human review & failure triage
+
+Two free (no judge tokens) thulr 0.1.3 workflows close the loop on judged runs.
+
+**Record human verdicts** so calibration measures the judge against a person, not
+only the deterministic labels:
+
+```bash
+npm run eval:review -- --list                              # reviewed / unreviewed case ids for the last trace
+npm run eval:review -- --case single-answer-quality-judged --verdict pass
+npm run eval:review -- --case route-classifies-bug-to-recon --verdict fail \
+  --failure-mode routing.wrong_agent --note "should have gone to recon"
+```
+
+Verdicts land in `.thulr/reviews/thulr-trace.reviews.json` — the path the next
+`npm run eval` auto-discovers — so a recorded verdict needs no flag on the next run.
+`calibrate` then reports a **human** section (judge-vs-human TPR/TNR), and human
+verdicts take precedence over auto labels for the cases they cover. Point at an
+explicit set with `npm run eval -- --reviews=<path>`.
+
+**Rank failure modes across every stored trace** — which failure on which prompt or
+config version to fix first, joining deterministic labels, human reviews, and stored
+EvalRun scores:
+
+```bash
+npm run eval:pareto                         # rank by prompt version over evals/thulr-trace.jsonl
+npm run eval:pareto -- --by=config-version  # split by subject config instead
+npm run eval:pareto -- --limit=10           # top N rows
+```
 
 ## Experiments: champion/challenger (and the optimizer)
 
@@ -302,6 +345,9 @@ Append to `cases.mjs`:
   params: { agent: "recon", task: "…" },   // the flow tool input
   cwd: "/optional/working/dir",
   criterion: "One strict, literal statement a correct answer must satisfy.",  // graded by thulr's judge
+  namedCriteria: {                           // optional: extra judge dimensions (0.1.3)
+    evidence_quality: "Each claim cites the specific code it refers to.",
+  },
   score(result, ctx) {                       // objective, deterministic check
     const ok = /expected/.test(result.content[0].text);
     return { pass: ok, score: ok ? 1 : 0, notes: "…" };
@@ -316,6 +362,15 @@ single literal statement of what a correct answer must say; thulr grades the ans
 text against it on a different vendor than the subject. Always provide a `mock` so
 `--dry-run` can exercise the runner — and the artifact emission — offline.
 
+**Named criteria (`namedCriteria`)** add thulr 0.1.3 multi-dimension judging: each
+`{ dimension: "criterion text" }` entry is emitted as `thulr.criteria.<dimension>`
+on the graded span and judged into **its own dimension** alongside the required
+`criterion` — with its own pass-rate, score delta, and calibration. Use them for
+*orthogonal* quality axes (e.g. `evidence_quality`, `impact_explanation`) so a
+near-saturated case still produces a gradient. Dimension names must be non-empty,
+whitespace-free, and not `criterion`. They are observed by default; gate one with
+`--score-guardrail=<dimension>` once it looks stable.
+
 ### Hard cases (`hard: true`)
 
 For **score-tracked** cases — ones that intentionally land mid-scale so a better
@@ -327,7 +382,10 @@ the run to be green — only a regression in their mean score blocks. Keep the `
 a *complete* answer so `--dry-run` stays green. See `review-finds-all-webhook-defects`
 (4 defects) and `review-finds-session-cache-defects` (3 defects) — multi-defect code
 reviews where a typical pass misses the subtler ones (signature verification, TTL
-validation), so a sharper prompt has room to climb.
+validation), so a sharper prompt has room to climb. Both also carry `namedCriteria`
+(`evidence_quality`, `impact_explanation`) so the judge grades *how well* each defect
+is explained, not just whether all were found — extra headroom on cases that would
+otherwise saturate at "found them all."
 
 A *frontier* subject model exhausts these small fixtures (it finds every defect), so
 the score pins at 1.0 with no headroom. Rather than pin a different model per case,
diff --git a/evals/args.mjs b/evals/args.mjs
new file mode 100644
index 0000000..9bea569
--- /dev/null
+++ b/evals/args.mjs
@@ -0,0 +1,27 @@
+// Tiny argv parser for the eval CLI wrappers (review.mjs, pareto.mjs). Accepts both
+// `--name value` (the style thulr's own CLI uses) and `--name=value` (the style the
+// rest of the harness uses), plus bare boolean flags (`--list`, `--json`). Returns a
+// plain object keyed by flag name: a string value, or `true` for a bare flag. A token that
+// starts with `--` is never consumed as a value, so a bare flag before another flag stays boolean — and a value that itself begins with `--` must use the `=` form.
+// Repeated flags keep the last value. Tokens not starting with `--` are skipped unless they directly follow a bare `--name`, in which case they become its value (`--list foo` yields `{ list: "foo" }`).
+export function parseArgs(argv) {
+	const opts = {};
+	for (let i = 0; i < argv.length; i++) {
+		const a = argv[i];
+		if (!a.startsWith("--")) continue; // stray positional: skip it
+		const eq = a.indexOf("="); // the FIRST "=" splits name from value, so values may contain "="
+		if (eq !== -1) {
+			opts[a.slice(2, eq)] = a.slice(eq + 1); // `--name=` yields the empty string, not `true`
+			continue;
+		}
+		const name = a.slice(2);
+		const next = argv[i + 1];
+		if (next !== undefined && !next.startsWith("--")) {
+			opts[name] = next;
+			i += 1; // step over the token just consumed as this flag's value
+		} else {
+			opts[name] = true; // last token, or followed by another flag: boolean
+		}
+	}
+	return opts;
+}
diff --git a/evals/cases.mjs b/evals/cases.mjs
index 3a47126..eed5839 100644
--- a/evals/cases.mjs
+++ b/evals/cases.mjs
@@ -148,6 +148,13 @@ export const CASES = [
 		cwd: fixturesRepo,
 		baselinePrompt: "Review billing-webhook.js for ALL production-correctness defects, not just the most obvious one. Name each distinct defect and why it matters.",
 		criterion: "The review identifies ALL FOUR distinct defects: (1) recordPayment references `ledger`, which is never declared/initialized, so every call throws a ReferenceError (500); (2) no idempotency/deduplication, so a duplicate or retried delivery double-counts the payment; (3) no verification of the webhook's signature/authenticity, so a forged request is accepted as a real payment; (4) no input validation or error handling, so a malformed `req.body.data.object` throws unhandled and 500s. Fewer than four is incomplete.",
+		// Orthogonal quality dimensions (0.1.3 multi-dimension judging) — graded
+		// alongside the completeness `criterion` above so a review that names defects
+		// but explains them shallowly scores lower here, giving the suite headroom.
+		namedCriteria: {
+			evidence_quality: "Every defect named is pinned to the specific code that causes it (e.g. `recordPayment`/the undeclared `ledger`, `req.body.data.object`, the missing signature check) rather than described in vague or generic terms.",
+			impact_explanation: "Every defect states its concrete production impact (e.g. a ReferenceError 500 on every call, double-counted payments on a retried webhook delivery, a forged request accepted as a real payment, an unhandled 500 on a malformed body), not merely that something is wrong.",
+		},
 		score(r) {
 			const body = text(r);
 			const ledger = /ledger/i.test(body) && /(never (declared|defined|initiali)|undeclared|undefined|referenceerror|not (declared|defined|initiali))/i.test(body);
@@ -166,9 +173,20 @@ export const CASES = [
 		cwd: fixturesRepo,
 		baselinePrompt: "Review session-cache.js for ALL correctness and reliability defects, not just the most obvious one. Name each distinct defect and why it matters.",
 		criterion: "The review identifies ALL THREE distinct defects: (1) getSession reads `entry.expiresAt` without checking the id exists, so an unknown/missing id dereferences `undefined` and throws a TypeError; (2) expired entries are never evicted (getSession returns null but leaves them), so the store grows unbounded — a memory leak; (3) ttlSeconds is never validated, so a missing, NaN, or negative TTL produces a broken/garbage expiry. Fewer than three is incomplete.",
+		// Orthogonal quality dimensions (0.1.3 multi-dimension judging) — graded
+		// alongside the completeness `criterion` above so a shallow-but-complete
+		// review scores lower here, giving the suite headroom.
+		namedCriteria: {
+			evidence_quality: "Every defect named is pinned to the specific code that causes it (e.g. `getSession` dereferencing `entry.expiresAt`, the never-evicted `sessions` entries, the unvalidated `ttlSeconds`) rather than described in vague or generic terms.",
+			impact_explanation: "Every defect states its concrete impact (e.g. a TypeError when the id is unknown, unbounded memory growth from expired entries never being evicted, a broken expiry from a missing/NaN/negative TTL), not merely that something is wrong.",
+		},
 		score(r) {
 			const body = text(r);
-			const existence = /(entry|session|id)[^.]{0,40}(undefined|missing|absent|does(n'?t| not) exist|not (found|present|exist)|no[^.]{0,10}(existence|null|presence) check)|throws?[^.]{0,30}(unknown|missing|absent|undefined|no .{0,8}(id|session|entry))|typeerror|crash[^.]{0,20}(missing|unknown|absent|undefined)/i.test(body);
+			// Broadened after a real false negative: the model wrote "without a miss
+			// guard / unknown id throws / cache miss can crash", none of which the old
+			// pattern matched. Match the concept (a missing/unknown id or cache miss
+			// dereferences/throws, or a missing existence guard), not one phrasing.
+			const existence = /\btypeerror\b|(unknown|missing|absent|non-?existent|invalid|unrecogni[sz]ed)[^.]{0,30}\b(id|key|entry|session|lookup)\b|\bcache[- ]?miss\b|(entry|session|getsession)[^.]{0,40}(undefined|null|throw|crash|deref|not[^.]{0,8}(exist|found|present))|(no|missing|without|lacks?|add|needs?)[^.]{0,25}(existence|presence|null|miss|nil)?[- ]?(guard|check)|\bmiss[- ]?guard\b/i.test(body);
 			const leak = /memory leak|never (evict|delet|remov|clean|free|purg)|unbounded|grow[^.]{0,16}(forever|unbounded|indefinit|without bound)|not[^.]{0,8}(evict|delet|remov|clean|purg)|\bleak/i.test(body);
 			const ttl = /ttlseconds|\bttl\b/i.test(body) && /validat|negativ|\bnan\b|invalid|unchecked|non-numeric|immortal|never expir/i.test(body);
 			const found = [existence && "no-existence-check", leak && "memory-leak", ttl && "no-ttl-validation"].filter(Boolean);
diff --git a/evals/compare.mjs b/evals/compare.mjs
index 81415b8..8cfc9eb 100644
--- a/evals/compare.mjs
+++ b/evals/compare.mjs
@@ -7,8 +7,9 @@
 // Then reports per-case and aggregate deltas.
 //
 //   npm run eval:compare                       # all cases, both arms, on your pi default
-//   npm run eval:compare -- --pairwise          # add order-controlled pairwise judging (the sensitive metric)
-//   npm run eval:compare -- --filter=vote       # scope to keep cost down
+//   npm run eval:compare -- --pairwise          # add thulr's relative duel (the sensitive metric)
+//   npm run eval:compare -- --filter=vote,route # scope to a comma-separated set of name substrings
+//   npm run eval:compare -- --timeout=300000    # per-agent ms cap (default 120000) for heavy review/evaluate cases
 //   npm run eval:compare -- --model=openai-codex/gpt-5.5 --judge-model=anthropic/claude-sonnet-4-6
 //   npm run eval:compare -- --write=evals/compare.json
 //   npm run eval:compare -- --dry-run           # wiring smoke (canned results, no model)
@@ -17,19 +18,22 @@
 // the flows arm (diagnose WHY an arm scored as it did) — the flow tool honors that
 // env var, no flag needed.
 //
-// Absolute judge scores cluster and can't resolve small gaps; --pairwise shows the
-// judge both answers and asks which is better (positions swapped to cancel order
-// bias), which is the sensitive, fair head-to-head. Some objective checks are
-// pi-flows-only by construction (route dispatch, the same-model vote warning) and
-// plain pi cannot satisfy them — that gap IS the point for those cases.
+// Absolute judge scores cluster and can't resolve small gaps. With --pairwise the
+// harness emits one self-contained trace per arm and shells out to `thulr duel`,
+// thulr's calibrated relative judge: it pairs the arms by case id, judges each case
+// twice with the answers swapped to cancel order bias, and counts a win only when
+// both orderings agree (a flip is reported as judge position bias, not a win). This
+// replaces the old in-process pairwise judge. Some objective checks are pi-flows-only
+// by construction (route dispatch, the same-model vote warning) and plain pi cannot
+// satisfy them — that gap IS the point for those cases.
 import { execFileSync } from "node:child_process";
-import { existsSync, writeFileSync } from "node:fs";
+import { existsSync, mkdirSync, writeFileSync } from "node:fs";
 import { join, resolve } from "node:path";
 import { runPlainPi } from "./baseline-pi.mjs";
 import { CASES } from "./cases.mjs";
-import { judgePairwise } from "./judge.mjs";
 import { answerText, caseCwd, flowTool, scoreArm, DEFAULT_EVAL_MODEL } from "./lib.mjs";
 import { injectModel } from "./model-injection.mjs";
+import * as thulr from "./thulr.mjs";
 
 const dotenvPath = join(process.cwd(), ".env");
 if (existsSync(dotenvPath)) {
@@ -48,10 +52,29 @@ const useAgentModels = ["agent", "default", ""].includes(model);
 const subjectModel = useAgentModels ? undefined : model;
 const judgeModel = flag("judge-model", null) ?? process.env.PI_FLOWS_JUDGE_MODEL ?? "anthropic/claude-haiku-4-5";
 const capUsd = Number(flag("cap", "1.00"));
+// Per-agent timeout (ms) for BOTH the subject arms and the cross-model judge. Default 120s; raise it for the
+// heavy review/evaluate cases whose arms (or the judge grading a long answer) legitimately run minutes — a too-low cap surfaces as ⚠ child timeout
+// and drops the case from the duel. Override: --timeout=300000 or PI_FLOWS_TIMEOUT_MS; a non-numeric (NaN/Infinity) value falls back to 120000 rather than reaching the agents.
+const requestedTimeoutMs = Number(flag("timeout", process.env.PI_FLOWS_TIMEOUT_MS ?? "120000"));
+const timeoutMs = Number.isFinite(requestedTimeoutMs) ? requestedTimeoutMs : 120000;
 const dryRun = args.includes("--dry-run");
 const pairwise = args.includes("--pairwise");
+// --filter is a comma-separated set of name substrings; a case matches if it contains ANY term.
 const filter = flag("filter", "");
+const filterTerms = filter.split(",").map((t) => t.trim()).filter(Boolean);
 const writeArtifact = flag("write", "");
+// Same judge-bin resolution as the main harness: the committed wrapper keeps
+// extension-provided model providers available to thulr's judge (and duel) calls.
+const defaultJudgeBin = "scripts/thulr-judge-pi.sh";
+const configuredJudgeBin = flag("judge-bin", null) ?? process.env.THULR_JUDGE_BIN ?? null;
+const judgeBin = configuredJudgeBin ?? (existsSync(resolve(process.cwd(), defaultJudgeBin)) ? defaultJudgeBin : null);
+
+// Regenerated duel artifacts (under the gitignored .thulr/ store): one trace per arm
+// plus the persisted thulr.duel_report.v1.
+const RUNS_DIR = resolve(process.cwd(), ".thulr/runs");
+const FLOWS_TRACE = join(RUNS_DIR, "compare-flows.jsonl");
+const PLAIN_TRACE = join(RUNS_DIR, "compare-plain.jsonl");
+const DUEL_OUT = join(RUNS_DIR, "compare-duel.json");
 
 function preflight() {
 	if (dryRun) return true;
@@ -72,7 +95,7 @@ async function runArm(kind, testCase, flow, signal) {
 	const cwd = caseCwd(testCase, { dryRun });
 	const flowCtx = { cwd, hasUI: false, ui: { confirm: async () => true, notify: () => undefined } };
 	const ctx = { flow, model: subjectModel, dryRun, flowCtx };
-	const judgeCtx = { flow, model: judgeModel, dryRun, flowCtx, maxCostUsd: capUsd };
+	const judgeCtx = { flow, model: judgeModel, dryRun, flowCtx, maxCostUsd: capUsd, timeoutMs };
 	const startedAt = Date.now();
 
 	let result;
@@ -80,7 +103,7 @@ async function runArm(kind, testCase, flow, signal) {
 	if (dryRun) {
 		result = testCase.mock;
 	} else if (kind === "flows") {
-		const params = { ...(useAgentModels ? structuredClone(testCase.params) : injectModel(testCase.params, model)), traceLabel: testCase.name, maxCostUsd: testCase.params.maxCostUsd ?? capUsd, timeoutMs: testCase.params.timeoutMs ?? 120000 };
+		const params = { ...(useAgentModels ? structuredClone(testCase.params) : injectModel(testCase.params, model)), traceLabel: testCase.name, maxCostUsd: testCase.params.maxCostUsd ?? capUsd, timeoutMs: testCase.params.timeoutMs ?? timeoutMs };
 		try {
 			result = await flow.execute(`cmp:flows:${testCase.name}`, params, signal, undefined, flowCtx);
 		} catch (error) {
@@ -90,7 +113,7 @@ async function runArm(kind, testCase, flow, signal) {
 		// Plain arm: the raw task a user would type to pi with no flows installed.
 		const task = testCase.baselinePrompt ?? testCase.params.task;
 		try {
-			result = await runPlainPi({ task, cwd, model: subjectModel, timeoutMs: 120000, signal });
+			result = await runPlainPi({ task, cwd, model: subjectModel, timeoutMs, signal });
 		} catch (error) {
 			thrown = error;
 		}
@@ -100,19 +123,21 @@ async function runArm(kind, testCase, flow, signal) {
 	return { ...arm, durationMs: Date.now() - startedAt, answer: answerText(result) };
 }
 
-// Order-bias-controlled pairwise verdict: judge twice with the answers swapped, and
-// only call a winner when BOTH orderings agree — otherwise it's a tie (a flip means
-// the judge is keying on position, not content).
-async function pairwiseVerdict(judgeCtx, { criteria, flowsAnswer, plainAnswer }) {
-	const c1 = await judgePairwise(judgeCtx, { criteria, answerA: flowsAnswer, answerB: plainAnswer });
-	const c2 = await judgePairwise(judgeCtx, { criteria, answerA: plainAnswer, answerB: flowsAnswer });
-	const cost = (c1.cost ?? 0) + (c2.cost ?? 0);
-	const infra = c1.infra ?? c2.infra ?? null;
-	if (infra) return { winner: "error", v1: "error", v2: "error", cost, infra };
-	const v1 = c1.winner === "TIE" ? "tie" : c1.winner === "A" ? "flows" : "plain"; // flows was A
-	const v2 = c2.winner === "TIE" ? "tie" : c2.winner === "A" ? "plain" : "flows"; // flows was B
-	const winner = v1 === "flows" && v2 === "flows" ? "flows" : v1 === "plain" && v2 === "plain" ? "plain" : "tie";
-	return { winner, v1, v2, cost, infra: null };
+// Emit a self-contained thulr trace for one arm: one case span per eligible row,
+// carrying the same case id and criterion in both arms so `thulr duel` can pair
+// them. `file` is the trace path to write; `pick(row)` selects the arm (flows/plain) whose answer this trace grades.
+function emitArmTrace(file, eligibleRows, pick) {
+	thulr.startTrace(file); // presumably (re)initialises the trace file before spans are appended — confirm in thulr.mjs
+	for (const r of eligibleRows) {
+		thulr.appendCaseSpans(file, {
+			name: r.name, // same value in both arm traces; duel outcomes are joined back to rows on it
+			answer: pick(r).answer,
+			criterion: r.criterion,
+			task: r.task, // rows store baselinePrompt ?? params.task, so both arms carry the same task text
+			model: subjectModel, // undefined when agents run on their own frontmatter models
+			endMs: Date.now(), // stamped at emit time, not when the arm actually finished
+		});
+	}
+}
 
 function armLine(label, arm) {
@@ -122,10 +147,18 @@ function armLine(label, arm) {
 
 const pickArm = (a) => ({ judgePass: a.judged.pass, judgeScore: a.judged.score, objPass: a.objective.pass, objScore: a.objective.score, cost: a.cost, durationMs: a.durationMs, infra: a.reachedModel ?? null, answer: (a.answer ?? "").slice(0, 1000) });
 
+const duelLabel = (outcome) => ( // maps one duel case outcome to the label printed in the report
+	outcome === "wina" ? "▲ flows" // arm A is the flows trace (the duel is called with labelA: "flows")
+		: outcome === "winb" ? "▼ plain" // arm B is the plain-pi trace (labelB: "plain")
+			: outcome === "flip" ? "⚠ flip (judge position bias)" // the two swapped orderings disagreed
+				: outcome === "tie" ? "= tie"
+					: "— (not judged)" // null or unrecognised outcome, e.g. a row with no entry in the duel report
+);
+
 async function main() {
 	if (!preflight()) process.exit(2);
 
-	const selected = CASES.filter((c) => !filter || c.name.includes(filter));
+	const selected = CASES.filter((c) => filterTerms.length === 0 || filterTerms.some((t) => c.name.includes(t)));
 	if (selected.length === 0) {
 		console.error(`No cases match --filter=${filter}. Available: ${CASES.map((c) => c.name).join(", ")}`);
 		process.exit(2);
@@ -134,21 +167,17 @@ async function main() {
 	const flow = flowTool();
 	const signal = new AbortController().signal;
 	const trace = process.env.PI_FLOWS_TRACE_FILE ? `  ·  trace ${process.env.PI_FLOWS_TRACE_FILE}` : "";
-	console.log(`pi-flows A/B (flows vs plain pi)  ·  subject ${useAgentModels ? "(agent frontmatter)" : model}  ·  judge ${dryRun ? "(skipped)" : judgeModel}${pairwise ? " +pairwise" : ""}  ·  cap $${capUsd.toFixed(2)}/case${trace}${dryRun ? "  ·  DRY RUN" : ""}\n`);
+	console.log(`pi-flows A/B (flows vs plain pi)  ·  subject ${useAgentModels ? "(agent frontmatter)" : model}  ·  judge ${dryRun ? "(skipped)" : judgeModel}${pairwise ? " +duel" : ""}  ·  cap $${capUsd.toFixed(2)}/case  ·  timeout ${Math.round(timeoutMs / 1000)}s/agent${trace}${dryRun ? "  ·  DRY RUN" : ""}\n`);
 
 	const rows = [];
-	let pairwiseCost = 0;
 	for (const testCase of selected) {
 		const flows = await runArm("flows", testCase, flow, signal);
 		const plain = await runArm("plain", testCase, flow, signal);
 
-		let pv = null;
-		if (pairwise && !flows.reachedModel && !plain.reachedModel && testCase.criterion) {
-			const judgeCtx = { flow, model: judgeModel, dryRun, flowCtx: { cwd: process.cwd(), hasUI: false, ui: { confirm: async () => true, notify: () => undefined } }, maxCostUsd: capUsd };
-			pv = await pairwiseVerdict(judgeCtx, { criteria: testCase.criterion, flowsAnswer: flows.answer, plainAnswer: plain.answer });
-			pairwiseCost += pv.cost ?? 0;
-		}
-		rows.push({ name: testCase.name, flows, plain, pv });
+		// Eligible for the duel only when BOTH arms reached the model (an infra miss
+		// has no real answer to compare) and the case states a criterion to judge.
+		const duelEligible = !flows.reachedModel && !plain.reachedModel && Boolean(testCase.criterion);
+		rows.push({ name: testCase.name, criterion: testCase.criterion, task: testCase.baselinePrompt ?? testCase.params.task, flows, plain, duelEligible, outcome: null });
 
 		const dj = (flows.judged.score ?? 0) - (plain.judged.score ?? 0);
 		const arrow = dj > 0.001 ? "▲ flows" : dj < -0.001 ? "▼ plain" : "= tie";
@@ -156,13 +185,33 @@ async function main() {
 		console.log(armLine("flows", flows));
 		console.log(armLine("plain", plain));
 		console.log(`   judge Δ ${fixed(dj)}  ${arrow}`);
-		if (pv) {
-			const label = pv.winner === "flows" ? "▲ flows" : pv.winner === "plain" ? "▼ plain" : pv.winner === "error" ? `⚠ ${pv.infra}` : "= tie";
-			console.log(`   pairwise ${label}  (swap: ${pv.v1}, ${pv.v2})`);
-		}
 		console.log("");
 	}
 
+	// Relative head-to-head via thulr's calibrated, position-swapped duel. One
+	// `thulr duel` over the two arm traces does every swap-controlled comparison at
+	// once (paired by case id), replacing the harness's old in-process pairwise judge.
+	let duelReport = null;
+	if (pairwise && !dryRun) {
+		const eligible = rows.filter((r) => r.duelEligible);
+		if (!thulr.available()) {
+			console.log("⚠ --pairwise needs the `thulr` CLI for the duel (relative judging); it was not found on PATH.\n  Install it (e.g. `cargo install thulr`) or drop --pairwise. The absolute deltas above still stand.\n");
+		} else if (eligible.length === 0) {
+			console.log("⚠ --pairwise: no case had both arms reach the model, so there is nothing to duel.\n");
+		} else {
+			mkdirSync(RUNS_DIR, { recursive: true });
+			emitArmTrace(FLOWS_TRACE, eligible, (r) => r.flows);
+			emitArmTrace(PLAIN_TRACE, eligible, (r) => r.plain);
+			try {
+				duelReport = thulr.duel({ traceA: FLOWS_TRACE, traceB: PLAIN_TRACE, labelA: "flows", labelB: "plain", model: judgeModel, out: DUEL_OUT, judgeBin, json: true });
+				const outcomeByCase = new Map((duelReport.cases ?? []).map((c) => [c.case_id, c.outcome]));
+				for (const r of rows) r.outcome = outcomeByCase.get(r.name) ?? null;
+			} catch (error) {
+				console.log(`⚠ thulr duel failed: ${error?.message ?? error}\n`);
+			}
+		}
+	}
+
 	const fJudge = mean(rows.map((r) => r.flows.judged.score ?? 0));
 	const pJudge = mean(rows.map((r) => r.plain.judged.score ?? 0));
 	const fCrit = rows.filter((r) => r.flows.judged.pass).length;
@@ -175,20 +224,29 @@ async function main() {
 	const losses = rows.filter((r) => (r.plain.judged.score ?? 0) - (r.flows.judged.score ?? 0) > 0.001).length;
 
 	console.log(`Summary over ${rows.length} case${rows.length === 1 ? "" : "s"}`);
-	if (pairwise) {
-		const pw = (w) => rows.filter((r) => r.pv?.winner === w).length;
-		console.log(`  pairwise       flows wins ${pw("flows")} · plain wins ${pw("plain")} · ties ${pw("tie")}${pw("error") ? ` · errors ${pw("error")}` : ""}   (order-controlled — the sensitive metric)`);
-	}
 	console.log(`  abs judge pass flows ${fCrit}/${rows.length} (${pct(fCrit, rows.length)})    plain ${pCrit}/${rows.length} (${pct(pCrit, rows.length)})`);
-	console.log(`  abs mean judge flows ${fJudge.toFixed(2)}    plain ${pJudge.toFixed(2)}    lift ${fixed(fJudge - pJudge)}  (low resolution — read pairwise instead)`);
+	console.log(`  abs mean judge flows ${fJudge.toFixed(2)}    plain ${pJudge.toFixed(2)}    lift ${fixed(fJudge - pJudge)}  (low resolution — read the duel instead)`);
 	console.log(`  abs per-case   flows wins ${wins} · plain wins ${losses} · ties ${rows.length - wins - losses}`);
-	console.log(`  cost           flows $${fCost.toFixed(4)}    plain $${pCost.toFixed(4)}    (${pCost > 0 ? `${(fCost / pCost).toFixed(1)}× more` : "n/a"})${pairwise ? `  ·  pairwise judging $${pairwiseCost.toFixed(4)}` : ""}`);
+	console.log(`  cost           flows $${fCost.toFixed(4)}    plain $${pCost.toFixed(4)}    (${pCost > 0 ? `${(fCost / pCost).toFixed(1)}× more` : "n/a"})`);
 	console.log(`  wall-clock     flows ${fSec.toFixed(0)}s    plain ${pSec.toFixed(0)}s`);
-	console.log("\nNote: pairwise (same criterion, cross-model judge, told not to reward length) is the fair head-to-head. Some objective checks are pi-flows-only (route dispatch, same-model vote warning); plain pi cannot satisfy them by design, so read those as capabilities flows adds, not plain losses.");
+
+	if (pairwise && duelReport) {
+		console.log("\nPairwise duel (thulr · order-controlled relative judging — the sensitive metric)");
+		for (const r of rows) {
+			if (!r.duelEligible) {
+				console.log(`   ${r.name.padEnd(34)} — (skipped: an arm did not reach the model)`);
+				continue;
+			}
+			console.log(`   ${r.name.padEnd(34)} ${duelLabel(r.outcome)}`);
+		}
+		for (const line of thulr.formatDuelSummary(duelReport)) console.log(`   ${line}`);
+	}
+
+	console.log("\nNote: the thulr duel (same criterion, cross-model judge, positions swapped, told not to reward length) is the fair head-to-head; a flip means the judge keyed on position, not content. Some objective checks are pi-flows-only (route dispatch, same-model vote warning); plain pi cannot satisfy them by design, so read those as capabilities flows adds, not plain losses.");
 
 	if (writeArtifact && !dryRun) {
 		const out = resolve(process.cwd(), writeArtifact);
-		writeFileSync(out, `${JSON.stringify({ model: useAgentModels ? "agent" : model, judgeModel, capUsd, pairwise, rows: rows.map((r) => ({ name: r.name, pairwise: r.pv?.winner ?? null, pairwiseSwap: r.pv ? [r.pv.v1, r.pv.v2] : null, flows: pickArm(r.flows), plain: pickArm(r.plain) })) }, null, 2)}\n`, "utf8");
+		writeFileSync(out, `${JSON.stringify({ model: useAgentModels ? "agent" : model, judgeModel, capUsd, pairwise, duel: duelReport?.summary ?? null, rows: rows.map((r) => ({ name: r.name, duel: r.outcome ?? null, flows: pickArm(r.flows), plain: pickArm(r.plain) })) }, null, 2)}\n`, "utf8");
 		console.log(`\nWrote comparison: ${out}`);
 	}
 
diff --git a/evals/judge.mjs b/evals/judge.mjs
index 5d3ae4e..7557a8c 100644
--- a/evals/judge.mjs
+++ b/evals/judge.mjs
@@ -45,7 +45,7 @@ export async function judge(ctx, { criteria, answer }) {
 
 	const result = await ctx.flow.execute(
 		"eval:judge",
-		{ agent: "redteam", task, model: ctx.model, tools: "none", maxCostUsd: Math.min(ctx.maxCostUsd ?? 0.1, 0.1), timeoutMs: 120000 },
+		{ agent: "redteam", task, model: ctx.model, tools: "none", maxCostUsd: Math.min(ctx.maxCostUsd ?? 0.1, 0.1), timeoutMs: ctx.timeoutMs ?? 120000 },
 		new AbortController().signal,
 		undefined,
 		ctx.flowCtx,
@@ -66,48 +66,3 @@ export async function judge(ctx, { criteria, answer }) {
 
 	return { pass, score: Number.isFinite(score) ? score : pass ? 1 : 0, reasoning: out.replace(/\s+/g, " ").slice(0, 200), cost, infra: null };
 }
-
-// Pairwise preference judge for the flows-vs-plain A/B. Shows the judge BOTH answers
-// to the same task and asks which better meets the criterion — far more sensitive to
-// small differences than independent absolute scoring, and explicitly told NOT to
-// reward length (the compact-summary bias we want to rule out). Position bias is
-// handled by the caller, which runs this twice with the answers swapped. Same
-// tool-less `redteam` call on the cross-vendor judge model as judge().
-export async function judgePairwise(ctx, { criteria, answerA, answerB }) {
-	if (ctx.dryRun) return { winner: "TIE", reasoning: "(dry-run: judge skipped)", cost: 0, infra: null };
-
-	const task = [
-		"Two answers — A and B — respond to the SAME task. Pick the one that better satisfies the criterion below. If they meet it equally well, answer TIE.",
-		"Judge content and correctness only. Do NOT reward greater length: a concise correct answer is not worse than a longer one that conveys the same thing.",
-		`Criterion: ${criteria}`,
-		"",
-		"--- ANSWER A ---",
-		answerA,
-		"--- END ANSWER A ---",
-		"",
-		"--- ANSWER B ---",
-		answerB,
-		"--- END ANSWER B ---",
-		"",
-		"Reply with exactly two lines and nothing else:",
-		"WINNER: A or B or TIE",
-		"REASON: <one short line>",
-	].join("\n");
-
-	const result = await ctx.flow.execute(
-		"eval:pairwise",
-		{ agent: "redteam", task, model: ctx.model, tools: "none", maxCostUsd: Math.min(ctx.maxCostUsd ?? 0.1, 0.1), timeoutMs: 120000 },
-		new AbortController().signal,
-		undefined,
-		ctx.flowCtx,
-	);
-
-	const infra = judgeInfraError(result);
-	const cost = judgeCost(result);
-	if (infra) return { winner: "ERROR", reasoning: infra, cost, infra };
-
-	const out = result?.content?.[0]?.text ?? "";
-	const match = out.match(/WINNER:\s*(A|B|TIE)/i);
-	const winner = match ? match[1].toUpperCase() : "TIE";
-	return { winner, reasoning: out.replace(/\s+/g, " ").slice(0, 200), cost, infra: null };
-}
diff --git a/evals/pareto.mjs b/evals/pareto.mjs
new file mode 100644
index 0000000..fc678f9
--- /dev/null
+++ b/evals/pareto.mjs
@@ -0,0 +1,45 @@
+// FREE corpus-wide failure-mode ranking: which failure on which prompt/config
+// version to fix first, joining deterministic labels, human reviews, and stored
+// EvalRun scores. Thin wrapper over `thulr pareto` (no judge calls, no tokens).
+// Reads the regenerated eval trace by default.
+//
+//   npm run eval:pareto                            # rank by prompt version over evals/thulr-trace.jsonl
+//   npm run eval:pareto -- --by config-version     # split by config (subject model) instead
+//   npm run eval:pareto -- --traces .thulr/traces  # scan a directory of traces recursively
+//   npm run eval:pareto -- --limit 10              # show only the top N rows
+//   npm run eval:pareto -- --json                  # machine-readable
+//
+// Flags accept either `--name value` (as thulr's own CLI does) or `--name=value`.
+import { existsSync } from "node:fs";
+import { resolve } from "node:path";
+import { parseArgs } from "./args.mjs";
+import * as thulr from "./thulr.mjs";
+
+const opts = parseArgs(process.argv.slice(2));
+const tracesArg = opts.traces ?? "evals/thulr-trace.jsonl";
+const by = opts.by ?? "prompt-version";
+const limitOpt = opts.limit;
+const json = Boolean(opts.json);
+
+// Report a usage/environment problem and stop (exit 2 unless told otherwise).
+const fail = (message, code = 2) => {
+	console.error(message);
+	process.exit(code);
+};
+
+if (!thulr.available()) fail("✗ `thulr` was not found on PATH.\n  Install it (e.g. `cargo install thulr`) — `thulr pareto` ranks failure modes across stored traces.");
+// A value-taking flag given bare (e.g. `--traces --json`) parses as boolean true; catch that before resolve() throws on it.
+if (typeof tracesArg !== "string") fail("✗ --traces needs a value: --traces <dir|file>.");
+const traces = resolve(process.cwd(), tracesArg);
+if (!existsSync(traces)) fail(`✗ Traces path not found: ${tracesArg}\n  Run \`npm run eval\` first to produce evals/thulr-trace.jsonl, or pass --traces <dir|file>.`);
+if (!["prompt-version", "config-version"].includes(by)) fail(`✗ --by must be one of prompt-version | config-version, got '${by}'.`);
+if (limitOpt !== undefined && !/^\d+$/.test(String(limitOpt))) fail(`✗ --limit must be a non-negative integer, got '${limitOpt}'.`);
+
+try {
+	const limit = limitOpt === undefined ? undefined : Number(limitOpt);
+	const result = thulr.pareto({ traces, by, limit, json });
+	process.stdout.write(json ? `${JSON.stringify(result, null, 2)}\n` : result);
+} catch (error) {
+	console.error(`thulr pareto failed: ${error?.message ?? error}`);
+	process.exit(1);
+}
diff --git a/evals/review.mjs b/evals/review.mjs
new file mode 100644
index 0000000..311cd5e
--- /dev/null
+++ b/evals/review.mjs
@@ -0,0 +1,56 @@
+// Record (or list) a human SME review verdict for an eval case. The verdict set is
+// folded into thulr's calibration on the next `npm run eval` as judge-vs-human
+// ground truth (TPR/TNR) on top of the deterministic-label axis. Thin wrapper over
+// `thulr review` that defaults the trace to evals/thulr-trace.jsonl, so the verdict
+// lands at the path the main harness auto-discovers.
+//
+//   npm run eval:review -- --list                                   # reviewed / unreviewed case ids
+//   npm run eval:review -- --case route-classifies-bug-to-recon --verdict pass
+//   npm run eval:review -- --case single-answer-quality-judged --verdict fail --failure-mode final_answer.incomplete --note "missed the TTL bug"
+//   npm run eval:review -- --case vote-reaches-known-consensus --verdict unsure --reviewer justin
+//   npm run eval:review -- --trace evals/thulr-trace.jsonl --case x --verdict pass   # explicit trace
+//
+// Flags accept either `--name value` (as thulr's own CLI does) or `--name=value`.
+import { existsSync } from "node:fs";
+import { resolve } from "node:path";
+import { parseArgs } from "./args.mjs";
+import * as thulr from "./thulr.mjs";
+
+const opts = parseArgs(process.argv.slice(2));
+const traceArg = opts.trace ?? "evals/thulr-trace.jsonl";
+const list = Boolean(opts.list);
+const caseId = opts.case ?? null;
+const verdict = opts.verdict ?? null;
+const failureMode = opts["failure-mode"] ?? null;
+const note = opts.note ?? null;
+const reviewer = opts.reviewer ?? null;
+
+const usage = "Record a verdict:  npm run eval:review -- --case <id> --verdict <pass|fail|unsure> [--failure-mode <tag>] [--note <text>] [--reviewer <name>]\nList state:        npm run eval:review -- --list";
+
+// Report a usage/environment problem and stop. Exit code 2 marks a problem the
+// caller can fix (missing CLI, bad flag); the thulr failure below exits 1.
+const fail = (message, code = 2) => {
+	console.error(message);
+	process.exit(code);
+};
+
+if (!thulr.available()) fail("✗ `thulr` was not found on PATH.\n  Install it (e.g. `cargo install thulr`) — `thulr review` records the SME verdicts that calibration reads.");
+// A value-taking flag given bare (e.g. `--case --verdict pass`) parses as boolean
+// true. Reject it here: resolve() throws on it, and it is forwarded verbatim into thulr's argv.
+for (const name of ["trace", "case", "verdict", "failure-mode", "note", "reviewer"]) {
+	if (opts[name] !== undefined && typeof opts[name] !== "string") fail(`✗ --${name} needs a value.\n\n${usage}`);
+}
+const trace = resolve(process.cwd(), traceArg);
+if (!existsSync(trace)) fail(`✗ Trace not found: ${traceArg}\n  Run \`npm run eval\` first to produce it, or pass --trace <path>.`);
+if (!list) {
+	if (!caseId || !verdict) fail(usage);
+	if (!["pass", "fail", "unsure"].includes(verdict)) fail(`✗ --verdict must be one of pass | fail | unsure, got '${verdict}'.\n\n${usage}`);
+}
+
+try {
+	process.stdout.write(thulr.review({ trace, list, caseId, verdict, failureMode, note, reviewer }));
+	if (!list) console.log("\n✓ Recorded. The next `npm run eval` folds this into calibration (judge-vs-human TPR/TNR).");
+} catch (error) {
+	console.error(`thulr review failed: ${error?.message ?? error}`);
+	process.exit(1);
+}
diff --git a/evals/run.mjs b/evals/run.mjs
index ddd781b..c6d5ce0 100644
--- a/evals/run.mjs
+++ b/evals/run.mjs
@@ -14,7 +14,9 @@
 //   npm run eval -- --judge-bin=/path/to/judge-wrapper   # override thulr's judge command
 //   npm run eval -- --samples=3           # judge each case 3x: majority verdict, mean score, judge-noise stddev + flake warnings
 //   npm run eval -- --eval-set=.thulr/eval-sets/smoke.json   # overlay promoted criteria/authority metadata
+//   npm run eval -- --reviews=.thulr/reviews/thulr-trace.reviews.json   # fold human SME verdicts into calibration (judge-vs-human TPR/TNR)
 //   npm run eval -- --efficiency-guardrail=cost_usd --efficiency-guardrail=tokens   # fail on spend/size regressions
+//   npm run eval -- --score-guardrail=evidence_quality   # also gate a named-criteria dimension's score (criterion is always gated)
 //   npm run eval -- --noise-band=0.10    # judge/efficiency regression tolerance (default 0.05)
 //   npm run eval -- --write-baseline      # promote this run to evals/thulr-baseline.json (the gate baseline)
 //   npm run eval -- --compare-baseline=evals/thulr-baseline.json   # gate against a specific baseline
@@ -40,7 +42,7 @@
 // The harness emits ONE self-contained trace (evals/thulr-trace.jsonl) — each case's
 // answer, criterion, objective label, task text, expected behavior, failure labels,
 // config/prompt version, and cost/token telemetry inline —
-// and shells out to the `thulr` CLI (0.1.2) for judge -> calibrate -> gate ->
+// and shells out to the `thulr` CLI (0.1.3) for judge -> calibrate -> gate ->
 // baseline. thulr reads everything from the trace, so there are no separate
 // cases-manifest or labels files.
 //
@@ -50,7 +52,7 @@
 // (thulr run-experiment / optimize) owns judging and selection.
 import { execFileSync } from "node:child_process";
 import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
-import { dirname, join, resolve } from "node:path";
+import { basename, dirname, join, resolve } from "node:path";
 import { CALIBRATION_CASES, CASES } from "./cases.mjs";
 import { caseCwd, flowTool, scoreObjective, subjectModelName, sumTokens, DEFAULT_EVAL_MODEL } from "./lib.mjs";
 import { injectModel } from "./model-injection.mjs";
@@ -96,6 +98,11 @@ const evalSet = flag("eval-set", null);
 const redaction = flag("redaction", null);
 const rate = Number(flag("rate", "0"));
 const efficiencyGuardrails = flags("efficiency-guardrail");
+// Opt-in per-dimension SCORE guardrails beyond the always-on `criterion`: name a
+// thulr.criteria.<dimension> (e.g. --score-guardrail=evidence_quality) to fail the
+// gate when that named dimension's mean score regresses. Off by default so a new
+// dimension is observed for a few runs before it can block.
+const extraScoreGuardrails = flags("score-guardrail");
 const noiseBand = Number(flag("noise-band", "0.05"));
 // Emit-the-trace-and-stop: the command-template mode `thulr run-experiment` and
 // `thulr optimize` drive ("the template MUST emit a structured JSONL trace to {out}").
@@ -126,6 +133,14 @@ const compareFlag = flag("compare-baseline", null);
 const gateBaseline = compareFlag ? p(compareFlag) : existsSync(BASELINE_DEFAULT) ? BASELINE_DEFAULT : null;
 const writeBaseline = has("write-baseline") ? p(flag("write-baseline", "") || BASELINE_DEFAULT) : null;
 const LABELS = p(".thulr/runs/candidate.labels.json");
+// Human-review calibration (thulr review): an SME verdict set folded into
+// `thulr calibrate` as judge-vs-human ground truth (TPR/TNR) on top of the
+// deterministic-label axis. Defaults to the path `thulr review` writes for this
+// trace, so recording a verdict (`npm run eval:review -- --case <id> --verdict
+// <pass|fail>`) is enough — the next `npm run eval` picks it up with no flag.
+const reviewsFlag = flag("reviews", null);
+const reviewsDefault = p(`.thulr/reviews/${basename(TRACE).replace(/\.jsonl$/, "")}.reviews.json`);
+const reviews = reviewsFlag ? p(reviewsFlag) : existsSync(reviewsDefault) ? reviewsDefault : null;
 
 function preflight() {
 	if (dryRun) return true;
@@ -208,6 +223,7 @@ async function main() {
 				name: testCase.name,
 				answer,
 				criterion: testCase.criterion,
+				namedCriteria: testCase.namedCriteria,
 				label: !!objective.pass,
 				endMs: endedAt,
 				model: subjectModelName(result, useAgentModels ? "agent-frontmatter" : model),
@@ -312,15 +328,20 @@ async function main() {
 			if (crit) console.log(`  judge noise across ${samples} samples: score stddev ±${(crit.score_stddev ?? 0).toFixed(3)}`);
 		}
 
-		// Calibration: how well the judge's verdicts track the deterministic labels.
+		// Calibration: how well the judge's verdicts track the deterministic labels
+		// (and human SME verdicts too, when a review set is present — judge-vs-human
+		// TPR/TNR). thulr 0.1.3 also queues every judge/ground-truth disagreement onto
+		// the triage queue (`thulr queue`) and feeds this calibration into the gate:
+		// a judge blind in either direction downgrades a clean PASS to WARN.
 		console.log("");
-		process.stdout.write(thulr.calibrate(CANDIDATE, { labels: LABELS }));
+		process.stdout.write(thulr.calibrate(CANDIDATE, { labels: LABELS, reviews }));
+		if (reviews) console.log(`folded human review verdicts from ${rel(reviews)} into calibration (judge-vs-human TPR/TNR above).`);
 		if (calibrationSummaries.length) {
 			console.log(`release gate excludes ${calibrationSummaries.length} calibration canar${calibrationSummaries.length === 1 ? "y" : "ies"} from pass-rate comparison; full judged run remains ${rel(CANDIDATE)}.`);
 		}
 
 		if (gateBaseline) {
-			const gateOptions = { baseline: gateBaseline, candidate: gateCandidate, guardrails: ["criterion"], scoreGuardrails: ["criterion"], efficiencyGuardrails, noiseBand, redaction };
+			const gateOptions = { baseline: gateBaseline, candidate: gateCandidate, guardrails: ["criterion"], scoreGuardrails: [...new Set(["criterion", ...extraScoreGuardrails])], efficiencyGuardrails, noiseBand, redaction };
 			try {
 				const gateJson = thulr.gate({ ...gateOptions, json: true });
 				const deltaLines = thulr.formatGateScoreSummary(gateJson.report);
diff --git a/evals/thulr.mjs b/evals/thulr.mjs
index 3486ec5..dd120b4 100644
--- a/evals/thulr.mjs
+++ b/evals/thulr.mjs
@@ -3,10 +3,10 @@
 // shells out to thulr for the judge -> calibrate -> gate -> baseline pipeline,
 // replacing the harness's old in-process LLM judge + hand-rolled baseline compare.
 //
-// thulr (0.1.2 contract, docs/trace-contract.md in the thulr repo) ingests a
-// SELF-CONTAINED trace. Each case's criterion and its deterministic objective
-// label travel INLINE in the span attributes, established from thulr's
-// openinference_trace adapter:
+// thulr (0.1.3 contract — additive over 0.1.2, docs/trace-contract.md in the
+// thulr repo) ingests a SELF-CONTAINED trace. Each case's criterion and its
+// deterministic objective label travel INLINE in the span attributes,
+// established from thulr's openinference_trace adapter:
 //   thulr.case_id            – the case identifier (thulr groups spans by this)
 //   thulr.criterion          – the one literal criterion the judge grades against
 //   thulr.deterministic_label – the objective pass/fail (boolean) for calibration
@@ -36,10 +36,17 @@ const spanId = () => randomUUID().replace(/-/g, "");
  * inspect/label workflows while preserving the "latest output.value wins" rule.
  * Returned (not written) so the caller controls the file.
  *
- * @param {{name: string, answer: string, criterion: string, label?: boolean, endMs: number, model?: string, task?: string, expectedBehavior?: string, failureModes?: string[], costUsd?: number, tokensTotal?: number, promptVersion?: string, configVersion?: string}} input
+ * `namedCriteria` adds thulr 0.1.3 multi-dimension judging: each `{ dimension:
+ * "criterion text" }` entry is emitted as a `thulr.criteria.<dimension>` attribute
+ * on the graded span and judged into its own dimension alongside the required
+ * `thulr.criterion` — per-dimension pass-rate, score delta, and calibration. The
+ * dimension name must be non-empty, whitespace-free, and not `criterion`, or thulr
+ * fails ingestion naming the offending key.
+ *
+ * @param {{name: string, answer: string, criterion: string, label?: boolean, endMs: number, model?: string, task?: string, expectedBehavior?: string, failureModes?: string[], costUsd?: number, tokensTotal?: number, promptVersion?: string, configVersion?: string, namedCriteria?: Record<string, string>}} input
  * @returns {object[]}
  */
-export function traceSpansForCase({ name, answer, criterion, label, endMs, model, task, expectedBehavior, failureModes, costUsd, tokensTotal, promptVersion, configVersion }) {
+export function traceSpansForCase({ name, answer, criterion, label, endMs, model, task, expectedBehavior, failureModes, costUsd, tokensTotal, promptVersion, configVersion, namedCriteria }) {
 	const traceId = spanId();
 	const rootSpanId = spanId();
 	const answerSpanId = spanId();
@@ -60,6 +67,11 @@ export function traceSpansForCase({ name, answer, criterion, label, endMs, model
 	const finalAttributes = { ...commonAttributes, "output.value": answer };
 	if (costUsd !== undefined) finalAttributes["thulr.cost_usd"] = costUsd;
 	if (tokensTotal !== undefined) finalAttributes["llm.token_count.total"] = tokensTotal;
+	// Multi-dimension judging: each non-empty named criterion rides the graded span
+	// as thulr.criteria.<dimension>; thulr grades each into its own dimension.
+	for (const [dimension, value] of Object.entries(namedCriteria ?? {})) {
+		if (value) finalAttributes[`thulr.criteria.${dimension}`] = value;
+	}
 	return [
 		{
 			trace_id: traceId,
@@ -165,6 +177,72 @@ export function calibrateArgs({ evalRun, labels, reviews }) {
 	return args;
 }
 
+/**
+ * Assemble the argv for `thulr duel`: pairwise, position-swapped RELATIVE judging,
+ * the head-to-head metric for arms that absolute scoring can't separate because
+ * both pass at ~1.0. thulr pairs the two arm traces by `thulr.case_id`, judges
+ * every shared case twice with the arms swapped, and counts a win only when both
+ * orderings agree; opposite preferences are a `flip` (judge position bias) and
+ * stay out of the win rate. Costs two judge-model calls per shared case.
+ *
+ * @param {{traceA: string, traceB: string, labelA?: string, labelB?: string, model?: string, out?: string, evalSet?: string, concurrency?: number, judgeBin?: string, json?: boolean}} input
+ * @returns {string[]} `duel`, the flags whose value is truthy, then the two trace paths last
+ */
+export function duelArgs({ traceA, traceB, labelA, labelB, model, out, evalSet, concurrency, judgeBin, json }) {
+	return [
+		"duel",
+		...(json ? ["--json"] : []),
+		...(labelA ? ["--label-a", labelA] : []),
+		...(labelB ? ["--label-b", labelB] : []),
+		...(model ? ["--model", model] : []),
+		...(out ? ["--out", out] : []),
+		...(evalSet ? ["--eval-set", evalSet] : []),
+		...(concurrency ? ["--concurrency", String(concurrency)] : []),
+		...(judgeBin ? ["--judge-bin", judgeBin] : []),
+		traceA, traceB,
+	];
+}
+
+/**
+ * Build the `thulr pareto` argv. FREE corpus-wide failure-mode ranking: which
+ * failure on which prompt/config version to fix first, joining deterministic
+ * labels, human reviews, and stored EvalRun scores. No judge calls.
+ *
+ * @param {{traces?: string, by?: "prompt-version" | "config-version", limit?: number | null, json?: boolean}} input `limit` of null/undefined omits `--limit`; 0 is passed through
+ * @returns {string[]}
+ */
+export function paretoArgs({ traces, by, limit, json } = {}) {
+	const args = ["pareto"];
+	if (json) args.push("--json");
+	if (traces) args.push("--traces", traces);
+	if (by) args.push("--by", by);
+	if (limit != null) args.push("--limit", String(limit));
+	return args;
+}
+
+/**
+ * Assemble the argv for `thulr review`, which records (or, with `list`, lists) a
+ * human SME verdict for one trace case in a `thulr.review_set.v1` artifact;
+ * `calibrate --reviews` then measures judge-vs-human TPR/TNR on top of the
+ * deterministic-label axis. One verdict per invocation.
+ *
+ * Argv order: `review`, `--json` (if set), `--trace <trace>` (always), `--list`
+ * (if set), then each of `--case`, `--verdict`, `--failure-mode`, `--note`,
+ * `--reviewer`, `--out` whose value is truthy.
+ *
+ * @param {{trace: string, out?: string, list?: boolean, caseId?: string, verdict?: "pass" | "fail" | "unsure", failureMode?: string, note?: string, reviewer?: string, json?: boolean}} input
+ * @returns {string[]}
+ */
+export function reviewArgs({ trace, out, list, caseId, verdict, failureMode, note, reviewer, json }) {
+	const argv = json ? ["review", "--json", "--trace", trace] : ["review", "--trace", trace];
+	if (list) argv.push("--list");
+	const valued = { "--case": caseId, "--verdict": verdict, "--failure-mode": failureMode, "--note": note, "--reviewer": reviewer, "--out": out };
+	for (const [flag, value] of Object.entries(valued)) {
+		if (value) argv.push(flag, value);
+	}
+	return argv;
+}
+
 const fixed = (value, digits) => Number.isFinite(value) ? value.toFixed(digits) : "n/a";
 const signedFixed = (value, digits) => Number.isFinite(value) ? `${value >= 0 ? "+" : ""}${value.toFixed(digits)}` : "n/a";
 const pct = (value) => Number.isFinite(value) ? `${(value * 100).toFixed(1)}%` : "n/a";
@@ -213,6 +291,33 @@ export function formatGateScoreSummary(report) {
 	return lines;
 }
 
+/**
+ * Render a `thulr.duel_report.v1` as the lines pi-flows prints: head-to-head win
+ * counts, win rate over swap-consistent cases, arm A's margin, then position-bias
+ * flips (judge noise) and skipped cases when present. Arm labels are read from the
+ * report (`a.label` / `b.label`, else "A" / "B"). Accepts the object or its JSON text.
+ *
+ * @param {object | string | null | undefined} report
+ * @returns {string[]} summary lines; empty when the report has no `summary`
+ */
+export function formatDuelSummary(report) {
+	const doc = typeof report === "string" ? JSON.parse(report) : report;
+	const summary = doc?.summary;
+	if (!summary) return [];
+	const armA = doc.a?.label ?? "A";
+	const armB = doc.b?.label ?? "B";
+	const flipNote = summary.flips ? ` · flips ${summary.flips}` : "";
+	const out = [];
+	out.push(`${armA} wins ${summary.a_wins} · ${armB} wins ${summary.b_wins} · ties ${summary.ties}${flipNote}  (decided ${summary.decided})`);
+	out.push(`win rate: ${armA} ${pct(summary.win_rate_a)} · ${armB} ${pct(summary.win_rate_b)}  ·  margin ${armA} ${signedFixed(summary.margin_a, 3)}`);
+	const skipped = doc.skipped ?? [];
+	if (skipped.length > 0) {
+		const names = skipped.map((entry) => (Array.isArray(entry) ? `${entry[0]} (${entry[1]})` : String(entry)));
+		out.push(`skipped ${skipped.length}: ${names.join(", ")}`);
+	}
+	return out;
+}
+
 function summarizeCases(cases) {
 	const dimensions = new Map();
 	for (const c of cases) {
@@ -345,6 +450,33 @@ export function calibrate(evalRun, options = {}) {
 	return run(calibrateArgs({ evalRun, ...options })).stdout;
 }
 
+/**
+ * Run `thulr duel` over two arm traces. Spends judge tokens (two calls per shared
+ * case). Returns the parsed `thulr.duel_report.v1` when `json` is true, otherwise
+ * the human-readable report text; `out`, when given, also persists the artifact.
+ */
+export function duel(options) {
+	const output = run(duelArgs(options)).stdout;
+	if (!options.json) return output;
+	return JSON.parse(output);
+}
+
+/** Run `thulr pareto` to rank failure modes across stored traces. Free — no judge calls. */
+export function pareto(options = {}) {
+	const output = run(paretoArgs(options)).stdout;
+	return options.json ? JSON.parse(output) : output;
+}
+
+/**
+ * Run `thulr review` to record (or, with `list`, list) a human verdict for a trace
+ * case. Free. The `thulr.review_set.v1` artifact it writes feeds `calibrate --reviews`.
+ */
+export function review(options) {
+	const output = run(reviewArgs(options)).stdout;
+	if (!options.json) return output;
+	return JSON.parse(output);
+}
+
 /**
  * Gate a candidate EvalRun against a baseline EvalRun. Free. Returns the exit
  * code (10 = FAIL), whether it blocks, and the report — human-readable by
diff --git a/package.json b/package.json
index 947dbab..2346622 100644
--- a/package.json
+++ b/package.json
@@ -68,6 +68,8 @@
     "validate:agents": "node scripts/validate-agents.mjs",
     "eval": "node --import tsx evals/run.mjs",
     "eval:compare": "node --import tsx evals/compare.mjs",
+    "eval:review": "node evals/review.mjs",
+    "eval:pareto": "node evals/pareto.mjs",
     "trace:report": "node scripts/trace-report.mjs"
   }
 }
diff --git a/tests/eval-args.test.ts b/tests/eval-args.test.ts
new file mode 100644
index 0000000..48405b3
--- /dev/null
+++ b/tests/eval-args.test.ts
@@ -0,0 +1,23 @@
+import { test } from "node:test";
+import assert from "node:assert/strict";
+import { parseArgs } from "../evals/args.mjs";
+
+// parseArgs backs the eval CLI wrappers (review.mjs, pareto.mjs). It must accept
+// `--name value` (the style thulr's own CLI uses), `--name=value` (the harness
+// style), and bare boolean flags; a regression here silently drops a documented flag.
+test("parseArgs accepts space-separated and =-separated flags and bare booleans", () => {
+	assert.deepEqual(
+		parseArgs(["--case", "route-x", "--verdict=fail", "--note", "missed the TTL bug", "--list"]),
+		{ case: "route-x", verdict: "fail", note: "missed the TTL bug", list: true },
+	);
+});
+
+// A bare flag directly followed by another flag stays boolean `true`; it must not consume that flag as its value.
+test("parseArgs treats a flag followed by another flag as boolean", () => {
+	assert.deepEqual(parseArgs(["--list", "--json"]), { list: true, json: true });
+});
+
+// Positional tokens are dropped from the result, and a repeated flag keeps only its last value.
+test("parseArgs ignores positionals and keeps the last value for repeats", () => {
+	assert.deepEqual(parseArgs(["pos", "--by", "prompt-version", "--by", "config-version"]), { by: "config-version" });
+});
diff --git a/tests/thulr-bridge.test.ts b/tests/thulr-bridge.test.ts
index 457594f..132d041 100644
--- a/tests/thulr-bridge.test.ts
+++ b/tests/thulr-bridge.test.ts
@@ -1,6 +1,6 @@
 import { test } from "node:test";
 import assert from "node:assert/strict";
-import { traceSpansForCase, gateBlocks, gateArgs, judgeArgs, calibrateArgs, inspectTraceArgs, labelFailuresArgs, formatGateScoreSummary, gateCandidateForEvalRun } from "../evals/thulr.mjs";
+import { traceSpansForCase, gateBlocks, gateArgs, judgeArgs, calibrateArgs, inspectTraceArgs, labelFailuresArgs, formatGateScoreSummary, gateCandidateForEvalRun, duelArgs, paretoArgs, reviewArgs, formatDuelSummary } from "../evals/thulr.mjs";
 
 // thulr ingests a SELF-CONTAINED trace: each case's criterion and its
 // deterministic (objective) label travel INLINE in the span attributes — no more
@@ -111,7 +111,7 @@ test("gateArgs passes both the pass-rate and mean-score guardrails", () => {
 	]);
 });
 
-// `--format junit` (thulr 0.1.2) replaces the terminal report on stdout with a
+// `--format junit` (thulr 0.1.2+) replaces the terminal report on stdout with a
 // JUnit XML testsuite for CI test ingestion; exit codes are unchanged.
 test("gateArgs renders the CI-native JUnit format when asked", () => {
 	const args = gateArgs({ baseline: "b.json", candidate: "c.json", format: "junit" });
@@ -171,7 +171,7 @@ test("gateCandidateForEvalRun excludes calibration canaries and recomputes summa
 	assert.equal(gateRun.summary[0].score_mean, 0.75);
 });
 
-// Judge repeat-sampling (thulr 0.1.2): `--samples N` judges each case N times and
+// Judge repeat-sampling (thulr 0.1.2+): `--samples N` judges each case N times and
 // aggregates (majority verdict, mean score). N=1 is the default — no flag emitted,
 // byte-identical to single-sample judging.
 test("judgeArgs passes --samples only when repeat-sampling is on", () => {
@@ -185,8 +185,83 @@ test("judgeArgs passes --samples only when repeat-sampling is on", () => {
 	);
 });
 
-test("trace inspection, failure labels, and calibration args match thulr 0.1.2", () => {
+test("trace inspection, failure labels, and calibration args match thulr 0.1.3", () => {
 	assert.deepEqual(inspectTraceArgs({ trace: "t.jsonl" }), ["inspect-trace", "--trace", "t.jsonl", "--json"]);
 	assert.deepEqual(labelFailuresArgs({ trace: "t.jsonl", out: "labels.json" }), ["label-failures", "--trace", "t.jsonl", "--out", "labels.json"]);
 	assert.deepEqual(calibrateArgs({ evalRun: "run.json", labels: "labels.json", reviews: "reviews.json" }), ["calibrate", "--labels", "labels.json", "--reviews", "reviews.json", "run.json"]);
 });
+
+// `thulr duel` (0.1.3) is the relative, position-swapped head-to-head the A/B uses
+// instead of a hand-rolled pairwise judge. Traces are the trailing positionals;
+// --judge-bin rides the same wrapper as judge so extension providers stay available.
+test("duelArgs builds the pairwise duel argv; traces are the trailing positionals", () => {
+	const everyOption = { traceA: "a.jsonl", traceB: "b.jsonl", labelA: "flows", labelB: "plain", model: "anthropic/claude-haiku-4-5", out: "duel.json", concurrency: 4, judgeBin: "scripts/thulr-judge-pi.sh", json: true };
+	const fullArgv = ["duel", "--json", "--label-a", "flows", "--label-b", "plain", "--model", "anthropic/claude-haiku-4-5", "--out", "duel.json", "--concurrency", "4", "--judge-bin", "scripts/thulr-judge-pi.sh", "a.jsonl", "b.jsonl"];
+	assert.deepEqual(duelArgs(everyOption), fullArgv);
+	// With only the two traces supplied, nothing but the positionals follows the subcommand.
+	assert.deepEqual(duelArgs({ traceA: "a.jsonl", traceB: "b.jsonl" }), ["duel", "a.jsonl", "b.jsonl"]);
+});
+
+// `thulr pareto` (0.1.3) ranks failure modes across stored traces — free, no judge calls.
+test("paretoArgs builds the failure-mode ranking argv", () => {
+	const rankingOptions = { traces: "evals/thulr-trace.jsonl", by: "config-version", limit: 10, json: true };
+	const rankingArgv = ["pareto", "--json", "--traces", "evals/thulr-trace.jsonl", "--by", "config-version", "--limit", "10"];
+	assert.deepEqual(paretoArgs(rankingOptions), rankingArgv);
+	// Called with no argument at all, the builder yields just the bare subcommand.
+	assert.deepEqual(paretoArgs(), ["pareto"]);
+});
+
+// `thulr review` (0.1.3) records one human SME verdict per invocation, or lists review state with --list.
+test("reviewArgs records one verdict and lists state", () => {
+	const verdictOptions = { trace: "t.jsonl", caseId: "route-x", verdict: "fail", failureMode: "tool.error", note: "missed it", reviewer: "justin" };
+	const verdictArgv = ["review", "--trace", "t.jsonl", "--case", "route-x", "--verdict", "fail", "--failure-mode", "tool.error", "--note", "missed it", "--reviewer", "justin"];
+	assert.deepEqual(reviewArgs(verdictOptions), verdictArgv);
+	// Listing mode: --json comes first, then the trace, then --list.
+	assert.deepEqual(reviewArgs({ trace: "t.jsonl", list: true, json: true }), ["review", "--json", "--trace", "t.jsonl", "--list"]);
+});
+
+// The duel report (thulr.duel_report.v1): arm A is flows, arm B is plain. Line one
+// carries win counts, ties, flips (position bias), and the decided total; line two
+// the win rate over swap-consistent cases and the A-margin; line three any skips.
+test("formatDuelSummary leads with win counts, win rate, margin, flips, and skips", () => {
+	const summary = { a_wins: 3, b_wins: 1, ties: 1, flips: 1, decided: 5, win_rate_a: 0.6, win_rate_b: 0.2, margin_a: 0.4 };
+	const report = { a: { label: "flows" }, b: { label: "plain" }, summary, skipped: [["lonely-case", "only_in_a"]] };
+	const expected = [
+		// Counts line: wins per arm, ties, flips, and the decided total.
+		"flows wins 3 · plain wins 1 · ties 1 · flips 1  (decided 5)",
+		// Rates line: each arm's win rate as a percentage, then arm A's signed margin.
+		"win rate: flows 60.0% · plain 20.0%  ·  margin flows +0.400",
+		// Skips line: how many were skipped, then the case id with its reason.
+		"skipped 1: lonely-case (only_in_a)",
+	];
+	assert.deepEqual(formatDuelSummary(report), expected);
+});
+
+test("formatDuelSummary returns nothing without a summary", () => {
+	// A null report and a report object lacking `summary` must both format to zero lines.
+	[null, {}].forEach((report) => assert.deepEqual(formatDuelSummary(report), []));
+});
+
+// thulr 0.1.3 multi-dimension judging: namedCriteria ride the graded span as
+// thulr.criteria.<dimension>, judged into their own dimensions alongside criterion.
+// Empty values are dropped; the root (context) span does not carry them.
+test("traceSpansForCase emits named criteria on the graded span only", () => {
+	const namedCriteria = { evidence_quality: "cites the specific code", impact_explanation: "states the production impact", blank: "" };
+	const spans = traceSpansForCase({ name: "c", answer: "a", criterion: "primary", endMs: 1, namedCriteria });
+	// The first span is the root; the graded span is the one carrying an output value.
+	const rootSpan = spans[0];
+	const gradedSpan = spans.find((span) => span.attributes["output.value"] !== undefined);
+	const gradedAttrs = gradedSpan.attributes;
+
+	// Non-empty dimensions appear on the graded span under thulr.criteria.<dimension>.
+	assert.equal(gradedAttrs["thulr.criteria.evidence_quality"], "cites the specific code");
+	assert.equal(gradedAttrs["thulr.criteria.impact_explanation"], "states the production impact");
+	// Absence checks: the empty dimension is dropped, and the root span carries none.
+	assert.equal("thulr.criteria.blank" in gradedAttrs, false, "empty dimension values are skipped");
+	assert.equal("thulr.criteria.evidence_quality" in rootSpan.attributes, false, "named criteria belong to the graded span");
+});
+
+test("traceSpansForCase omits the named-criteria attributes when none are given", () => {
+	const { attributes } = traceSpansForCase({ name: "c", answer: "a", criterion: "x", endMs: 1 }).find((span) => span.attributes["output.value"] !== undefined);
+	assert.equal(Object.keys(attributes).some((key) => key.startsWith("thulr.criteria.")), false);
+});