diff --git a/.gitignore b/.gitignore
index 8c6cf07..5735539 100644
--- a/.gitignore
+++ b/.gitignore
@@ -17,6 +17,14 @@ tests/eval-results.json
 # covered by *.xlsx above; this also excludes the parsed engine output dirs.
 engines/
 
+# Benchmark run detail (per-sheet/per-cell results derived from the real models
+# — may contain real values/labels). Only the aggregate-only benchmarks/BASELINE.md
+# is committed; raw per-run detail stays local.
+benchmarks/results/
+
+# per-sheet-eval scratch dir (transient child-process scripts + per-sheet GT)
+_eval_tmp/
+
 # Transient test artifacts (scenario save/load test writes here on every run)
 tests/cli/fixtures/scenarios/
 
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0bffafc..8d3b13b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,11 +1,168 @@
 # excel-to-engine — Changelog
 
+## 2026-05-28 — Privacy scrub: genericize the real model name + figures
+
+This repo is public; CLAUDE.md forbids committing real financials or participant
+names. Two cleanups before merging the next-wave PR:
+
+- **Removed the real return figures** (gross/net MOIC & IRR, the UW-comparison
+  multiple, the MIP dollar amount) from the committed docs. The findings stay
+  (golden-master match on Version Tracker row 22; refiner UW-Comparison mis-map;
+  MIP is a hand-port calibration) — only the numbers are gone. Canonical values
+  live in the gitignored artifacts + local notes and feed the golden-master test
+  from there.
+- **Genericized the real model name** out of all committed files: renamed
+  `benchmarks/outpost-bench.mjs` → `benchmarks/bench.mjs` (npm script `bench`),
+  and the benchmark now **anonymizes model identity** in printed + committed
+  output (Model A, Model B, …) — real dir names stay only in the gitignored
+  detail JSON. Prose in HANDOFF/ROADMAP/PLAN/CHANGELOG now says "the real PE
+  models" / "Model A/B". (The `test-e2e4-fixes` scrub-guard that asserts template
+  names are generic is intentionally kept.)
+
+## 2026-05-28 — Mippy calibration-oracle feature set (priority amendment)
+
+Refined the "fully ready for Mippy" target: the e2e agent's job is to make the
+full model a **reliable calibration oracle** — runnable, MIP coefficients exposed
+as named-outputs, no stubbed value cells. Documented the priority order in
+ROADMAP ("Now — Mippy calibration oracle") and HANDOFF.md, and in the
+`project_mippy_contract` memory:
+
+- **P1 · #23 + #24** — reliably emit a runnable `engine.js` (fix dep-graph OOM;
+  fail loud, never a partial artifact; lock layout + content hash).
+- **P2 · #25** — pin value-bearing cells (per-class MIP Proceeds, hurdle,
+  participation %, equity basis, valuation/shares) as named-outputs.
+- **P2 · #26** — emit `_fn-fallbacks.json`; assert no value cell uses an
+  unsupported-function stub.
+- **P3 · #22** — output-cone scoping (nice-to-have).
+
+Supporting/trustworthiness (off critical path): golden-master CI, refiner
+UW-Comparison fix, deeper `_fn` coverage, cluster-once eval.
+
+## 2026-05-28 — HANDOFF.md (fresh-agent entry point)
+
+Added `HANDOFF.md` — the prioritized next-session plan (P0 cluster-once eval →
+generation robustness #23 → `_fn()` transpiler coverage → refiner UW-Comparison
+fix → golden-master CI → output-profile/large-sheet/perf → Polish), with current
+state, run commands, and the gotchas (gitignored real models, the GT-copy
+`_computed-values.json`, the per-sheet-eval Windows fix, the bench
+`discoverModels` gate vs the `-v2` regen). PLAN points to it.
+
+## 2026-05-28 — Roadmap: PE-model regeneration findings (Mippy consumer)
+
+The downstream Mippy agent regenerated both PE engines from `main` and
+reported back. Captured the findings in ROADMAP.md ("Now — PE-model regeneration
+findings"). Confirmed wins vs the old build: **dates fixed** (old leaked
+`ExcelDateTime { … }` debug strings — 2,686 in A-1; new emits serial numbers, 0
+leaks), **~42–45% smaller** (model-map.json + the GT-copy `_computed-values.json`
+gone), contract maps emitted, circular refs converge, and a **golden-master PASS**
+— the regenerated ground truth reproduces the hand-port's canonical A-1 returns
+to full float precision (Version Tracker row 22). New follow-ups: generation
+robustness on big models (dep-graph OOM + `init` 10-min timeout — issue #23),
+`--output-profile` to scope artifacts (#22), the **11,813 `_fn()` unsupported-
+function fallbacks** per engine (transpiler-coverage accuracy suspect), the
+refiner mis-mapping returns to a "UW Comparison" tab, empty `named-inputs.json`
+when no formula-referenced defined-names exist, and MIP-as-output (#7). A
+ready-made golden-master CI assert (diff committed `named-outputs.baseCaseValue`)
+is noted.
+
+## 2026-05-28 — Circular-cluster eval: scoped convergence diff + first cluster test
+
+Progress on the circular-cluster accuracy blocker (the 17-of-21-sheet cluster on
+the real models that wouldn't evaluate).
+
+- **Scoped convergence diff.** The cluster convergence loop in
+  `per-sheet-eval.mjs` checked for a fixed point by diffing **every** cell in the
+  context each iteration — and the context is seeded with the full (multi-million-
+  cell) ground truth, so that was O(all cells) × up to 200 iterations. It now
+  tracks the cells `compute()` actually writes (`ctx._written`) and diffs only
+  those (the cluster's own outputs). Behavior-preserving; large constant-factor
+  win on big clusters.
+- **First circular-cluster test + fixture.** `tests/cli/fixtures/cluster-model/`
+  is a synthetic 2-sheet circular model (SheetA ↔ SheetB, converges to
+  a=50,b=50,c=100,d=100). `tests/cli/test-per-sheet-eval.mjs` now evaluates it
+  through the convergence loop and asserts 100% — the cluster path had no
+  coverage before, and this guards the scoped-diff change.
+
+**Still the key fix (cluster-once):** measured on the real model, scoped-diff
+alone is *not* enough — `per-sheet-eval` re-runs the entire cluster convergence
+**once per member sheet** (17×), and engine inaccuracies keep some clusters from
+converging (200 iters). The remaining work is single-pass orchestrator eval:
+converge the cluster once, then score every member from that converged state
+(one task per cluster, not per sheet). The fixture above is the ready-made test
+oracle. Until then the benchmark runs with `--skip-clusters`.
+
+## 2026-05-28 — Unit tests for lib/ (Polish→Publish)
+
+The shared financial libraries had no direct coverage. Added
+`tests/lib/test-lib.mjs` (43 known-answer assertions), wired into `npm test`
+(runs first) so CI guards them on every push:
+
+- **`lib/irr.mjs`** — NPV/NPV-derivative identities; IRR of classic cash-flow
+  series (−100→+150 = 50%, −1000 then 200×8 ≈ 11.81%, 3-year bullet); Newton ≡
+  bisection agreement; NPV(IRR) ≈ 0; null on no-sign-change; XIRR on dated flows.
+- **`lib/waterfall.mjs`** — American 80/20 + 8% pref + catch-up (LP/GP splits,
+  carry %), no-catch-up variant, loss case (no carry), the flat-MOIC-hurdle
+  promote (incl. the hold-period-independence invariant), European builder; the
+  LP+GP = distributed conservation invariant across structures.
+- **`lib/calibration.mjs`** — nested get/set; `validateOutputs` pass/fail +
+  suggested corrective factor.
+- **`lib/sensitivity.mjs`** — `flattenOutputs` group/type filtering.
+
+## 2026-05-28 — PE-model accuracy benchmark + eval-tooling fixes
+
+Stood up a repeatable accuracy + efficacy benchmark over the real ~200 MB
+PE models so improvements can be tracked over time, and fixed the eval
+tooling that was silently broken on them.
+
+### Benchmark (`benchmarks/bench.mjs`, `npm run bench`)
+
+- Wraps `eval/per-sheet-eval.mjs` (live engine-vs-ground-truth) for every model
+  under a root dir; reports overall accuracy, per-sheet pass/skip counts, and
+  timings. **Aggregate-only** results go to the committed `benchmarks/BASELINE.md`;
+  full per-sheet detail stays in the gitignored `benchmarks/results/`. No cell
+  value or label is ever committed.
+- **Baseline (2026-05-28):** Model A **84.3%**, Model B **85.5%** on the
+  standalone sheets. (The 17-sheet circular cluster and the 190 MB PP&E sheet are
+  skipped for now — see below.)
+
+### per-sheet-eval fixes (it wasn't in CI, so these went unnoticed)
+
+- **Windows crash fixed.** The generated per-sheet wrapper imported each sheet's
+  `compute()` by a bare absolute path (`"C:\\..."`), which Node ESM rejects on
+  Windows — so *every* sheet "crashed" at load (0% accuracy) on Windows and on
+  the real engines. Now uses `pathToFileURL()`. New `tests/cli/test-per-sheet-eval.mjs`
+  (6) guards it; CI runs it on **windows-latest** too.
+- **`--skip-clusters`** flag: record circular-cluster sheets as skipped instead
+  of evaluating them. The current convergence path re-runs the *whole* cluster
+  once per member sheet (O(cluster²)), which is infeasible on big models; this
+  yields a fast, real number for the standalone sheets while the single-pass
+  orchestrator eval is built (ROADMAP).
+
+### searchByLabel: lazy numerics (query / carry)
+
+`searchByLabel` previously scanned the entire ground truth once per matched row
+to collect adjacent numerics. It now probes the row's columns on demand (same
+approach as the refiner), with a directed `caseColumn` lookup probing its exact
+cell so a far scenario column is never missed. Behavior-preserving (query/carry/
+ai-interface suites green).
+
+### Findings that scope the accuracy-blocker work
+
+- The 190 MB PP&E sheet exceeds the 150 MB per-sheet limit → **large-sheet eval**
+  blocker confirmed.
+- The circular cluster is **17 of 21 sheets** and is evaluated redundantly
+  (once per member) → the concrete reason behind "circular-cluster won't
+  evaluate." Single-pass orchestrator eval is the fix.
+- `_computed-values.json` in these engines is **byte-identical to ground truth**
+  (a seeded copy), so it is not a valid accuracy source — accuracy must come from
+  live recompute.
+
 ## 2026-05-28 — `init` parses the ground truth once (shared across the pipeline)
 
 The real driver behind the "~2.5 min" refine loop wasn't one command — it was
 that `ete init` runs **generate → refine → doctor → maps** in sequence and
 **each independently re-read and re-parsed the full ground truth** from disk. On
-the real ~200 MB Outpost models that's four parses of a 200 MB+ file at ~3.6 s
+the real ~200 MB PE models that's four parses of a 200 MB+ file at ~3.6 s
 each, plus each command's own O(N) scan.
 
 ### What changed
@@ -23,7 +180,7 @@ each, plus each command's own O(N) scan.
 
 ### Why not the row-values artifact (Tier B)
 
-Measured on both real ~200 MB Outpost models: they're **dense-label** (≈90% of
+Measured on both real ~200 MB PE models: they're **dense-label** (≈90% of
 rows labeled, ≈93% of numerics on labeled rows), not the giant-grid case Tier B's
 big win assumed. A general row-values artifact would be ≈30% of GT (≈60% of the
 post-#17 compact GT) — only ~1.6× on refine while inflating output ~60%, fighting
diff --git a/HANDOFF.md b/HANDOFF.md
new file mode 100644
index 0000000..d1a1703
--- /dev/null
+++ b/HANDOFF.md
@@ -0,0 +1,131 @@
+# HANDOFF — excel-to-engine next session
+
+Start-here doc for a fresh agent. Read this, then `ROADMAP.md` (full backlog),
+`PLAN.md` (status), `benchmarks/BASELINE.md` (accuracy numbers), and your two
+project memory files (the Mippy contract + the real-model shape/baseline notes,
+auto-loaded from your memory index).
+
+_Last updated: 2026-05-28._
+
+## The job, in one line
+
+**Make the full PE model a reliable Mippy calibration oracle: runnable,
+with the MIP coefficients exposed as named-outputs, and no stubbed value cells.**
+Everything Mippy-specific stays in Mippy — this repo just produces a trustworthy,
+sample-able engine + contract.
+
+## Where things stand
+
+**Merged to `main` this session:** artifact slimming (#17), GitHub Actions CI
+(#18, ubuntu+windows), `refine` consumes `_labels.json` + lazy numerics (#19),
+single-GT-parse per `init` (#20).
+
+**Open PR — review/merge first:** **#21 `feat/next-wave`** (CI green). Contains
+the PE-model accuracy **benchmark + baseline**, a **per-sheet-eval Windows crash
+fix**, `searchByLabel` lazy numerics, **lib/ unit tests** (43), the
+**scoped cluster-convergence diff** + the first circular-cluster fixture/test,
+and the Mippy regeneration findings in ROADMAP. **If #21 isn't merged yet, branch
+off `feat/next-wave`; otherwise off `main`.**
+
+**Baseline (real models, `npm run bench`):** Model A **84.3%**,
+Model B **85.5%** — standalone sheets only (cluster + 190 MB PP&E skipped).
+
+## How to run
+
+```bash
+npm test                 # full JS suite (387 assertions)
+npm run smoke            # chunked-engine accuracy 78/78
+npm run bench -- --root "<abs path>/engines"   # accuracy + efficacy on the real models
+node eval/per-sheet-eval.mjs <chunkedDir> --concurrency 3 [--skip-clusters]
+cd pipelines/rust && cargo build --release   # the parser
+```
+
+The real PE models live in the **gitignored** `engines/` dir (proprietary —
+never commit values/labels). The Mippy agent's fresh regen is in
+the regenerated `-v2` engine dirs (the *better* build: dates fixed, slimmed) alongside
+the old model dirs under `engines/`.
+
+## P1–P3 — Mippy calibration-oracle feature set (do in this order)
+
+All filed on ebootheee/excel-to-engine. Done-criteria are the contract.
+
+### P1 · #23 + #24 — reliably emit a runnable `engine.js` ★ blocks everything
+A clean `ete init` on a real model currently **does not finish**: the Rust parser
+is OOM-killed at the cell-level dependency-graph step, and `ete init` hits its
+10-min `spawnSync` cap → `engine.js` (the `run()` orchestrator) + the
+`dependency-graph.json` closures **don't land** (written after the OOM step).
+- **Done =** `chunked/engine.js` with `export function run()` exists on **every**
+  build; the build **errors hard** if it can't — **never a partial artifact**.
+- #24 also: **lock the artifact layout + emit a content hash** so downstream
+  consumes without per-version reconciliation.
+- Without a runnable engine we can't sample MIP to calibrate/validate — this
+  gates everything below.
+- Files: `pipelines/rust/` (dep-graph build: stream/incrementalize or raise
+  headroom; fail-loud), `cli/commands/init.mjs` (configurable timeout; don't
+  swallow a failed emit).
+
+### P2 · #25 — pin the value-bearing cells as named-outputs
+Per-class **MIP Proceeds**, **hurdle/threshold**, **participation %**, **equity
+basis**, **valuation / shares** — not just MOIC/IRR.
+- **Done =** those appear in `named-outputs.json` with base-case values. **These
+  ARE the parametric coefficients Mippy calibrates against.**
+- Files: `lib/manifest-maps.mjs` (`enumerateOutputCells` — extend beyond
+  MOIC/IRR/TV/carry; `customCells` is the current escape hatch),
+  `cli/commands/manifest*.mjs`. Pin per-model (the auto-manifest mis-maps —
+  see the refiner fix under "supporting").
+
+### P2 · #26 — `_fn` fallback audit: emit `_fn-fallbacks.json` (correctness gate)
+- **Done =** we can **assert no MIP / value / return cell resolves through an
+  unsupported-function stub.** (Auditing/gating the value cells — distinct from
+  fixing all 11,813 fallbacks, which is the deeper transpiler work below.)
+- Files: `pipelines/rust/` (emit the audit during transpile) + a check that the
+  P2/#25 named-output cells aren't in it.
+
+### P3 (nice-to-have) · #22 — output-cone scoping
+Scope generated artifacts to the consumer's need (skip the ~752 MB per-sheet
+emit). Makes the oracle cheaper to run; **not required** — we don't ship the blob.
+
+## Supporting work — makes the oracle *trustworthy* (after P1, alongside P2/P3)
+
+These aren't on Mippy's critical path but back the "reliable" in "reliable
+calibration oracle":
+- **Golden-master CI assert** — A-1's regenerated GT matches the hand-port's
+  canonical gross/net MOIC & IRR (Version Tracker row 22) to full float
+  precision. Add a CI test diffing those `named-outputs.baseCaseValue`s. The
+  canonical figures live in the gitignored `named-outputs.json` + project memory
+  — **do NOT commit the figures to this public repo.** Pairs with #25/#26.
+- **Refiner mis-maps returns to a "UW Comparison" tab** instead of the canonical
+  Version Tracker returns — `SUMMARY_SHEET_PATTERN` over-ranks it. Fix so #25's
+  value cells pin to canonical/Version-Tracker tabs without manual per-model
+  pinning. Add a manifest invariant. File: `cli/commands/manifest-refine.mjs`.
+- **Deeper transpiler coverage** — the 11,813 `_fn()` offenders behind #26's
+  audit; inventory by frequency, implement top ones. `pipelines/rust/src/`.
+- **Cluster-once eval** (our accuracy harness, not Mippy's path): the 17-sheet
+  cluster is unmeasured because `per-sheet-eval` re-runs the whole convergence
+  once per member (17×). Make it one task per cluster (converge once, score all),
+  then drop `--skip-clusters` and re-baseline. Lets us *verify* the oracle's
+  cluster math. Fixture oracle ready: `tests/cli/fixtures/cluster-model/`. (The
+  shipped `engine.js` `run()` converges clusters itself — this is measurement.)
+- **Large-sheet eval** (190 MB PP&E > 150 MB limit) and **manifest-pipeline
+  perf** (generate detectors / maps cell-types / refine fallback on ~6M cells).
+
+## Polish → Publish
+lib/ unit tests done. Remaining: npm publish prep (`bin`, `files`, metadata),
+synthetic example project, contributing guide. Lower: empty `named-inputs.json`
+fallback (no formula-referenced defined-names in the PE workbooks);
+MIP-as-output beyond the pinned cells is a model-owner question.
+
+## Gotchas (will bite you)
+
+- **`engines/` is gitignored** (real financials). Read-only; aggregate metrics
+  only. `_eval_tmp/` + `benchmarks/results/` are gitignored too.
+- **`_computed-values.json` in these engines is a byte-identical COPY of ground
+  truth** (seeded). NOT a valid accuracy source — use live recompute.
+- **per-sheet-eval was Windows-broken** (bare absolute ESM import → `pathToFileURL`
+  fix; guarded by `tests/cli/test-per-sheet-eval.mjs` on windows CI). Don't
+  reintroduce bare absolute `import` paths.
+- **`benchmarks/bench.mjs` `discoverModels()` gates on `engine.js`** — but
+  the `-v2` regen dirs may LACK it (the #23 OOM) while having `_graph.json` +
+  `sheets/`. If the bench skips `-v2`, relax the gate. (Fixing #23 makes this moot.)
+- **CI runs ubuntu + windows** — child-process/path/parser code must work on both.
+- After any change, update CHANGELOG/PLAN/ROADMAP per CLAUDE.md.
diff --git a/PLAN.md b/PLAN.md
index 8d36e2a..0a28e8a 100644
--- a/PLAN.md
+++ b/PLAN.md
@@ -1,11 +1,39 @@
 # excel-to-engine — Plan
 
+> **Next session: start at [`HANDOFF.md`](HANDOFF.md)** — prioritized backlog
+> (P1 runnable `engine.js` #23/#24 → P2 named-outputs #25 + `_fn` audit #26 →
+> P3 output-cone scoping #22 → supporting work), current state, run commands, and gotchas.
+
+## Status: PE-model accuracy benchmark + eval fixes — in progress 2026-05-28
+
+Standing up the multi-wave "next wave" effort on `feat/next-wave`, keystone
+first: a repeatable accuracy + efficacy benchmark over the real PE models
+(`benchmarks/bench.mjs` → `benchmarks/BASELINE.md`, aggregate-only).
+
+**Baseline:** Model A 84.3%, Model B 85.5% on standalone sheets; the
+17-sheet circular cluster and the 190 MB PP&E sheet are skipped pending deeper
+fixes. Landed alongside: a **Windows crash fix** in `per-sheet-eval` (bare
+absolute ESM import → `pathToFileURL`; it had silently zeroed accuracy on
+Windows/real engines and wasn't in CI — now guarded by `test-per-sheet-eval`),
+a `--skip-clusters` flag, and the **searchByLabel lazy-numerics** wave
+(query/carry stop scanning the full GT for adjacent values).
+
+**Wave status (this branch):**
+- ✅ Keystone benchmark + baseline; ✅ searchByLabel (query/carry).
+- 🔜 Accuracy blockers — now precisely diagnosed: single-pass orchestrator eval
+  for the 17-sheet cluster (it's re-run once per member today), large-sheet eval
+  (190 MB PP&E > 150 MB limit), array formulas (the Headcount sheet lives inside
+  the cluster). `_computed-values.json` is a GT copy — not an accuracy source.
+- 🔜 Manifest-pipeline perf (generate detectors / maps cell-types on ~6M cells).
+- 🔜 Polish→Publish (lib/ unit tests, npm publish prep, example project,
+  contributing guide).
+
 ## Status: single GT parse per init — landed 2026-05-28
 
 `ete init` now loads the ground truth once and shares the parsed object across
 the whole manifest pipeline — generate → refine → doctor → maps — instead of
 each step re-reading and re-parsing the full ground truth from disk (up to four
-parses of a 200 MB+ file; ~3.6 s per parse on the real Outpost models). This was
+parses of a 200 MB+ file; ~3.6 s per parse on the real PE models). This was
 the dominant cost of init on large models and the real driver behind the
 "~2.5 min" refine loop. The GT is read-only in all four consumers, so a single
 shared object is safe; each command falls back to loading the GT itself when no
@@ -291,7 +319,8 @@ excel-to-engine/
 - [ ] Wide sheet column disambiguation for blind eval
 
 ## Next Phase — Polish + Publish
-- [ ] Unit tests for all lib/ modules
+- [x] Unit tests for all lib/ modules — `tests/lib/test-lib.mjs` (43: irr,
+      waterfall, calibration, sensitivity), in `npm test`/CI (2026-05-28)
 - [x] GitHub Actions CI — `.github/workflows/ci.yml` (ubuntu + windows; Rust
       build/tests + JS suite + smoke/depgraph/engine/slimming), landed 2026-05-28
 - [ ] npm publish preparation
diff --git a/ROADMAP.md b/ROADMAP.md
index c6786bb..b87c220 100644
--- a/ROADMAP.md
+++ b/ROADMAP.md
@@ -42,6 +42,91 @@ RPC service. Round 1 split into a low-risk JS half (done) and a Rust half
   (engine faithfully reproduces an Excel base case of 0). Surface via a
   `requiredFor` field if/when named-inputs gains one.
 
+## Now — Mippy calibration oracle (e2e agent's job, priority feature set)
+
+The refined "fully ready for Mippy" target: make the full model a **reliable
+calibration oracle** — runnable, with the MIP coefficients exposed as
+named-outputs, and no stubbed value cells. Everything Mippy-specific stays in
+Mippy. Order (issues on ebootheee/excel-to-engine; the Done line is the contract):
+
+- **P1 · [#23] + [#24] — reliably emit a runnable `engine.js`.** Fix the
+  dep-graph OOM; **fail the build loud, never emit a partial artifact.**
+  **Done =** `chunked/engine.js` with `export function run()` exists on every
+  build; build errors hard if it can't. #24 also locks the artifact layout +
+  emits a content hash (consume downstream without per-version reconciliation).
+  Gates everything: without a runnable engine we can't sample MIP to calibrate.
+- **P2 · [#25] — pin the value-bearing cells as named-outputs.** Per-class MIP
+  Proceeds, hurdle/threshold, participation %, equity basis, valuation/shares —
+  not just MOIC/IRR. **Done =** they appear in `named-outputs.json` with
+  base-case values. These ARE the parametric coefficients.
+- **P2 · [#26] — `_fn` fallback audit (`_fn-fallbacks.json`).** **Done =** assert
+  no MIP/value/return cell resolves through an unsupported-function stub.
+- **P3 (nice-to-have) · [#22] — output-cone scoping.** Cheaper oracle; not
+  required (we don't ship the blob).
+
+Supporting (makes the oracle trustworthy, not on the critical path): golden-master
+CI assert (A-1 canonical returns), the refiner UW-Comparison fix (so #25's cells
+pin to canonical tabs), deeper transpiler coverage (the 11,813 `_fn` offenders
+behind #26), cluster-once eval (our accuracy harness), large-sheet eval, pipeline
+perf. See `HANDOFF.md` for the full ordering + Done criteria.
+
+[#24]: https://github.com/ebootheee/excel-to-engine/issues/24
+[#25]: https://github.com/ebootheee/excel-to-engine/issues/25
+[#26]: https://github.com/ebootheee/excel-to-engine/issues/26
+
+## Now — PE-model regeneration findings (Mippy consumer, 2026-05-28)
+
+The downstream Mippy agent regenerated both PE engines from `main` (current
+excel-to-engine) → the regenerated `-v2` engine dirs, old build left alongside.
+Confirmed the new build is clearly better and surfaced concrete follow-ups.
+Issues filed: [#22] (output scoping) and [#23] (parser/emitter perf).
+
+**Confirmed better than the old (pre-our-work) build:**
+- **Dates fixed.** Old leaked Rust debug strings (`ExcelDateTime { value: 45960.0,
+  … }` — 2,686 in A-1, breaking date math); new emits serial numbers, 0 leaks.
+- **~42–45% smaller** (~1.9–2.0 GB → ~1.1 GB): `model-map.json` (606 MB) +
+  `_computed-values.json` (192 MB) gone — the #8 slimming + dropping the GT-copy.
+- Semantic manifest + ADR-017 contract maps emitted; circular refs now run
+  per-cluster fixed-point loops.
+- **Golden master PASS.** Regenerated `_ground-truth.json` reproduces the
+  hand-port's canonical A-1 gross/net MOIC & IRR (Version Tracker row 22) to full
+  float precision. Pinning A-1's manifest to those cells makes
+  `named-outputs.baseCaseValue` a ready CI golden-master assert. **Do this:** add
+  a golden-master test diffing those baseCaseValues. (Canonical figures stay in
+  the gitignored artifacts + project memory — not committed to this public repo.)
+
+**Open follow-ups:**
+- **Generation robustness on big models ([#23]) — blocks a clean full build.**
+  Plain `ete init` hit its 10-min `spawnSync` cap, and the Rust parser was
+  OOM-killed at the cell-level dependency-graph step → `engine.js` (the `run()`
+  orchestrator) and `dependency-graph.json` closures **didn't land** (written
+  after the OOM step); regen needed direct-parse then `--reuse-parse`. Needs:
+  stream/incrementalize the dep-graph build (or raise its memory headroom),
+  within-sheet parallelism, streaming writes, and a higher/configurable init
+  timeout.
+- **`--output-profile` / guided `ete create` ([#22]).** Skip the ~752 MB
+  per-sheet engine emit when a consumer only needs ground truth + contract maps.
+- **Transpiler coverage — 11,813 `_fn()` fallbacks (unchanged old→new).** That
+  many formula cells still transpile to a generic unsupported-function stub — a
+  prime accuracy suspect once cluster eval makes per-sheet accuracy measurable.
+  Inventory the missing Excel functions and prioritize by frequency. (See
+  Transpiler Coverage below.)
+- **Refiner mis-maps returns to the "UW Comparison" tab.** Auto-manifest picked an
+  underwriting-comparison cell over the canonical Version Tracker returns —
+  `SUMMARY_SHEET_PATTERN` over-ranks "UW Comparison". The refiner should recognize
+  canonical returns / Version-Tracker tabs (or de-prioritize
+  underwriting-comparison tabs) so returns don't need manual per-model pinning.
+- **`named-inputs.json` empty** when a workbook exposes no formula-referenced
+  defined-names (this case) — ADR-019 ranged inputs can't be auto-derived;
+  needs a heuristic fallback or a documented manual-input path.
+- **MIP isn't a generated output (request #7).** The MIP figure is a hand-port
+  calibration, not a single GT cell — MIP is modeled across per-block "MIP
+  Proceeds" cells. Surface via a `requiredFor`/aggregate mapping, not a
+  single-cell expectation. (See the Round 2 MIP-gating note above.)
+
+[#22]: https://github.com/ebootheee/excel-to-engine/issues/22
+[#23]: https://github.com/ebootheee/excel-to-engine/issues/23
+
 ## Now — Security Hardening Follow-ups (post-PR #13)
 
 Non-blocking items surfaced during the v0.2.0 security audit pass. Open
@@ -88,7 +173,7 @@ when we next touch the monitor server or auth surface.
     generate → refine → doctor → maps (each previously re-parsed the full
     200 MB+ GT). The GT is read-only in all of them. `test-init-shared-gt`.
   - **Tier B (row-values artifact) — measured and deprioritized.** Gauged on
-    the two real ~200 MB Outpost models: both are **dense-label** (≈90% of rows
+    the two real ~200 MB PE models: both are **dense-label** (≈90% of rows
     labeled, ≈93% of numerics on labeled rows), *not* the giant-grid case the
     100× idea assumed. A general row-values artifact is ≈30% of GT (≈60% of the
     post-#17 compact GT) → only ~1.6× on refine while inflating output ~60%,
@@ -96,8 +181,10 @@ when we next touch the monitor server or auth surface.
     ~70 KB) but extracting it cheaply would couple the parser to refine's metric
     vocabulary. Not worth it on these models; revisit only if a genuinely
     giant-grid model (mostly unlabeled numeric grids) shows up.
-  - **Still open:** apply the same lazy-numerics path to `searchByLabel`
-    (`query` / `carry`) so they stop scanning the GT for adjacent values.
+  - **Done (2026-05-28):** applied the same lazy-numerics path to `searchByLabel`
+    (`query` / `carry`) — probes the matched row's columns instead of scanning
+    the whole GT, with a directed `caseColumn` probe so a far scenario column is
+    never missed.
 - Manifest migration tooling for model updates (vN → vN+1 shape diff).
 
 ---
@@ -135,6 +222,11 @@ when we next touch the monitor server or auth surface.
 ## Ongoing — Accuracy Improvement + Production Learnings
 
 ### Transpiler Coverage
+- **Measured (Mippy regen, 2026-05-28): 11,813 `_fn()` unsupported-function
+  fallbacks per PE-model engine** — that many formula cells transpile to a generic
+  stub instead of real logic, a prime accuracy suspect. First step: inventory
+  which Excel functions hit the fallback and rank by frequency, then implement
+  the top offenders. (Was unchanged old→new, so it predates our work.)
 - Implement INDIRECT function (dynamic cell references)
 - Fix 2D range handling edge cases for very large sheets
 - Handle array formulas / CSE (Ctrl+Shift+Enter) patterns
@@ -147,22 +239,45 @@ when we next touch the monitor server or auth surface.
 - **Pref compounding for long holds** — 12-year 8% compound pref = 2.52x hurdle, which exceeds many MOIC targets. Need to detect when models use quarterly cash flow waterfalls vs bullet maturity and adjust accordingly.
 
 ### Eval System
-- Increase blind eval question diversity (computed questions, cross-sheet aggregations)
-- Add time-period-aware questions ("What was X in Q3 2025?")
-- Profile and optimize per-sheet eval for sheets >150MB
+- **Done (2026-05-28):** repeatable accuracy + efficacy benchmark over the real
+  PE models — `benchmarks/bench.mjs` → `benchmarks/BASELINE.md`
+  (aggregate-only). Baseline: Model A 84.3%, Model B 85.5% on standalone sheets. Also
+  **fixed a Windows crash** in `per-sheet-eval` (bare absolute ESM import →
+  `pathToFileURL`; it had zeroed accuracy on Windows/real engines and wasn't in
+  CI — now guarded by `test-per-sheet-eval`, run on windows-latest).
+- **Large-sheet eval (190 MB PP&E):** confirmed it exceeds the 150 MB per-sheet
+  limit and is skipped. Needs streaming/sharded per-sheet eval or a higher limit
+  with chunked compute. The standalone sheets at ~85% also need attention (array
+  formulas / wide-sheet disambiguation) — visible now that the eval runs.
+- Increase blind eval question diversity; add time-period-aware questions.
 
 ### Convergence Loop Accuracy
-- The 62-sheet circular cluster in the large model is the biggest accuracy blocker
-- Investigate running eval through the orchestrator (not per-sheet isolation) for circular sheets
-- Consider lazy subgraph evaluation (only compute transitive closure of target cells)
+- **Diagnosed (2026-05-28):** on the real models the circular cluster is **17 of
+  21 sheets**, and `per-sheet-eval` re-runs the *entire* cluster convergence once
+  per member sheet (O(cluster²)) — that's why clustered big models "won't
+  evaluate." The array-formula Headcount sheet lives inside this cluster, so it's
+  unmeasurable until this is fixed. `--skip-clusters` skips them for now.
+- **Done (2026-05-28):** scoped the convergence diff to written cells
+  (`ctx._written`) instead of all ~6M seeded cells per iteration. Added a
+  synthetic 2-sheet circular fixture (`tests/cli/fixtures/cluster-model/`) + the
+  first cluster test. Measured: scoped-diff alone is **not** enough — the 17×
+  per-member redundancy dominates.
+- **Remaining key fix (cluster-once):** single-pass orchestrator eval — converge
+  the cluster once, then score every member from that converged state (one task
+  per cluster, not per sheet); then drop `--skip-clusters` from the benchmark.
+  The cluster fixture is the ready test oracle.
+- Consider lazy subgraph evaluation (only compute transitive closure of targets).
 
 ## Near-Term
 
 ### Unit Test Suite
-- Tests for `lib/irr.mjs` with known IRR cases
-- Tests for `lib/waterfall.mjs` with standard structures
-- Tests for `lib/calibration.mjs` convergence and edge cases
-- Tests for `lib/excel-parser.mjs` fingerprinting with synthetic workbooks
+- **Done (2026-05-28):** `tests/lib/test-lib.mjs` (43) — `lib/irr.mjs` (known
+  IRR/NPV/XIRR cases), `lib/waterfall.mjs` (American/European/MOIC-hurdle +
+  conservation invariant), `lib/calibration.mjs` (nested get/set, validate),
+  `lib/sensitivity.mjs` (flattenOutputs). In `npm test` / CI.
+- Still open: `lib/calibration.mjs` convergence/edge cases (calibrate loop),
+  `lib/sensitivity.mjs` surface extraction + elasticity/breakpoints, and
+  `lib/excel-parser.mjs` fingerprinting with synthetic workbooks.
 
 ### CI Pipeline
 - **Done (2026-05-28):** `.github/workflows/ci.yml` — on push/PR to `main`,
diff --git a/benchmarks/BASELINE.md b/benchmarks/BASELINE.md
new file mode 100644
index 0000000..d51f825
--- /dev/null
+++ b/benchmarks/BASELINE.md
@@ -0,0 +1,24 @@
+# model benchmark — baseline & history
+
+Real accuracy: each standalone sheet recomputed live vs ground truth via
+`eval/per-sheet-eval.mjs` (numbers within 1% rel. tol, strings exact).
+Circular-cluster sheets and oversized sheets are **skipped** for now (see
+the Skipped column + blockers below) pending the single-pass orchestrator
+eval; run with `--with-clusters` once that lands. Aggregate-only — no cell
+values or full sheet inventory. Regenerate:
+`node benchmarks/bench.mjs --root <engines>`. Full per-sheet detail
+lands in the gitignored `benchmarks/results/`.
+
+_Last run: baseline-2026-05-28_
+
+| Model | Accuracy | Cells matched | Sheets ≥95% | Skipped | Eval time | GT |
+|-------|---------:|------:|:-----------:|:-------:|----------:|---:|
+| Model A | 84.33% | 1491/1768 | 1/3 | 17 | 41s | 201.5 MB |
+| Model B | 85.54% | 1686/1971 | 2/4 | 17 | 45s | 211 MB |
+
+## Known blocker categories
+
+Tracked by name because PLAN.md already calls them out; values are accuracy %, not financials.
+
+- **Model A**: 1/3 sheets clean; blockers: Owned Asset PP&E (skipped: module too large (190MB > 150MB limit)); Headcount (skipped: circular cluster (--skip-clusters; needs single-pass orchestrator eval))
+- **Model B**: 2/4 sheets clean; blockers: Owned Asset PP&E (skipped: module too large (190MB > 150MB limit)); Headcount (skipped: circular cluster (--skip-clusters; needs single-pass orchestrator eval))
diff --git a/benchmarks/bench.mjs b/benchmarks/bench.mjs
new file mode 100644
index 0000000..aab6304
--- /dev/null
+++ b/benchmarks/bench.mjs
@@ -0,0 +1,178 @@
+#!/usr/bin/env node
+/**
+ * model benchmark — repeatable accuracy + efficacy tracking over the real
+ * models, so we can see whether each improvement actually moves the needle.
+ *
+ * It wraps `eval/per-sheet-eval.mjs` (which runs each sheet module live against
+ * ground truth, converges circular clusters, and skips/handles oversized
+ * sheets) for every model under a root dir, then aggregates. Accuracy is real
+ * (engine recompute vs ground truth, 1% rel. tol / exact strings) — NOT the
+ * `_computed-values.json` snapshot, which is a copy of ground truth (trivially
+ * 100%), nor the full-engine `run()`, which is infeasible on these models (the
+ * 190 MB PP&E sheet).
+ *
+ * Privacy: the real models are proprietary (gitignored). Full per-sheet detail
+ * (incl. per-cell failures) stays in the gitignored `benchmarks/results/`. Only
+ * AGGREGATE, non-identifying metrics — overall accuracy, sheet pass/skip counts,
+ * timings, and the already-public blocker categories (PP&E, Headcount) — go to
+ * the committed `benchmarks/BASELINE.md`. No cell value or label is ever
+ * printed or committed.
+ *
+ * Usage:
+ *   node benchmarks/bench.mjs [--root <dir>] [--concurrency 3] [--stamp <label>] [--with-clusters]
+ *
+ * Run it after any change that could affect accuracy or pipeline speed, then
+ * diff benchmarks/BASELINE.md to see the delta.
+ */
+
+import { readFileSync, writeFileSync, existsSync, statSync, readdirSync, mkdirSync } from 'fs';
+import { join, resolve, dirname } from 'path';
+import { fileURLToPath } from 'url';
+import { execFile } from 'child_process';
+import { promisify } from 'util';
+
+const execFileAsync = promisify(execFile);
+const __dirname = dirname(fileURLToPath(import.meta.url));
+const REPO_ROOT = resolve(__dirname, '..');
+const EVAL_SCRIPT = join(REPO_ROOT, 'eval', 'per-sheet-eval.mjs');
+const RESULTS_DIR = join(__dirname, 'results');
+
+function flag(name, fallback) {
+  const i = process.argv.indexOf(`--${name}`);
+  return i >= 0 && process.argv[i + 1] ? process.argv[i + 1] : fallback;
+}
+
+const ROOT = resolve(flag('root', join(REPO_ROOT, 'engines')));
+const CONCURRENCY = flag('concurrency', '3');
+const STAMP = flag('stamp', new Date().toISOString().replace(/[:.]/g, '-'));
+
+function discoverModels(root) {
+  if (!existsSync(root)) return [];
+  const out = [];
+  for (const name of readdirSync(root)) {
+    const chunked = join(root, name, 'chunked');
+    if (existsSync(join(chunked, 'engine.js')) && existsSync(join(chunked, '_ground-truth.json'))) {
+      out.push({ name, chunked });
+    }
+  }
+  return out.sort((a, b) => a.name.localeCompare(b.name));
+}
+
+async function benchModel(model) {
+  mkdirSync(RESULTS_DIR, { recursive: true });
+  const reportPath = join(RESULTS_DIR, `persheet-${model.name}-${STAMP}.json`);
+  const gtSizeMB = +(statSync(join(model.chunked, '_ground-truth.json')).size / 1e6).toFixed(1);
+
+  const t = Date.now();
+  let evalError = null;
+  try {
+    // --skip-clusters by default: circular-cluster sheets need the single-pass
+    // orchestrator eval (a follow-up); evaluating them per-sheet is infeasible on
+    // big models. Pass --with-clusters once that lands. Standalone sheets give a
+    // fast, real accuracy number; skipped sheets are reported with their reason.
+    const evalArgs = ['--max-old-space-size=8192', EVAL_SCRIPT, model.chunked, '--output', reportPath, '--concurrency', String(CONCURRENCY)];
+    if (!process.argv.includes('--with-clusters')) evalArgs.push('--skip-clusters');
+    await execFileAsync('node', evalArgs, { maxBuffer: 64 * 1024 * 1024 });
+  } catch (e) {
+    evalError = (e.stderr || e.message || String(e)).slice(0, 200);
+  }
+  const wallMs = Date.now() - t;
+
+  if (!existsSync(reportPath)) {
+    return { name: model.name, evalError: evalError || 'no report produced', efficacy: { wallMs, gtSizeMB } };
+  }
+  const report = JSON.parse(readFileSync(reportPath, 'utf-8'));
+  // Strip topFailures (real cell values) from anything we keep in memory for the
+  // committed summary; the gitignored report file retains full detail.
+  const sheets = (report.sheets || [])
+    .map(s => ({ name: s.name, status: s.status, accuracy: s.accuracy, correct: s.correct, total: s.total }))
+    .sort((a, b) => a.accuracy - b.accuracy);
+
+  return {
+    name: model.name,
+    evalError,
+    summary: report.summary,
+    skipped: report.skipped || [],
+    sheets,
+    efficacy: { wallMs, gtSizeMB },
+  };
+}
+
+// ── Run ──────────────────────────────────────────────────────────────────────
+const models = discoverModels(ROOT);
+if (models.length === 0) {
+  console.error(`No models found under ${ROOT} (need <model>/chunked/engine.js + _ground-truth.json).`);
+  console.error('Point --root at a dir of parsed engines (the real PE engines live in the gitignored engines/).');
+  process.exit(2);
+}
+
+// Anonymize model identity in printed + committed output (Model A, B, …). Real
+// dir names stay only in the gitignored detail JSON + the operator's local
+// notes — this repo is public and must not name the real models.
+models.forEach((m, i) => { m.label = `Model ${String.fromCharCode(65 + (i % 26))}`; });
+
+console.log(`model benchmark — ${models.length} model(s) under ${ROOT}\n`);
+const results = [];
+for (const m of models) {
+  process.stdout.write(`  ${m.label} ... `);
+  const r = await benchModel(m);
+  r.label = m.label;
+  results.push(r);
+  if (r.evalError && !r.summary) console.log(`eval FAILED (${r.evalError})`);
+  else {
+    const s = r.summary;
+    console.log(`acc ${s.overallAccuracy}%  (${s.totalCellsCorrect}/${s.totalCellsTested})  ` +
+      `${s.sheetsPassing}/${s.sheetsEvaluated} sheets ≥95%, ${s.sheetsSkipped} skipped  [${(r.efficacy.wallMs / 1000).toFixed(0)}s]`);
+  }
+}
+
+mkdirSync(RESULTS_DIR, { recursive: true });
+const detailPath = join(RESULTS_DIR, `summary-${STAMP}.json`);
+writeFileSync(detailPath, JSON.stringify({ stamp: STAMP, root: ROOT, results }, null, 2));
+console.log(`\nDetail (gitignored): ${detailPath}`);
+
+// ── Committed aggregate (no values, no full sheet inventory) ──────────────────
+function renderBaseline(stamp, results) {
+  const L = [];
+  L.push('# model benchmark — baseline & history');
+  L.push('');
+  L.push('Real accuracy: each standalone sheet recomputed live vs ground truth via');
+  L.push('`eval/per-sheet-eval.mjs` (numbers within 1% rel. tol, strings exact).');
+  L.push('Circular-cluster sheets and oversized sheets are **skipped** for now (see');
+  L.push('the Skipped column + blockers below) pending the single-pass orchestrator');
+  L.push('eval; run with `--with-clusters` once that lands. Aggregate-only — no cell');
+  L.push('values or full sheet inventory. Regenerate:');
+  L.push('`node benchmarks/bench.mjs --root <engines>`. Full per-sheet detail');
+  L.push('lands in the gitignored `benchmarks/results/`.');
+  L.push('');
+  L.push(`_Last run: ${stamp}_`);
+  L.push('');
+  L.push('| Model | Accuracy | Cells matched | Sheets ≥95% | Skipped | Eval time | GT |');
+  L.push('|-------|---------:|------:|:-----------:|:-------:|----------:|---:|');
+  for (const r of results) {
+    if (!r.summary) { L.push(`| ${r.label} | eval failed | — | — | — | — | ${r.efficacy.gtSizeMB} MB |`); continue; }
+    const s = r.summary;
+    L.push(`| ${r.label} | ${s.overallAccuracy}% | ${s.totalCellsCorrect}/${s.totalCellsTested} | ` +
+      `${s.sheetsPassing}/${s.sheetsEvaluated} | ${s.sheetsSkipped} | ${(r.efficacy.wallMs / 1000).toFixed(0)}s | ${r.efficacy.gtSizeMB} MB |`);
+  }
+  L.push('');
+  L.push('## Known blocker categories');
+  L.push('');
+  L.push('Tracked by name because PLAN.md already calls them out; values are accuracy %, not financials.');
+  L.push('');
+  const PUBLIC = [/pp&?e/i, /headcount/i];
+  for (const r of results) {
+    if (!r.summary) { L.push(`- **${r.label}**: eval failed`); continue; }
+    const blockers = [];
+    for (const sk of r.skipped) if (PUBLIC.some(re => re.test(sk.name))) blockers.push(`${sk.name} (skipped: ${sk.reason})`);
+    for (const sh of r.sheets) if (PUBLIC.some(re => re.test(sh.name))) blockers.push(`${sh.name} ${sh.accuracy}%`);
+    const lowest = r.sheets.filter(s => s.status !== 'ok' || s.accuracy < 95).length;
+    L.push(`- **${r.label}**: ${r.summary.sheetsEvaluated - lowest}/${r.summary.sheetsEvaluated} sheets clean; ` +
+      `blockers: ${blockers.join('; ') || 'none surfaced'}`);
+  }
+  L.push('');
+  return L.join('\n');
+}
+const baselinePath = join(__dirname, 'BASELINE.md');
+writeFileSync(baselinePath, renderBaseline(STAMP, results));
+console.log(`Baseline (committed): ${baselinePath}`);
diff --git a/eval/per-sheet-eval.mjs b/eval/per-sheet-eval.mjs
index e3339f3..3ee442c 100644
--- a/eval/per-sheet-eval.mjs
+++ b/eval/per-sheet-eval.mjs
@@ -16,7 +16,7 @@
 import { readFile, writeFile, mkdir, stat, readdir, unlink } from 'fs/promises';
 import { existsSync } from 'fs';
 import { join, resolve, basename, dirname } from 'path';
-import { fileURLToPath } from 'url';
+import { fileURLToPath, pathToFileURL } from 'url';
 import { execFile } from 'child_process';
 import { promisify } from 'util';
 
@@ -36,6 +36,12 @@ function getFlag(name, fallback) {
 const OUTPUT_FILE = getFlag('output', join(chunkedDir, '..', 'per-sheet-report.json'));
 const CONCURRENCY = parseInt(getFlag('concurrency', process.env.EVAL_CONCURRENCY || '6'));
 const SAMPLE_SIZE = parseInt(getFlag('sample', process.env.SAMPLE_SIZE || '2000'));
+// --skip-clusters: record circular-cluster sheets as skipped instead of
+// evaluating them. The current convergence path re-runs the whole cluster once
+// per member sheet (O(cluster²) work), which is infeasible on big models; this
+// flag yields a fast, real accuracy number for the standalone sheets while the
+// single-pass orchestrator eval is built. See ROADMAP (circular-cluster eval).
+const SKIP_CLUSTERS = args.includes('--skip-clusters');
 const NODE_HEAP_MB = parseInt(process.env.NODE_HEAP_MB || '8192');
 const MAX_SHEET_SIZE_MB = parseInt(process.env.MAX_SHEET_SIZE_MB || '150');
 
@@ -139,6 +145,11 @@ async function main() {
       continue;
     }
 
+    if (SKIP_CLUSTERS && clusterSheetSet.has(entry.name)) {
+      skipped.push({ name: entry.name, reason: 'circular cluster (--skip-clusters; needs single-pass orchestrator eval)' });
+      continue;
+    }
+
     // Sample ground truth if sheet has too many entries
     let sampleGt = entry.gt;
     if (entry.totalCount > SAMPLE_SIZE) {
@@ -185,15 +196,15 @@ async function main() {
     const cluster = sheetClusters.find(c => c.includes(sheetName));
     const clusterModules = cluster ? cluster.map(s => {
       const san = s.replace(/[^a-zA-Z0-9]/g, '_');
-      const modPath = join(sheetsDir, `${san}.mjs`).replace(/\\/g, '/');
+      const modPath = join(sheetsDir, `${san}.mjs`);
       return { name: s, sanitized: san, path: modPath };
-    }).filter(m => existsSync(join(sheetsDir, `${m.sanitized}.mjs`))) : [];
+    }).filter(m => existsSync(m.path)) : [];
 
     // Build a child process script that loads the sheet module(s) and compares.
     // Paths flow into JS source — interpolate them as JSON-quoted strings so a
     // path containing `'` or `\` can't break out and inject code.
     const clusterImports = clusterModules.length > 0
-      ? clusterModules.map(m => `import { compute as compute_${m.sanitized} } from ${JSON.stringify(m.path)};`).join('\n')
+      ? clusterModules.map(m => `import { compute as compute_${m.sanitized} } from ${JSON.stringify(pathToFileURL(m.path).href)};`).join('\n')
       : '';
     const clusterComputeBlock = clusterModules.length > 0
       ? `
@@ -201,21 +212,22 @@ async function main() {
   const clusterFns = [${clusterModules.map(m => `compute_${m.sanitized}`).join(', ')}];
   const MAX_ITER = 200;
   const TOL = 1e-6;
-  let prevSnapshot = {};
+  const prevSnapshot = {};
   for (let _ci = 0; _ci < MAX_ITER; _ci++) {
     for (const fn of clusterFns) fn(ctx);
-    // Check convergence on numeric values
+    // Convergence is about the cluster's *computed* cells stabilizing, so diff
+    // only the cells the cluster wrote (ctx._written) — not every seeded
+    // ground-truth cell. On a model with millions of seeded cells the old
+    // O(all-cells)-per-iteration diff was a dominant cost (and × 200 iters).
     let maxDelta = 0;
-    const snapshot = {};
-    for (const [k, v] of Object.entries(ctx.values)) {
-      if (typeof v === 'number') {
-        snapshot[k] = v;
-        const prev = prevSnapshot[k] || 0;
-        const d = Math.abs(v - prev);
-        if (d > maxDelta) maxDelta = d;
-      }
+    for (const k of ctx._written) {
+      const v = ctx.values[k];
+      if (typeof v !== 'number') continue;
+      const prev = prevSnapshot[k] || 0;
+      const d = Math.abs(v - prev);
+      if (d > maxDelta) maxDelta = d;
+      prevSnapshot[k] = v;
     }
-    prevSnapshot = snapshot;
     if (_ci > 0 && maxDelta < TOL) break;
   }
 `
@@ -226,7 +238,7 @@ async function main() {
 
     const evalScript = `
 import { readFile } from 'fs/promises';
-import { compute } from ${JSON.stringify(modulePath.replace(/\\/g, '/'))};
+import { compute } from ${JSON.stringify(pathToFileURL(modulePath).href)};
 ${clusterImports}
 
 const allGt = JSON.parse(await readFile(${JSON.stringify(gtFullPath.replace(/\\/g, '/'))}, 'utf8'));
@@ -236,8 +248,9 @@ const cn = s => { let n=0; for(const c of s) n = n*26+c.charCodeAt(0)-64; return
 const nc = n => { let s=''; while(n>0){n--;s=String.fromCharCode(65+(n%26))+s;n=Math.floor(n/26);} return s; };
 const ctx = {
   values: {},
+  _written: new Set(),  // cells written by compute() — the cluster convergence diffs only these
   get(addr) { return this.values[addr] !== undefined ? this.values[addr] : 0; },
-  set(addr, value) { this.values[addr] = value; },
+  set(addr, value) { this.values[addr] = value; this._written.add(addr); },
   _parseRange(rangeStr) {
     const m = rangeStr.match(/^(.+)!([A-Z]+)(\\d+):([A-Z]+)(\\d+)$/);
     if (!m) return null;
diff --git a/lib/manifest.mjs b/lib/manifest.mjs
index e234e8f..119f1d1 100644
--- a/lib/manifest.mjs
+++ b/lib/manifest.mjs
@@ -374,6 +374,19 @@ export function loadLabelIndex(modelDir) {
   return null;
 }
 
+// Column-probe bounds for numericsForRow (label search). Excel's hard ceiling
+// is XFD (16384); we stop after a long run of empty columns so a label-only row
+// costs a few hundred O(1) lookups instead of a full ground-truth scan.
+const LABEL_PROBE_MAX_COL = 16384;
+const LABEL_PROBE_MAX_GAP = 256;
+
+// 1 → "A", 26 → "Z", 27 → "AA". Inverse of the col parsing in buildLabelIndex.
+function numToColLetters(num) {
+  let col = '';
+  while (num > 0) { const r = (num - 1) % 26; col = String.fromCharCode(65 + r) + col; num = Math.floor((num - 1) / 26); }
+  return col;
+}
+
 /**
  * Search ground truth for cells matching a label pattern.
  * Returns matching labels with adjacent numeric values.
@@ -439,20 +452,24 @@ export function searchByLabel(gt, pattern, options = {}) {
   }
 
   // For each candidate, collect adjacent numeric values on the same row.
-  // Index adjacent numerics by (sheet, row) for efficient lookup on repeated rows.
+  // Probe the row's columns on demand (memoized) rather than scanning the whole
+  // ground truth once per row — the same approach the refiner uses. On a 200 MB
+  // ground truth a full scan per matched row is ~tens of ms each; probing the
+  // contiguous numeric block is effectively free. Stops after a long run of
+  // empty columns (LABEL_PROBE_MAX_GAP). A directed caseColumn lookup (below) probes
+  // its exact cell separately, so a far scenario column is never missed.
   const numByRow = new Map();
   function numericsForRow(sheet, row) {
     const key = `${sheet}!${row}`;
     if (numByRow.has(key)) return numByRow.get(key);
     const vals = [];
-    const prefix = sheet + '!';
-    for (const [addr, v] of Object.entries(gt)) {
-      if (typeof v !== 'number') continue;
-      if (!addr.startsWith(prefix)) continue;
-      const cellPart = addr.substring(prefix.length);
-      const m = cellPart.match(/^([A-Z]+)(\d+)$/);
-      if (!m || parseInt(m[2], 10) !== row) continue;
-      vals.push({ col: m[1], addr, value: v });
+    let gap = 0;
+    for (let c = 1; c <= LABEL_PROBE_MAX_COL && gap < LABEL_PROBE_MAX_GAP; c++) {
+      const col = numToColLetters(c);
+      const addr = `${sheet}!${col}${row}`;
+      const v = gt[addr];
+      if (typeof v === 'number') { vals.push({ col, addr, value: v }); gap = 0; }
+      else gap++;
     }
     numByRow.set(key, vals);
     return vals;
@@ -465,6 +482,12 @@ export function searchByLabel(gt, pattern, options = {}) {
       const target = String(caseColumn).toUpperCase();
       const hit = adjacentValues.find(v => v.col === target);
       if (hit) caseValue = hit.value;
+      else {
+        // Directed lookup: probe the exact cell so a scenario column beyond the
+        // adjacent-block probe window is still resolved.
+        const direct = gt[`${c.sheet}!${target}${c.row}`];
+        if (typeof direct === 'number') caseValue = direct;
+      }
       adjacentValues = adjacentValues.slice().sort((a, b) => {
         if (a.col === target) return -1;
         if (b.col === target) return 1;
diff --git a/package.json b/package.json
index 4e00df7..438660f 100644
--- a/package.json
+++ b/package.json
@@ -41,7 +41,8 @@
     "test:engine": "node pipelines/rust/tests/test-engine-runtime.mjs",
     "test:depgraph": "node pipelines/rust/tests/test-dependency-graph.mjs",
     "test:slimming": "node tests/cli/test-artifact-slimming.mjs",
-    "test": "node tests/cli/test-cli.mjs && node tests/cli/test-manifest-improvements.mjs && node tests/cli/test-manifest-maps.mjs && node tests/cli/test-refine-label-index.mjs && node tests/cli/test-init-shared-gt.mjs && node tests/cli/test-ai-interface.mjs && node tests/cli/test-e2e4-fixes.mjs && node tests/cli/test-ship-ready.mjs && node tests/cli/use-case-suite.mjs"
+    "test": "node tests/lib/test-lib.mjs && node tests/cli/test-cli.mjs && node tests/cli/test-manifest-improvements.mjs && node tests/cli/test-manifest-maps.mjs && node tests/cli/test-refine-label-index.mjs && node tests/cli/test-init-shared-gt.mjs && node tests/cli/test-per-sheet-eval.mjs && node tests/cli/test-ai-interface.mjs && node tests/cli/test-e2e4-fixes.mjs && node tests/cli/test-ship-ready.mjs && node tests/cli/use-case-suite.mjs",
+    "bench": "node benchmarks/bench.mjs"
   },
   "devDependencies": {}
 }
diff --git a/tests/cli/fixtures/cluster-model/chunked/_graph.json b/tests/cli/fixtures/cluster-model/chunked/_graph.json
new file mode 100644
index 0000000..c2082a3
--- /dev/null
+++ b/tests/cli/fixtures/cluster-model/chunked/_graph.json
@@ -0,0 +1,9 @@
+{
+  "topoOrder": ["SheetA", "SheetB"],
+  "sheetClusters": [["SheetA", "SheetB"]],
+  "edges": {
+    "SheetA!b": ["SheetB!d"],
+    "SheetB!c": ["SheetA!a", "SheetA!b"],
+    "SheetB!d": ["SheetB!c"]
+  }
+}
diff --git a/tests/cli/fixtures/cluster-model/chunked/_ground-truth.json b/tests/cli/fixtures/cluster-model/chunked/_ground-truth.json
new file mode 100644
index 0000000..14ab3d7
--- /dev/null
+++ b/tests/cli/fixtures/cluster-model/chunked/_ground-truth.json
@@ -0,0 +1 @@
+{"SheetA!a":50,"SheetA!b":50,"SheetB!c":100,"SheetB!d":100}
diff --git a/tests/cli/fixtures/cluster-model/chunked/sheets/SheetA.mjs b/tests/cli/fixtures/cluster-model/chunked/sheets/SheetA.mjs
new file mode 100644
index 0000000..97c20a8
--- /dev/null
+++ b/tests/cli/fixtures/cluster-model/chunked/sheets/SheetA.mjs
@@ -0,0 +1,8 @@
+// Synthetic circular-cluster fixture (SheetA ↔ SheetB) for per-sheet-eval tests.
+// SheetA reads SheetB!d; SheetB reads SheetA. Converges to a=50, b=50, c=100, d=100.
+export const SHEET_NAME = 'SheetA';
+export const SHEET_DEPENDENCIES = ['SheetB'];
+export function compute(ctx) {
+  ctx.set('SheetA!a', 50);                       // constant
+  ctx.set('SheetA!b', ctx.get('SheetB!d') / 2);  // reads across the cluster
+}
diff --git a/tests/cli/fixtures/cluster-model/chunked/sheets/SheetB.mjs b/tests/cli/fixtures/cluster-model/chunked/sheets/SheetB.mjs
new file mode 100644
index 0000000..576c849
--- /dev/null
+++ b/tests/cli/fixtures/cluster-model/chunked/sheets/SheetB.mjs
@@ -0,0 +1,7 @@
+// Synthetic circular-cluster fixture (SheetA ↔ SheetB) for per-sheet-eval tests.
+export const SHEET_NAME = 'SheetB';
+export const SHEET_DEPENDENCIES = ['SheetA'];
+export function compute(ctx) {
+  ctx.set('SheetB!c', ctx.get('SheetA!a') + ctx.get('SheetA!b')); // reads across the cluster
+  ctx.set('SheetB!d', ctx.get('SheetB!c'));
+}
diff --git a/tests/cli/test-per-sheet-eval.mjs b/tests/cli/test-per-sheet-eval.mjs
new file mode 100644
index 0000000..ea32d9c
--- /dev/null
+++ b/tests/cli/test-per-sheet-eval.mjs
@@ -0,0 +1,100 @@
+#!/usr/bin/env node
+/**
+ * Guard test for eval/per-sheet-eval.mjs.
+ *
+ * per-sheet-eval generates a temp wrapper module per sheet that imports the
+ * sheet's compute() by absolute path. On Windows, ESM rejects a bare absolute
+ * path ("C:\..."); it must be a file:// URL. That bug made EVERY sheet "crash"
+ * at load (0% accuracy) on Windows and on the real engines — and per-sheet-eval
+ * wasn't in CI, so it went unnoticed. This test runs the eval on the committed
+ * smoke engine and asserts no sheet crashed and accuracy is the known-good 100%,
+ * so the regression can't come back (CI runs it on ubuntu AND windows).
+ *
+ * Also exercises --skip-clusters (used by the model benchmark).
+ *
+ * Pure JS; uses the committed smoke chunked fixture (no parser needed).
+ */
+
+import { execFileSync } from 'child_process';
+import { mkdtempSync, cpSync, rmSync, existsSync, readFileSync } from 'fs';
+import { join, dirname } from 'path';
+import { tmpdir } from 'os';
+import { fileURLToPath } from 'url';
+
+const __dir = dirname(fileURLToPath(import.meta.url));
+const ROOT = join(__dir, '..', '..');
+const SMOKE = join(ROOT, 'pipelines', 'rust', 'tests', 'output', 'chunked');
+const EVAL = join(ROOT, 'eval', 'per-sheet-eval.mjs');
+
+let passed = 0, failed = 0;
+const assert = (c, m) => { if (c) passed++; else { failed++; console.error(`  FAIL: ${m}`); } };
+
+if (!existsSync(join(SMOKE, 'engine.js')) || !existsSync(join(SMOKE, '_graph.json'))) {
+  console.log('SKIP: smoke chunked fixture not found');
+  process.exit(0);
+}
+
+// Copy the fixture to a temp dir so per-sheet-eval's _eval_tmp scratch never
+// touches the tracked fixture.
+const tmp = mkdtempSync(join(tmpdir(), 'pse-'));
+const chunked = join(tmp, 'chunked');
+cpSync(SMOKE, chunked, { recursive: true });
+
+function runEval(extraArgs) {
+  const out = join(tmp, `report-${extraArgs.join('') || 'base'}.json`);
+  try {
+    execFileSync('node', [EVAL, chunked, '--output', out, ...extraArgs],
+      { encoding: 'utf-8', stdio: 'pipe', maxBuffer: 32 * 1024 * 1024 });
+  } catch { /* nonzero exit handled via report inspection */ }
+  return existsSync(out) ? JSON.parse(readFileSync(out, 'utf-8')) : null;
+}
+
+console.log('Testing: per-sheet-eval runs on the smoke engine (guards the Windows ESM-import fix)');
+{
+  const r = runEval([]);
+  assert(r !== null, 'report written');
+  if (r) {
+    assert(r.summary.sheetsEvaluated >= 3, `evaluated ≥3 sheets (got ${r.summary.sheetsEvaluated})`);
+    assert(r.summary.sheetsWithErrors === 0,
+      `no sheet crashed at import (errors: ${r.summary.sheetsWithErrors}) — guards the abs-path → file:// fix`);
+    assert(r.summary.overallAccuracy === 100, `smoke accuracy 100% (got ${r.summary.overallAccuracy})`);
+  }
+}
+
+console.log('Testing: --skip-clusters produces a report without error');
+{
+  const r = runEval(['--skip-clusters']);
+  assert(r !== null, '--skip-clusters report written');
+  if (r) assert(typeof r.summary.overallAccuracy === 'number', 'summary present with --skip-clusters');
+}
+
+console.log('Testing: circular-cluster convergence (synthetic SheetA↔SheetB fixture)');
+{
+  const CLUSTER = join(ROOT, 'tests', 'cli', 'fixtures', 'cluster-model', 'chunked');
+  if (existsSync(join(CLUSTER, '_graph.json'))) {
+    const ctmp = mkdtempSync(join(tmpdir(), 'pse-cl-'));
+    const cchunked = join(ctmp, 'chunked');
+    cpSync(CLUSTER, cchunked, { recursive: true });
+    const out = join(ctmp, 'report.json');
+    try {
+      execFileSync('node', [EVAL, cchunked, '--output', out], { encoding: 'utf-8', stdio: 'pipe', maxBuffer: 32 * 1024 * 1024 });
+    } catch { /* inspect report */ }
+    const r = existsSync(out) ? JSON.parse(readFileSync(out, 'utf-8')) : null;
+    assert(r !== null, 'cluster report written');
+    if (r) {
+      assert(r.summary.sheetsEvaluated === 2, `both cluster sheets evaluated (got ${r.summary.sheetsEvaluated})`);
+      assert(r.summary.sheetsWithErrors === 0, `cluster converged without error (errors: ${r.summary.sheetsWithErrors})`);
+      // Converges to a=50,b=50,c=100,d=100 — exercises the convergence loop and
+      // the scoped (written-cells-only) convergence diff.
+      assert(r.summary.overallAccuracy === 100, `cluster fixture 100% via convergence (got ${r.summary.overallAccuracy})`);
+    }
+    rmSync(ctmp, { recursive: true, force: true });
+  } else {
+    console.log('  (skip: cluster fixture missing)');
+  }
+}
+
+rmSync(tmp, { recursive: true, force: true });
+console.log('');
+console.log(`Results: ${passed} passed, ${failed} failed, ${passed + failed} total`);
+process.exit(failed > 0 ? 1 : 0);
diff --git a/tests/lib/test-lib.mjs b/tests/lib/test-lib.mjs
new file mode 100644
index 0000000..72f2e0a
--- /dev/null
+++ b/tests/lib/test-lib.mjs
@@ -0,0 +1,136 @@
+#!/usr/bin/env node
+/**
+ * Unit tests for the shared financial libraries in lib/ — known-answer cases for
+ * the pure math (IRR/NPV/XIRR, the PE distribution waterfall) plus the
+ * calibration + sensitivity helpers. These had no direct coverage; CI now guards
+ * them on every push.
+ *
+ * Usage: node tests/lib/test-lib.mjs
+ */
+
+import { npv, npvDerivative, computeIRR, computeIRRBisection, computeXIRR } from '../../lib/irr.mjs';
+import {
+  computeWaterfall, createAmericanWaterfall, createEuropeanWaterfall, createMoicHurdleWaterfall,
+} from '../../lib/waterfall.mjs';
+import { getNestedValue, setNestedValue, validateOutputs } from '../../lib/calibration.mjs';
+import { flattenOutputs } from '../../lib/sensitivity.mjs';
+
+let passed = 0, failed = 0; // running tally, printed by the final Results line
+function assert(cond, msg) { if (!cond) { failed++; console.error(`  FAIL: ${msg}`); return; } passed++; } // one pass, or one failure plus a FAIL line on stderr
+function near(a, b, tol, msg) { const isClose = typeof a === 'number' && Math.abs(a - b) <= tol; assert(isClose, `${msg} (got ${a}, want ≈${b} ±${tol})`); } // tolerance compare; a non-number `a` counts as a failure
+
+// ── IRR / NPV / XIRR ─────────────────────────────────────────────────────────
+console.log('Testing: lib/irr.mjs');
+{
+  near(npv([-100, 110], 0.10), 0, 1e-9, 'npv zero at the break-even rate');
+  near(npv([100], 0.5), 100, 1e-12, 'npv of a single t=0 flow is itself');
+  near(npv([0, 100], 0), 100, 1e-12, 'npv at rate 0 is the undiscounted sum');
+  assert(npvDerivative([-100, 110], 0.10) < 0, 'npv decreases as rate rises (negative derivative)');
+  // Known-answer IRRs. The 8×200 annuity on -1000 solves (1 − (1+r)^−8)/r = 5, which gives r ≈ 11.81%.
+  near(computeIRR([-100, 150]), 0.50, 1e-4, 'IRR of -100 → +150 is 50%');
+  near(computeIRR([-100, 110]), 0.10, 1e-4, 'IRR of -100 → +110 is 10%');
+  near(computeIRR([-1000, 200, 200, 200, 200, 200, 200, 200, 200]), 0.11815, 1e-3, 'IRR of -1000 then 200×8 ≈ 11.81%');
+  near(computeIRR([-1000, 0, 0, 1100]), 0.03228, 1e-3, 'IRR of -1000 → +1100 in 3y ≈ 3.23%');
+  // NPV(IRR) ≈ 0 sanity for a found rate
+  const r = computeIRR([-500, 100, 200, 300]);
+  near(npv([-500, 100, 200, 300], r), 0, 1e-5, 'NPV at the solved IRR is ~0');
+  // Degenerate inputs: these checks expect null back instead of a rate.
+  assert(computeIRR([10, 20]) === null, 'no sign change → null IRR');
+  assert(computeIRR([-10]) === null, 'single flow → null IRR');
+  // The bisection solver is held to the same known answers as computeIRR.
+  near(computeIRRBisection([-100, 150]), 0.50, 1e-4, 'bisection agrees: 50%');
+  near(computeIRRBisection([-1000, 200, 200, 200, 200, 200, 200, 200, 200]), 0.11815, 1e-3, 'bisection agrees: ~11.81%');
+  // XIRR over dated flows. 2020 is a leap year (366 days between the dates) — presumably why the tolerance is looser; confirm against the library's day-count.
+  const xirr = computeXIRR([
+    { date: new Date('2020-01-01'), amount: -1000 },
+    { date: new Date('2021-01-01'), amount: 1100 },
+  ]);
+  near(xirr, 0.10, 2e-3, 'XIRR of -1000 → +1100 one year later ≈ 10%');
+}
+
+// ── Waterfall ────────────────────────────────────────────────────────────────
+console.log('Testing: lib/waterfall.mjs');
+{
+  // Equity is 100M throughout; proceeds are 200M unless a scenario passes its own figure.
+  const EQUITY = 100_000_000;
+  const PROCEEDS = 200_000_000;
+  const americanTerms = { prefReturn: 0.08, carryPercent: 0.20, residualLPSplit: 0.80 };
+
+  // Standard American 80/20, 8% pref, full catch-up, 1y hold (simple pref = 100M × 8% = 8M).
+  const american = createAmericanWaterfall({ ...americanTerms, hasCatchup: true });
+  assert(american.length === 4, `American (with catch-up) has 4 tiers (got ${american.length})`);
+  const withCatchup = computeWaterfall(PROCEEDS, EQUITY, american, { holdPeriodYears: 1 });
+  near(withCatchup.totalDistributed, 200_000_000, 1, 'all proceeds distributed');
+  near(withCatchup.gpTotal, 34_400_000, 1, 'GP carry = catch-up 20M + residual 14.4M = 34.4M');
+  near(withCatchup.lpTotal, 165_600_000, 1, 'LP = ROC 100M + pref 8M + residual 57.6M = 165.6M');
+  near(withCatchup.lpTotal + withCatchup.gpTotal, withCatchup.totalDistributed, 1, 'conservation: LP + GP = distributed');
+  near(withCatchup.gpCarryPercent, 0.344, 1e-4, 'GP carry % of profit = 34.4%');
+  near(withCatchup.undistributed, 0, 1, 'nothing left undistributed');
+  // Dropping the catch-up removes one tier and must leave the GP with less.
+  const noCatchup = createAmericanWaterfall({ ...americanTerms, hasCatchup: false });
+  assert(noCatchup.length === 3, `American (no catch-up) has 3 tiers (got ${noCatchup.length})`);
+  const withoutCatchup = computeWaterfall(PROCEEDS, EQUITY, noCatchup, { holdPeriodYears: 1 });
+  assert(withoutCatchup.gpTotal < withCatchup.gpTotal, 'no-catch-up GP carry is lower than with catch-up');
+  near(withoutCatchup.lpTotal + withoutCatchup.gpTotal, withoutCatchup.totalDistributed, 1, 'conservation holds (no catch-up)');
+  // Loss case: 80M back on 100M equity → no carry, and the LP takes all of it.
+  const loss = computeWaterfall(80_000_000, EQUITY, american, { holdPeriodYears: 1 });
+  near(loss.gpTotal, 0, 1, 'no GP carry on a loss');
+  near(loss.lpTotal, 80_000_000, 1, 'LP receives all proceeds on a loss');
+  near(loss.gpCarryPercent, 0, 1e-9, 'GP carry % is 0 when there is no profit');
+  // Flat MOIC hurdle (no IRR pref): 1.40x hurdle, 20% promote, 2.0x MOIC.
+  const moic = createMoicHurdleWaterfall({ hurdleMOIC: 1.40, carryPercent: 0.20 });
+  const flatHurdle = computeWaterfall(PROCEEDS, EQUITY, moic);
+  near(flatHurdle.gpTotal, 12_000_000, 1, 'MOIC-hurdle GP = 20% × (200M − 140M) = 12M');
+  near(flatHurdle.lpTotal, 188_000_000, 1, 'MOIC-hurdle LP = 188M');
+  // Documented invariant: the flat MOIC hurdle must NOT move with the hold period.
+  const flatHurdleTenYears = computeWaterfall(PROCEEDS, EQUITY, moic, { holdPeriodYears: 10 });
+  near(flatHurdleTenYears.gpTotal, flatHurdle.gpTotal, 1, 'flat MOIC hurdle is hold-period-independent');
+  // The European builder must yield a tier list that starts with ROC and conserves cash.
+  const euro = createEuropeanWaterfall([
+    { hurdle: 0.08, carry: 0.00 }, { hurdle: 0.12, carry: 0.20 }, { hurdle: Infinity, carry: 0.30 },
+  ]);
+  assert(euro[0].type === 'return_of_capital', 'European waterfall starts with return of capital');
+  const europeanRun = computeWaterfall(150_000_000, EQUITY, euro, { holdPeriodYears: 1 });
+  near(europeanRun.lpTotal + europeanRun.gpTotal, europeanRun.totalDistributed, 1, 'conservation holds (European)');
+}
+
+// ── Calibration helpers ──────────────────────────────────────────────────────
+console.log('Testing: lib/calibration.mjs');
+{
+  assert(getNestedValue({ a: { b: { c: 42 } } }, 'a.b.c') === 42, 'getNestedValue reads a deep path');
+  assert(getNestedValue({ a: {} }, 'a.b.c') === undefined, 'getNestedValue returns undefined for a missing path');
+  // Start from an empty object so the x → y chain has to be created by the call itself.
+  const obj = {};
+  setNestedValue(obj, 'x.y.z', 5);
+  assert(obj.x?.y?.z === 5, 'setNestedValue creates intermediate objects');
+  // Two expectations at tolerance 0.01: moic matches exactly, irr is off (0.20 vs 0.25).
+  const v = validateOutputs(
+    { returns: { moic: 2.00, irr: 0.20 } },
+    [{ key: 'returns.moic', excelValue: 2.00 }, { key: 'returns.irr', excelValue: 0.25 }],
+    { tolerance: 0.01 },
+  );
+  assert(v.totalCount === 2, 'validateOutputs reports total count');
+  assert(v.passCount === 1 && v.failCount === 1, 'validateOutputs: moic passes, irr (0.20 vs 0.25) fails at 1% tol');
+  assert(v.allPassed === false, 'validateOutputs.allPassed false when any fails');
+  const irrRes = v.results.find(r => r.key === 'returns.irr');
+  near(irrRes?.suggestedFactor, 0.25 / 0.20, 1e-9, 'validateOutputs suggests the corrective factor'); // ?. makes a missing entry a recorded FAIL, not a TypeError that aborts the run
+}
+
+// ── Sensitivity helpers ──────────────────────────────────────────────────────
+console.log('Testing: lib/sensitivity.mjs');
+{
+  // Input mixes numeric leaves, one string leaf, and a group the checks below expect to be absent.
+  const returns = { moic: 2.0, irr: 0.2, label: 'skip-strings' };
+  const waterfall = { gpCarry: 1_000_000 };
+  const ignoredGroup = { x: 99 };
+  const flattened = flattenOutputs({ returns, waterfall, ignoredGroup });
+  assert(flattened['returns.moic'] === 2.0 && flattened['returns.irr'] === 0.2, 'flattenOutputs flattens numeric outputs');
+  assert(flattened['waterfall.gpCarry'] === 1_000_000, 'flattenOutputs includes the waterfall group');
+  assert(!('returns.label' in flattened), 'flattenOutputs drops non-numeric values');
+  assert(!('ignoredGroup.x' in flattened), 'flattenOutputs only includes known output groups');
+}
+
+// ── Summary: print the tally and turn it into the process exit code ──────────
+console.log('');
+console.log(`Results: ${passed} passed, ${failed} failed, ${passed + failed} total`);
+process.exit(Number(failed > 0)); // exit code 1 when any check failed, otherwise 0