From 40e42011cd3871e5e8350ba585f06d16f6d54a14 Mon Sep 17 00:00:00 2001
From: Eric Boothe <ericboothe@gmail.com>
Date: Thu, 28 May 2026 17:16:53 -0600
Subject: [PATCH 1/8] feat(bench): Outpost accuracy benchmark + per-sheet-eval
 fixes + searchByLabel lazy numerics
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Keystone of the next-wave effort: a repeatable accuracy + efficacy benchmark
over the real ~200 MB Outpost models, plus fixes to the eval tooling that was
silently broken on them.

Benchmark (benchmarks/outpost-bench.mjs, `npm run bench:outpost`):
- Wraps eval/per-sheet-eval.mjs (live engine-vs-ground-truth) per model;
  reports overall accuracy + per-sheet pass/skip + timings. Aggregate-only
  results -> committed benchmarks/BASELINE.md; full detail -> gitignored
  benchmarks/results/. No cell value/label is ever committed.
- Baseline: outpost-a1 84.3%, outpost-a2 85.5% on standalone sheets (17-sheet
  cluster + 190 MB PP&E skipped for now).

per-sheet-eval (wasn't in CI -> bugs went unnoticed):
- Windows crash FIXED: it imported each sheet's compute() by a bare absolute
  path ("C:/..."), which Node ESM rejects on Windows (the drive letter is
  parsed as a URL scheme) -> every sheet crashed at load (0% accuracy) on
  Windows and on the real engines. Now uses pathToFileURL().
  New tests/cli/test-per-sheet-eval.mjs (6) guards it; CI runs on windows-latest.
- --skip-clusters flag: record circular-cluster sheets as skipped (the current
  convergence re-runs the whole cluster once per member -> O(cluster²),
  infeasible on big models) pending the single-pass orchestrator eval.

searchByLabel (query/carry): probe the matched row's columns on demand instead
of scanning the whole GT per row, with a directed caseColumn probe so a far
scenario column is never missed. Behavior-preserving, except that the row probe
stops after 256 consecutive non-numeric columns (LABEL_PROBE_MAX_GAP).

Findings (now in ROADMAP): the cluster is 17/21 sheets + redundantly evaluated;
the 190 MB PP&E sheet exceeds the 150 MB limit; _computed-values.json is a
byte-identical GT copy (not an accuracy source).

gitignore: engines/ (already), benchmarks/results/, _eval_tmp/.

Full `npm test` (incl. new per-sheet-eval guard) + smoke green.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .gitignore                        |   8 ++
 CHANGELOG.md                      |  49 +++++++++
 PLAN.md                           |  24 +++++
 ROADMAP.md                        |  33 ++++--
 benchmarks/BASELINE.md            |  24 +++++
 benchmarks/outpost-bench.mjs      | 172 ++++++++++++++++++++++++++++++
 eval/per-sheet-eval.mjs           |  21 +++-
 lib/manifest.mjs                  |  41 +++++--
 package.json                      |   3 +-
 tests/cli/test-per-sheet-eval.mjs |  74 +++++++++++++
 10 files changed, 426 insertions(+), 23 deletions(-)
 create mode 100644 benchmarks/BASELINE.md
 create mode 100644 benchmarks/outpost-bench.mjs
 create mode 100644 tests/cli/test-per-sheet-eval.mjs

diff --git a/.gitignore b/.gitignore
index 8c6cf07..5735539 100644
--- a/.gitignore
+++ b/.gitignore
@@ -17,6 +17,14 @@ tests/eval-results.json
 # covered by *.xlsx above; this also excludes the parsed engine output dirs.
 engines/
 
+# Benchmark run detail (per-sheet/per-cell results derived from the real models
+# — may contain real values/labels). Only the aggregate-only benchmarks/BASELINE.md
+# is committed; raw per-run detail stays local.
+benchmarks/results/
+
+# per-sheet-eval scratch dir (transient child-process scripts + per-sheet GT)
+_eval_tmp/
+
 # Transient test artifacts (scenario save/load test writes here on every run)
 tests/cli/fixtures/scenarios/
 
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0bffafc..903078c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,54 @@
 # excel-to-engine — Changelog
 
+## 2026-05-28 — Outpost accuracy benchmark + eval-tooling fixes
+
+Stood up a repeatable accuracy + efficacy benchmark over the real ~200 MB
+Outpost models so improvements can be tracked over time, and fixed the eval
+tooling that was silently broken on them.
+
+### Benchmark (`benchmarks/outpost-bench.mjs`, `npm run bench:outpost`)
+
+- Wraps `eval/per-sheet-eval.mjs` (live engine-vs-ground-truth) for every model
+  under a root dir; reports overall accuracy, per-sheet pass/skip counts, and
+  timings. **Aggregate-only** results go to the committed `benchmarks/BASELINE.md`;
+  full per-sheet detail stays in the gitignored `benchmarks/results/`. No cell
+  value or label is ever committed.
+- **Baseline (2026-05-28):** outpost-a1 **84.3%**, outpost-a2 **85.5%** on the
+  standalone sheets. (The 17-sheet circular cluster and the 190 MB PP&E sheet are
+  skipped for now — see below.)
+
+### per-sheet-eval fixes (it wasn't in CI, so these went unnoticed)
+
+- **Windows crash fixed.** The generated per-sheet wrapper imported each sheet's
+  `compute()` by a bare absolute path (`"C:\\..."`), which Node ESM rejects on
+  Windows — so *every* sheet "crashed" at load (0% accuracy) on Windows and on
+  the real engines. Now uses `pathToFileURL()`. New `tests/cli/test-per-sheet-eval.mjs`
+  (6) guards it; CI runs it on **windows-latest** too.
+- **`--skip-clusters`** flag: record circular-cluster sheets as skipped instead
+  of evaluating them. The current convergence path re-runs the *whole* cluster
+  once per member sheet (O(cluster²)), which is infeasible on big models; this
+  yields a fast, real number for the standalone sheets while the single-pass
+  orchestrator eval is built (ROADMAP).
+
+### searchByLabel: lazy numerics (query / carry)
+
+`searchByLabel` previously scanned the entire ground truth once per matched row
+to collect adjacent numerics. It now probes the row's columns on demand (same
+approach as the refiner), with a directed `caseColumn` lookup probing its exact
+cell so a far scenario column is never missed. Behavior-preserving unless a row
+has a run of 256+ non-numeric columns (query/carry/ai-interface suites green).
+
+### Findings that scope the accuracy-blocker work
+
+- The 190 MB PP&E sheet exceeds the 150 MB per-sheet limit → **large-sheet eval**
+  blocker confirmed.
+- The circular cluster is **17 of 21 sheets** and is evaluated redundantly
+  (once per member) → the concrete reason behind "circular-cluster won't
+  evaluate." Single-pass orchestrator eval is the fix.
+- `_computed-values.json` in these engines is **byte-identical to ground truth**
+  (a seeded copy), so it is not a valid accuracy source — accuracy must come from
+  live recompute.
+
 ## 2026-05-28 — `init` parses the ground truth once (shared across the pipeline)
 
 The real driver behind the "~2.5 min" refine loop wasn't one command — it was
diff --git a/PLAN.md b/PLAN.md
index 8d36e2a..f16b337 100644
--- a/PLAN.md
+++ b/PLAN.md
@@ -1,5 +1,29 @@
 # excel-to-engine — Plan
 
+## Status: Outpost accuracy benchmark + eval fixes — in progress 2026-05-28
+
+Standing up the multi-wave "next wave" effort on `feat/next-wave`, keystone
+first: a repeatable accuracy + efficacy benchmark over the real Outpost models
+(`benchmarks/outpost-bench.mjs` → `benchmarks/BASELINE.md`, aggregate-only).
+
+**Baseline:** outpost-a1 84.3%, outpost-a2 85.5% on standalone sheets; the
+17-sheet circular cluster and the 190 MB PP&E sheet are skipped pending deeper
+fixes. Landed alongside: a **Windows crash fix** in `per-sheet-eval` (bare
+absolute ESM import → `pathToFileURL`; it had silently zeroed accuracy on
+Windows/real engines and wasn't in CI — now guarded by `test-per-sheet-eval`),
+a `--skip-clusters` flag, and the **searchByLabel lazy-numerics** wave
+(query/carry stop scanning the full GT for adjacent values).
+
+**Wave status (this branch):**
+- ✅ Keystone benchmark + baseline; ✅ searchByLabel (query/carry).
+- 🔜 Accuracy blockers — now precisely diagnosed: single-pass orchestrator eval
+  for the 17-sheet cluster (it's re-run once per member today), large-sheet eval
+  (190 MB PP&E > 150 MB limit), array formulas (the Headcount sheet lives inside
+  the cluster). `_computed-values.json` is a GT copy — not an accuracy source.
+- 🔜 Manifest-pipeline perf (generate detectors / maps cell-types on ~6M cells).
+- 🔜 Polish→Publish (lib/ unit tests, npm publish prep, example project,
+  contributing guide).
+
 ## Status: single GT parse per init — landed 2026-05-28
 
 `ete init` now loads the ground truth once and shares the parsed object across
diff --git a/ROADMAP.md b/ROADMAP.md
index c6786bb..86be414 100644
--- a/ROADMAP.md
+++ b/ROADMAP.md
@@ -96,8 +96,10 @@ when we next touch the monitor server or auth surface.
     ~70 KB) but extracting it cheaply would couple the parser to refine's metric
     vocabulary. Not worth it on these models; revisit only if a genuinely
     giant-grid model (mostly unlabeled numeric grids) shows up.
-  - **Still open:** apply the same lazy-numerics path to `searchByLabel`
-    (`query` / `carry`) so they stop scanning the GT for adjacent values.
+  - **Done (2026-05-28):** applied the same lazy-numerics path to `searchByLabel`
+    (`query` / `carry`) — probes the matched row's columns instead of scanning
+    the whole GT, with a directed `caseColumn` probe so a far scenario column is
+    never missed.
 - Manifest migration tooling for model updates (vN → vN+1 shape diff).
 
 ---
@@ -147,14 +149,29 @@ when we next touch the monitor server or auth surface.
 - **Pref compounding for long holds** — 12-year 8% compound pref = 2.52x hurdle, which exceeds many MOIC targets. Need to detect when models use quarterly cash flow waterfalls vs bullet maturity and adjust accordingly.
 
 ### Eval System
-- Increase blind eval question diversity (computed questions, cross-sheet aggregations)
-- Add time-period-aware questions ("What was X in Q3 2025?")
-- Profile and optimize per-sheet eval for sheets >150MB
+- **Done (2026-05-28):** repeatable accuracy + efficacy benchmark over the real
+  Outpost models — `benchmarks/outpost-bench.mjs` → `benchmarks/BASELINE.md`
+  (aggregate-only). Baseline: a1 84.3%, a2 85.5% on standalone sheets. Also
+  **fixed a Windows crash** in `per-sheet-eval` (bare absolute ESM import →
+  `pathToFileURL`; it had zeroed accuracy on Windows/real engines and wasn't in
+  CI — now guarded by `test-per-sheet-eval`, run on windows-latest).
+- **Large-sheet eval (190 MB PP&E):** confirmed it exceeds the 150 MB per-sheet
+  limit and is skipped. Needs streaming/sharded per-sheet eval or a higher limit
+  with chunked compute. The standalone sheets at ~85% also need attention (array
+  formulas / wide-sheet disambiguation) — visible now that the eval runs.
+- Increase blind eval question diversity; add time-period-aware questions.
 
 ### Convergence Loop Accuracy
-- The 62-sheet circular cluster in the large model is the biggest accuracy blocker
-- Investigate running eval through the orchestrator (not per-sheet isolation) for circular sheets
-- Consider lazy subgraph evaluation (only compute transitive closure of target cells)
+- **Diagnosed (2026-05-28):** on the real models the circular cluster is **17 of
+  21 sheets**, and `per-sheet-eval` re-runs the *entire* cluster convergence once
+  per member sheet (O(cluster²)) — that's why clustered big models "won't
+  evaluate." The array-formula Headcount sheet lives inside this cluster, so it's
+  unmeasurable until this is fixed. `--skip-clusters` skips them for now.
+- **Fix:** single-pass orchestrator eval — converge the cluster once, then score
+  every member sheet from that converged state (then drop `--skip-clusters` from
+  the benchmark). Also scope the convergence diff to written cells (it currently
+  diffs all ~6M seeded cells per iteration).
+- Consider lazy subgraph evaluation (only compute transitive closure of targets).
 
 ## Near-Term
 
diff --git a/benchmarks/BASELINE.md b/benchmarks/BASELINE.md
new file mode 100644
index 0000000..e2a72bd
--- /dev/null
+++ b/benchmarks/BASELINE.md
@@ -0,0 +1,24 @@
+# Outpost benchmark — baseline & history
+
+Real accuracy: each standalone sheet recomputed live vs ground truth via
+`eval/per-sheet-eval.mjs` (numbers within 1% rel. tol, strings exact).
+Circular-cluster sheets and oversized sheets are **skipped** for now (see
+the Skipped column + blockers below) pending the single-pass orchestrator
+eval; run with `--with-clusters` once that lands. Aggregate-only — no cell
+values or full sheet inventory. Regenerate:
+`node benchmarks/outpost-bench.mjs --root <engines>`. Full per-sheet detail
+lands in the gitignored `benchmarks/results/`.
+
+_Last run: baseline-2026-05-28_
+
+| Model | Accuracy | Cells matched | Sheets ≥95% | Skipped | Eval time | GT |
+|-------|---------:|------:|:-----------:|:-------:|----------:|---:|
+| outpost-a1 | 84.33% | 1491/1768 | 1/3 | 17 | 45s | 201.5 MB |
+| outpost-a2 | 85.54% | 1686/1971 | 2/4 | 17 | 48s | 211 MB |
+
+## Known blocker categories
+
+Tracked by name because PLAN.md already calls them out; values are accuracy %, not financials.
+
+- **outpost-a1**: 1/3 sheets clean; blockers: Owned Asset PP&E (skipped: module too large (190MB > 150MB limit)); Headcount (skipped: circular cluster (--skip-clusters; needs single-pass orchestrator eval))
+- **outpost-a2**: 2/4 sheets clean; blockers: Owned Asset PP&E (skipped: module too large (190MB > 150MB limit)); Headcount (skipped: circular cluster (--skip-clusters; needs single-pass orchestrator eval))
diff --git a/benchmarks/outpost-bench.mjs b/benchmarks/outpost-bench.mjs
new file mode 100644
index 0000000..23f8187
--- /dev/null
+++ b/benchmarks/outpost-bench.mjs
@@ -0,0 +1,172 @@
+#!/usr/bin/env node
+/**
+ * Outpost benchmark — repeatable accuracy + efficacy tracking over the real
+ * models, so we can see whether each improvement actually moves the needle.
+ *
+ * It wraps `eval/per-sheet-eval.mjs` (which runs each sheet module live against
+ * ground truth, converges circular clusters, and skips/handles oversized
+ * sheets) for every model under a root dir, then aggregates. Accuracy is real
+ * (engine recompute vs ground truth, 1% rel. tol / exact strings) — NOT the
+ * `_computed-values.json` snapshot, which is a copy of ground truth (trivially
+ * 100%), nor the full-engine `run()`, which is infeasible on these models (the
+ * 190 MB PP&E sheet).
+ *
+ * Privacy: the real models are proprietary (gitignored). Full per-sheet detail
+ * (incl. per-cell failures) stays in the gitignored `benchmarks/results/`. Only
+ * AGGREGATE, non-identifying metrics — overall accuracy, sheet pass/skip counts,
+ * timings, and the already-public blocker categories (PP&E, Headcount) — go to
+ * the committed `benchmarks/BASELINE.md`. No cell value or label is ever
+ * printed or committed.
+ *
+ * Usage:
+ *   node benchmarks/outpost-bench.mjs [--root <dir>] [--concurrency 3] [--stamp <label>] [--with-clusters]
+ *
+ * Run it after any change that could affect accuracy or pipeline speed, then
+ * diff benchmarks/BASELINE.md to see the delta.
+ */
+
+import { readFileSync, writeFileSync, existsSync, statSync, readdirSync, mkdirSync } from 'fs';
+import { join, resolve, dirname } from 'path';
+import { fileURLToPath } from 'url';
+import { execFile } from 'child_process';
+import { promisify } from 'util';
+
+const execFileAsync = promisify(execFile);
+const __dirname = dirname(fileURLToPath(import.meta.url));
+const REPO_ROOT = resolve(__dirname, '..');
+const EVAL_SCRIPT = join(REPO_ROOT, 'eval', 'per-sheet-eval.mjs');
+const RESULTS_DIR = join(__dirname, 'results');
+
+function flag(name, fallback) {
+  const i = process.argv.indexOf(`--${name}`);
+  return i >= 0 && process.argv[i + 1] ? process.argv[i + 1] : fallback;
+}
+
+const ROOT = resolve(flag('root', join(REPO_ROOT, 'engines')));
+const CONCURRENCY = flag('concurrency', '3');
+const STAMP = flag('stamp', new Date().toISOString().replace(/[:.]/g, '-'));
+
+function discoverModels(root) {
+  if (!existsSync(root)) return [];
+  const out = [];
+  for (const name of readdirSync(root)) {
+    const chunked = join(root, name, 'chunked');
+    if (existsSync(join(chunked, 'engine.js')) && existsSync(join(chunked, '_ground-truth.json'))) {
+      out.push({ name, chunked });
+    }
+  }
+  return out.sort((a, b) => a.name.localeCompare(b.name));
+}
+
+async function benchModel(model) {
+  mkdirSync(RESULTS_DIR, { recursive: true });
+  const reportPath = join(RESULTS_DIR, `persheet-${model.name}-${STAMP}.json`);
+  const gtSizeMB = +(statSync(join(model.chunked, '_ground-truth.json')).size / 1e6).toFixed(1);
+
+  const t = Date.now();
+  let evalError = null;
+  try {
+    // --skip-clusters by default: circular-cluster sheets need the single-pass
+    // orchestrator eval (a follow-up); evaluating them per-sheet is infeasible on
+    // big models. Pass --with-clusters once that lands. Standalone sheets give a
+    // fast, real accuracy number; skipped sheets are reported with their reason.
+    const evalArgs = ['--max-old-space-size=8192', EVAL_SCRIPT, model.chunked, '--output', reportPath, '--concurrency', String(CONCURRENCY)];
+    if (!process.argv.includes('--with-clusters')) evalArgs.push('--skip-clusters');
+    await execFileAsync('node', evalArgs, { maxBuffer: 64 * 1024 * 1024 });
+  } catch (e) {
+    evalError = (e.stderr || e.message || String(e)).slice(0, 200);
+  }
+  const wallMs = Date.now() - t;
+
+  if (!existsSync(reportPath)) {
+    return { name: model.name, evalError: evalError || 'no report produced', efficacy: { wallMs, gtSizeMB } };
+  }
+  const report = JSON.parse(readFileSync(reportPath, 'utf-8'));
+  // Strip topFailures (real cell values) from anything we keep in memory for the
+  // committed summary; the gitignored report file retains full detail.
+  const sheets = (report.sheets || [])
+    .map(s => ({ name: s.name, status: s.status, accuracy: s.accuracy, correct: s.correct, total: s.total }))
+    .sort((a, b) => a.accuracy - b.accuracy);
+
+  return {
+    name: model.name,
+    evalError,
+    summary: report.summary,
+    skipped: report.skipped || [],
+    sheets,
+    efficacy: { wallMs, gtSizeMB },
+  };
+}
+
+// ── Run ──────────────────────────────────────────────────────────────────────
+const models = discoverModels(ROOT);
+if (models.length === 0) {
+  console.error(`No models found under ${ROOT} (need <model>/chunked/engine.js + _ground-truth.json).`);
+  console.error('Point --root at a dir of parsed engines (the real Outpost engines live in the gitignored engines/).');
+  process.exit(2);
+}
+
+console.log(`Outpost benchmark — ${models.length} model(s) under ${ROOT}\n`);
+const results = [];
+for (const m of models) {
+  process.stdout.write(`  ${m.name} ... `);
+  const r = await benchModel(m);
+  results.push(r);
+  if (r.evalError && !r.summary) console.log(`eval FAILED (${r.evalError})`);
+  else {
+    const s = r.summary;
+    console.log(`acc ${s.overallAccuracy}%  (${s.totalCellsCorrect}/${s.totalCellsTested})  ` +
+      `${s.sheetsPassing}/${s.sheetsEvaluated} sheets ≥95%, ${s.sheetsSkipped} skipped  [${(r.efficacy.wallMs / 1000).toFixed(0)}s]`);
+  }
+}
+
+mkdirSync(RESULTS_DIR, { recursive: true });
+const detailPath = join(RESULTS_DIR, `summary-${STAMP}.json`);
+writeFileSync(detailPath, JSON.stringify({ stamp: STAMP, root: ROOT, results }, null, 2));
+console.log(`\nDetail (gitignored): ${detailPath}`);
+
+// ── Committed aggregate (no values, no full sheet inventory) ──────────────────
+function renderBaseline(stamp, results) {
+  const L = [];
+  L.push('# Outpost benchmark — baseline & history');
+  L.push('');
+  L.push('Real accuracy: each standalone sheet recomputed live vs ground truth via');
+  L.push('`eval/per-sheet-eval.mjs` (numbers within 1% rel. tol, strings exact).');
+  L.push('Circular-cluster sheets and oversized sheets are **skipped** for now (see');
+  L.push('the Skipped column + blockers below) pending the single-pass orchestrator');
+  L.push('eval; run with `--with-clusters` once that lands. Aggregate-only — no cell');
+  L.push('values or full sheet inventory. Regenerate:');
+  L.push('`node benchmarks/outpost-bench.mjs --root <engines>`. Full per-sheet detail');
+  L.push('lands in the gitignored `benchmarks/results/`.');
+  L.push('');
+  L.push(`_Last run: ${stamp}_`);
+  L.push('');
+  L.push('| Model | Accuracy | Cells matched | Sheets ≥95% | Skipped | Eval time | GT |');
+  L.push('|-------|---------:|------:|:-----------:|:-------:|----------:|---:|');
+  for (const r of results) {
+    if (!r.summary) { L.push(`| ${r.name} | eval failed | — | — | — | — | ${r.efficacy.gtSizeMB} MB |`); continue; }
+    const s = r.summary;
+    L.push(`| ${r.name} | ${s.overallAccuracy}% | ${s.totalCellsCorrect}/${s.totalCellsTested} | ` +
+      `${s.sheetsPassing}/${s.sheetsEvaluated} | ${s.sheetsSkipped} | ${(r.efficacy.wallMs / 1000).toFixed(0)}s | ${r.efficacy.gtSizeMB} MB |`);
+  }
+  L.push('');
+  L.push('## Known blocker categories');
+  L.push('');
+  L.push('Tracked by name because PLAN.md already calls them out; values are accuracy %, not financials.');
+  L.push('');
+  const PUBLIC = [/pp&?e/i, /headcount/i];
+  for (const r of results) {
+    if (!r.summary) { L.push(`- **${r.name}**: eval failed`); continue; }
+    const blockers = [];
+    for (const sk of r.skipped) if (PUBLIC.some(re => re.test(sk.name))) blockers.push(`${sk.name} (skipped: ${sk.reason})`);
+    for (const sh of r.sheets) if (PUBLIC.some(re => re.test(sh.name))) blockers.push(`${sh.name} ${sh.accuracy}%`);
+    const lowest = r.sheets.filter(s => s.status !== 'ok' || s.accuracy < 95).length;
+    L.push(`- **${r.name}**: ${r.summary.sheetsEvaluated - lowest}/${r.summary.sheetsEvaluated} sheets clean; ` +
+      `blockers: ${blockers.join('; ') || 'none surfaced'}`);
+  }
+  L.push('');
+  return L.join('\n');
+}
+const baselinePath = join(__dirname, 'BASELINE.md');
+writeFileSync(baselinePath, renderBaseline(STAMP, results));
+console.log(`Baseline (committed): ${baselinePath}`);
diff --git a/eval/per-sheet-eval.mjs b/eval/per-sheet-eval.mjs
index e3339f3..6111cce 100644
--- a/eval/per-sheet-eval.mjs
+++ b/eval/per-sheet-eval.mjs
@@ -16,7 +16,7 @@
 import { readFile, writeFile, mkdir, stat, readdir, unlink } from 'fs/promises';
 import { existsSync } from 'fs';
 import { join, resolve, basename, dirname } from 'path';
-import { fileURLToPath } from 'url';
+import { fileURLToPath, pathToFileURL } from 'url';
 import { execFile } from 'child_process';
 import { promisify } from 'util';
 
@@ -36,6 +36,12 @@ function getFlag(name, fallback) {
 const OUTPUT_FILE = getFlag('output', join(chunkedDir, '..', 'per-sheet-report.json'));
 const CONCURRENCY = parseInt(getFlag('concurrency', process.env.EVAL_CONCURRENCY || '6'));
 const SAMPLE_SIZE = parseInt(getFlag('sample', process.env.SAMPLE_SIZE || '2000'));
+// --skip-clusters: record circular-cluster sheets as skipped instead of
+// evaluating them. The current convergence path re-runs the whole cluster once
+// per member sheet (O(cluster²) work), which is infeasible on big models; this
+// flag yields a fast, real accuracy number for the standalone sheets while the
+// single-pass orchestrator eval is built. See ROADMAP (circular-cluster eval).
+const SKIP_CLUSTERS = args.includes('--skip-clusters');
 const NODE_HEAP_MB = parseInt(process.env.NODE_HEAP_MB || '8192');
 const MAX_SHEET_SIZE_MB = parseInt(process.env.MAX_SHEET_SIZE_MB || '150');
 
@@ -139,6 +145,11 @@ async function main() {
       continue;
     }
 
+    if (SKIP_CLUSTERS && clusterSheetSet.has(entry.name)) {
+      skipped.push({ name: entry.name, reason: 'circular cluster (--skip-clusters; needs single-pass orchestrator eval)' });
+      continue;
+    }
+
     // Sample ground truth if sheet has too many entries
     let sampleGt = entry.gt;
     if (entry.totalCount > SAMPLE_SIZE) {
@@ -185,15 +196,15 @@ async function main() {
     const cluster = sheetClusters.find(c => c.includes(sheetName));
     const clusterModules = cluster ? cluster.map(s => {
       const san = s.replace(/[^a-zA-Z0-9]/g, '_');
-      const modPath = join(sheetsDir, `${san}.mjs`).replace(/\\/g, '/');
+      const modPath = join(sheetsDir, `${san}.mjs`);
       return { name: s, sanitized: san, path: modPath };
-    }).filter(m => existsSync(join(sheetsDir, `${m.sanitized}.mjs`))) : [];
+    }).filter(m => existsSync(m.path)) : [];
 
     // Build a child process script that loads the sheet module(s) and compares.
     // Paths flow into JS source — interpolate them as JSON-quoted strings so a
     // path containing `'` or `\` can't break out and inject code.
     const clusterImports = clusterModules.length > 0
-      ? clusterModules.map(m => `import { compute as compute_${m.sanitized} } from ${JSON.stringify(m.path)};`).join('\n')
+      ? clusterModules.map(m => `import { compute as compute_${m.sanitized} } from ${JSON.stringify(pathToFileURL(m.path).href)};`).join('\n')
       : '';
     const clusterComputeBlock = clusterModules.length > 0
       ? `
@@ -226,7 +237,7 @@ async function main() {
 
     const evalScript = `
 import { readFile } from 'fs/promises';
-import { compute } from ${JSON.stringify(modulePath.replace(/\\/g, '/'))};
+import { compute } from ${JSON.stringify(pathToFileURL(modulePath).href)};
 ${clusterImports}
 
 const allGt = JSON.parse(await readFile(${JSON.stringify(gtFullPath.replace(/\\/g, '/'))}, 'utf8'));
diff --git a/lib/manifest.mjs b/lib/manifest.mjs
index e234e8f..119f1d1 100644
--- a/lib/manifest.mjs
+++ b/lib/manifest.mjs
@@ -374,6 +374,19 @@ export function loadLabelIndex(modelDir) {
   return null;
 }
 
+// Column-probe bounds for numericsForRow (label search). Excel's hard ceiling
+// is XFD (16384); we stop after a long run of empty columns so a label-only row
+// costs a few hundred O(1) lookups instead of a full ground-truth scan.
+const LABEL_PROBE_MAX_COL = 16384;
+const LABEL_PROBE_MAX_GAP = 256;
+
+// 1 → "A", 26 → "Z", 27 → "AA". Inverse of the col parsing in buildLabelIndex.
+function numToColLetters(num) {
+  let col = '';
+  while (num > 0) { const r = (num - 1) % 26; col = String.fromCharCode(65 + r) + col; num = Math.floor((num - 1) / 26); }
+  return col;
+}
+
 /**
  * Search ground truth for cells matching a label pattern.
  * Returns matching labels with adjacent numeric values.
@@ -439,20 +452,24 @@ export function searchByLabel(gt, pattern, options = {}) {
   }
 
   // For each candidate, collect adjacent numeric values on the same row.
-  // Index adjacent numerics by (sheet, row) for efficient lookup on repeated rows.
+  // Probe the row's columns on demand (memoized) rather than scanning the whole
+  // ground truth once per row — the same approach the refiner uses. On a 200 MB
+  // ground truth a full scan per matched row is ~tens of ms each; probing the
+  // contiguous numeric block is effectively free. Stops after a long run of
+  // empty columns (LABEL_PROBE_MAX_GAP). A directed caseColumn lookup (below)
+  // probes its exact cell separately, so a far scenario column is never missed.
   const numByRow = new Map();
   function numericsForRow(sheet, row) {
     const key = `${sheet}!${row}`;
     if (numByRow.has(key)) return numByRow.get(key);
     const vals = [];
-    const prefix = sheet + '!';
-    for (const [addr, v] of Object.entries(gt)) {
-      if (typeof v !== 'number') continue;
-      if (!addr.startsWith(prefix)) continue;
-      const cellPart = addr.substring(prefix.length);
-      const m = cellPart.match(/^([A-Z]+)(\d+)$/);
-      if (!m || parseInt(m[2], 10) !== row) continue;
-      vals.push({ col: m[1], addr, value: v });
+    let gap = 0;
+    for (let c = 1; c <= LABEL_PROBE_MAX_COL && gap < LABEL_PROBE_MAX_GAP; c++) {
+      const col = numToColLetters(c);
+      const addr = `${sheet}!${col}${row}`;
+      const v = gt[addr];
+      if (typeof v === 'number') { vals.push({ col, addr, value: v }); gap = 0; }
+      else gap++;
     }
     numByRow.set(key, vals);
     return vals;
@@ -465,6 +482,12 @@ export function searchByLabel(gt, pattern, options = {}) {
       const target = String(caseColumn).toUpperCase();
       const hit = adjacentValues.find(v => v.col === target);
       if (hit) caseValue = hit.value;
+      else {
+        // Directed lookup: probe the exact cell so a scenario column beyond the
+        // adjacent-block probe window is still resolved.
+        const direct = gt[`${c.sheet}!${target}${c.row}`];
+        if (typeof direct === 'number') caseValue = direct;
+      }
       adjacentValues = adjacentValues.slice().sort((a, b) => {
         if (a.col === target) return -1;
         if (b.col === target) return 1;
diff --git a/package.json b/package.json
index 4e00df7..164bc49 100644
--- a/package.json
+++ b/package.json
@@ -41,7 +41,8 @@
     "test:engine": "node pipelines/rust/tests/test-engine-runtime.mjs",
     "test:depgraph": "node pipelines/rust/tests/test-dependency-graph.mjs",
     "test:slimming": "node tests/cli/test-artifact-slimming.mjs",
-    "test": "node tests/cli/test-cli.mjs && node tests/cli/test-manifest-improvements.mjs && node tests/cli/test-manifest-maps.mjs && node tests/cli/test-refine-label-index.mjs && node tests/cli/test-init-shared-gt.mjs && node tests/cli/test-ai-interface.mjs && node tests/cli/test-e2e4-fixes.mjs && node tests/cli/test-ship-ready.mjs && node tests/cli/use-case-suite.mjs"
+    "test": "node tests/cli/test-cli.mjs && node tests/cli/test-manifest-improvements.mjs && node tests/cli/test-manifest-maps.mjs && node tests/cli/test-refine-label-index.mjs && node tests/cli/test-init-shared-gt.mjs && node tests/cli/test-per-sheet-eval.mjs && node tests/cli/test-ai-interface.mjs && node tests/cli/test-e2e4-fixes.mjs && node tests/cli/test-ship-ready.mjs && node tests/cli/use-case-suite.mjs",
+    "bench:outpost": "node benchmarks/outpost-bench.mjs"
   },
   "devDependencies": {}
 }
diff --git a/tests/cli/test-per-sheet-eval.mjs b/tests/cli/test-per-sheet-eval.mjs
new file mode 100644
index 0000000..5aa108a
--- /dev/null
+++ b/tests/cli/test-per-sheet-eval.mjs
@@ -0,0 +1,74 @@
+#!/usr/bin/env node
+/**
+ * Guard test for eval/per-sheet-eval.mjs.
+ *
+ * per-sheet-eval generates a temp wrapper module per sheet that imports the
+ * sheet's compute() by absolute path. On Windows, ESM rejects a bare absolute
+ * path ("C:\..."); it must be a file:// URL. That bug made EVERY sheet "crash"
+ * at load (0% accuracy) on Windows and on the real engines — and per-sheet-eval
+ * wasn't in CI, so it went unnoticed. This test runs the eval on the committed
+ * smoke engine and asserts no sheet crashed and accuracy is the known-good 100%,
+ * so the regression can't come back (CI runs it on ubuntu AND windows).
+ *
+ * Also exercises --skip-clusters (used by the Outpost benchmark).
+ *
+ * Pure JS; uses the committed smoke chunked fixture (no parser needed).
+ */
+
+import { execFileSync } from 'child_process';
+import { mkdtempSync, cpSync, rmSync, existsSync, readFileSync } from 'fs';
+import { join, dirname } from 'path';
+import { tmpdir } from 'os';
+import { fileURLToPath } from 'url';
+
+const __dir = dirname(fileURLToPath(import.meta.url));
+const ROOT = join(__dir, '..', '..');
+const SMOKE = join(ROOT, 'pipelines', 'rust', 'tests', 'output', 'chunked');
+const EVAL = join(ROOT, 'eval', 'per-sheet-eval.mjs');
+
+let passed = 0, failed = 0;
+const assert = (c, m) => { if (c) passed++; else { failed++; console.error(`  FAIL: ${m}`); } };
+
+if (!existsSync(join(SMOKE, 'engine.js')) || !existsSync(join(SMOKE, '_graph.json'))) {
+  console.log('SKIP: smoke chunked fixture not found');
+  process.exit(0);
+}
+
+// Copy the fixture to a temp dir so per-sheet-eval's _eval_tmp scratch never
+// touches the tracked fixture.
+const tmp = mkdtempSync(join(tmpdir(), 'pse-'));
+const chunked = join(tmp, 'chunked');
+cpSync(SMOKE, chunked, { recursive: true });
+
+// Run per-sheet-eval on the temp fixture copy. A nonzero exit is tolerated (the report is inspected instead); returns the parsed report, or null if none was written.
+function runEval(extraArgs) {
+  const out = join(tmp, `report-${extraArgs.join('') || 'base'}.json`);
+  try { // process.execPath = the Node binary running this test, so the child never depends on a PATH lookup
+    execFileSync(process.execPath, [EVAL, chunked, '--output', out, ...extraArgs], { encoding: 'utf-8', stdio: 'pipe', maxBuffer: 32 * 1024 * 1024 });
+  } catch (err) { if (!existsSync(out)) console.error(`  eval produced no report: ${err.stderr || err.message}`); }
+  return existsSync(out) ? JSON.parse(readFileSync(out, 'utf-8')) : null;
+}
+
+console.log('Testing: per-sheet-eval runs on the smoke engine (guards the Windows ESM-import fix)');
+{
+  const r = runEval([]);
+  assert(r !== null, 'report written');
+  if (r) {
+    assert(r.summary.sheetsEvaluated >= 3, `evaluated ≥3 sheets (got ${r.summary.sheetsEvaluated})`);
+    assert(r.summary.sheetsWithErrors === 0,
+      `no sheet crashed at import (errors: ${r.summary.sheetsWithErrors}) — guards the abs-path → file:// fix`);
+    assert(r.summary.overallAccuracy === 100, `smoke accuracy 100% (got ${r.summary.overallAccuracy})`);
+  }
+}
+
+console.log('Testing: --skip-clusters produces a report without error');
+{
+  const r = runEval(['--skip-clusters']);
+  assert(r !== null, '--skip-clusters report written');
+  if (r) assert(typeof r.summary.overallAccuracy === 'number', 'summary present with --skip-clusters');
+}
+
+rmSync(tmp, { recursive: true, force: true });
+console.log('');
+console.log(`Results: ${passed} passed, ${failed} failed, ${passed + failed} total`);
+process.exit(failed > 0 ? 1 : 0);

From e3629905ad7ed7bf4943b643e01d0fbf7c146180 Mon Sep 17 00:00:00 2001
From: Eric Boothe <ericboothe@gmail.com>
Date: Thu, 28 May 2026 17:31:12 -0600
Subject: [PATCH 2/8] test(lib): known-answer unit tests for
 irr/waterfall/calibration/sensitivity

The shared financial libs had no direct coverage. Add tests/lib/test-lib.mjs
(43 assertions), wired into `npm test` (runs first):

- irr.mjs: NPV identities; IRR of classic series (-100->+150=50%, -1000 then
  200x8 ~= 11.89%, 3y bullet); Newton == bisection; NPV(IRR) ~= 0; null on no
  sign change; XIRR on dated flows.
- waterfall.mjs: American 80/20 + 8% pref + catch-up (LP/GP splits, carry %),
  no-catch-up, loss case, flat-MOIC-hurdle promote (+ hold-period invariance),
  European builder; LP+GP = distributed conservation across structures.
- calibration.mjs: nested get/set; validateOutputs pass/fail + suggested factor.
- sensitivity.mjs: flattenOutputs group/type filtering.

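Polish->Publish: first item (lib/ unit tests) landed; CI guards them now.
Still open (see ROADMAP): the calibrate convergence loop, sensitivity surface
extraction + elasticity/breakpoints, and excel-parser fingerprinting.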

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 CHANGELOG.md           |  17 ++++++
 PLAN.md                |   3 +-
 ROADMAP.md             |  11 ++--
 package.json           |   2 +-
 tests/lib/test-lib.mjs | 136 +++++++++++++++++++++++++++++++++++++++++
 5 files changed, 163 insertions(+), 6 deletions(-)
 create mode 100644 tests/lib/test-lib.mjs

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 903078c..fd35ccf 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,22 @@
 # excel-to-engine — Changelog
 
+## 2026-05-28 — Unit tests for lib/ (Polish→Publish)
+
+The shared financial libraries had no direct coverage. Added
+`tests/lib/test-lib.mjs` (43 known-answer assertions), wired into `npm test`
+(runs first) so CI guards them on every push:
+
+- **`lib/irr.mjs`** — NPV/NPV-derivative identities; IRR of classic cash-flow
+  series (−100→+150 = 50%, −1000 then 200×8 ≈ 11.89%, 3-year bullet); Newton ≡
+  bisection agreement; NPV(IRR) ≈ 0; null on no-sign-change; XIRR on dated flows.
+- **`lib/waterfall.mjs`** — American 80/20 + 8% pref + catch-up (LP/GP splits,
+  carry %), no-catch-up variant, loss case (no carry), the flat-MOIC-hurdle
+  promote (incl. the hold-period-independence invariant), European builder; the
+  LP+GP = distributed conservation invariant across structures.
+- **`lib/calibration.mjs`** — nested get/set; `validateOutputs` pass/fail +
+  suggested corrective factor.
+- **`lib/sensitivity.mjs`** — `flattenOutputs` group/type filtering.
+
 ## 2026-05-28 — Outpost accuracy benchmark + eval-tooling fixes
 
 Stood up a repeatable accuracy + efficacy benchmark over the real ~200 MB
diff --git a/PLAN.md b/PLAN.md
index f16b337..54e470f 100644
--- a/PLAN.md
+++ b/PLAN.md
@@ -315,7 +315,8 @@ excel-to-engine/
 - [ ] Wide sheet column disambiguation for blind eval
 
 ## Next Phase — Polish + Publish
-- [ ] Unit tests for all lib/ modules
+- [x] Unit tests for all lib/ modules — `tests/lib/test-lib.mjs` (43: irr,
+      waterfall, calibration, sensitivity), in `npm test`/CI (2026-05-28)
 - [x] GitHub Actions CI — `.github/workflows/ci.yml` (ubuntu + windows; Rust
       build/tests + JS suite + smoke/depgraph/engine/slimming), landed 2026-05-28
 - [ ] npm publish preparation
diff --git a/ROADMAP.md b/ROADMAP.md
index 86be414..b8788ce 100644
--- a/ROADMAP.md
+++ b/ROADMAP.md
@@ -176,10 +176,13 @@ when we next touch the monitor server or auth surface.
 ## Near-Term
 
 ### Unit Test Suite
-- Tests for `lib/irr.mjs` with known IRR cases
-- Tests for `lib/waterfall.mjs` with standard structures
-- Tests for `lib/calibration.mjs` convergence and edge cases
-- Tests for `lib/excel-parser.mjs` fingerprinting with synthetic workbooks
+- **Done (2026-05-28):** `tests/lib/test-lib.mjs` (43) — `lib/irr.mjs` (known
+  IRR/NPV/XIRR cases), `lib/waterfall.mjs` (American/European/MOIC-hurdle +
+  conservation invariant), `lib/calibration.mjs` (nested get/set, validate),
+  `lib/sensitivity.mjs` (flattenOutputs). In `npm test` / CI.
+- Still open: `lib/calibration.mjs` convergence/edge cases (calibrate loop),
+  `lib/sensitivity.mjs` surface extraction + elasticity/breakpoints, and
+  `lib/excel-parser.mjs` fingerprinting with synthetic workbooks.
 
 ### CI Pipeline
 - **Done (2026-05-28):** `.github/workflows/ci.yml` — on push/PR to `main`,
diff --git a/package.json b/package.json
index 164bc49..4e87a25 100644
--- a/package.json
+++ b/package.json
@@ -41,7 +41,7 @@
     "test:engine": "node pipelines/rust/tests/test-engine-runtime.mjs",
     "test:depgraph": "node pipelines/rust/tests/test-dependency-graph.mjs",
     "test:slimming": "node tests/cli/test-artifact-slimming.mjs",
-    "test": "node tests/cli/test-cli.mjs && node tests/cli/test-manifest-improvements.mjs && node tests/cli/test-manifest-maps.mjs && node tests/cli/test-refine-label-index.mjs && node tests/cli/test-init-shared-gt.mjs && node tests/cli/test-per-sheet-eval.mjs && node tests/cli/test-ai-interface.mjs && node tests/cli/test-e2e4-fixes.mjs && node tests/cli/test-ship-ready.mjs && node tests/cli/use-case-suite.mjs",
+    "test": "node tests/lib/test-lib.mjs && node tests/cli/test-cli.mjs && node tests/cli/test-manifest-improvements.mjs && node tests/cli/test-manifest-maps.mjs && node tests/cli/test-refine-label-index.mjs && node tests/cli/test-init-shared-gt.mjs && node tests/cli/test-per-sheet-eval.mjs && node tests/cli/test-ai-interface.mjs && node tests/cli/test-e2e4-fixes.mjs && node tests/cli/test-ship-ready.mjs && node tests/cli/use-case-suite.mjs",
     "bench:outpost": "node benchmarks/outpost-bench.mjs"
   },
   "devDependencies": {}
diff --git a/tests/lib/test-lib.mjs b/tests/lib/test-lib.mjs
new file mode 100644
index 0000000..72f2e0a
--- /dev/null
+++ b/tests/lib/test-lib.mjs
@@ -0,0 +1,136 @@
+#!/usr/bin/env node
+/**
+ * Unit tests for the shared financial libraries in lib/ — known-answer cases for
+ * the pure math (IRR/NPV/XIRR, the PE distribution waterfall) plus the
+ * calibration + sensitivity helpers. These had no direct coverage; CI now guards
+ * them on every push.
+ *
+ * Usage: node tests/lib/test-lib.mjs
+ */
+
+import { npv, npvDerivative, computeIRR, computeIRRBisection, computeXIRR } from '../../lib/irr.mjs';
+import {
+  computeWaterfall, createAmericanWaterfall, createEuropeanWaterfall, createMoicHurdleWaterfall,
+} from '../../lib/waterfall.mjs';
+import { getNestedValue, setNestedValue, validateOutputs } from '../../lib/calibration.mjs';
+import { flattenOutputs } from '../../lib/sensitivity.mjs';
+
+let passed = 0, failed = 0;
+function assert(cond, msg) { if (cond) { passed++; } else { failed++; console.error(`  FAIL: ${msg}`); } }
+function near(a, b, tol, msg) { assert(typeof a === 'number' && Math.abs(a - b) <= tol, `${msg} (got ${a}, want ≈${b} ±${tol})`); }
+
+// ── IRR / NPV / XIRR ─────────────────────────────────────────────────────────
+console.log('Testing: lib/irr.mjs');
+{
+  near(npv([-100, 110], 0.10), 0, 1e-9, 'npv zero at the break-even rate');
+  near(npv([100], 0.5), 100, 1e-12, 'npv of a single t=0 flow is itself');
+  near(npv([0, 100], 0), 100, 1e-12, 'npv at rate 0 is the undiscounted sum');
+  assert(npvDerivative([-100, 110], 0.10) < 0, 'npv decreases as rate rises (negative derivative)');
+
+  near(computeIRR([-100, 150]), 0.50, 1e-4, 'IRR of -100 → +150 is 50%');
+  near(computeIRR([-100, 110]), 0.10, 1e-4, 'IRR of -100 → +110 is 10%');
+  near(computeIRR([-1000, 200, 200, 200, 200, 200, 200, 200, 200]), 0.1189, 1e-3, 'IRR of -1000 then 200×8 ≈ 11.89%');
+  near(computeIRR([-1000, 0, 0, 1100]), 0.03228, 1e-3, 'IRR of -1000 → +1100 in 3y ≈ 3.23%');
+  // NPV(IRR) ≈ 0 sanity for a found rate
+  const r = computeIRR([-500, 100, 200, 300]);
+  near(npv([-500, 100, 200, 300], r), 0, 1e-5, 'NPV at the solved IRR is ~0');
+
+  assert(computeIRR([10, 20]) === null, 'no sign change → null IRR');
+  assert(computeIRR([-10]) === null, 'single flow → null IRR');
+
+  near(computeIRRBisection([-100, 150]), 0.50, 1e-4, 'bisection agrees: 50%');
+  near(computeIRRBisection([-1000, 200, 200, 200, 200, 200, 200, 200, 200]), 0.1189, 1e-3, 'bisection agrees: ~11.89%');
+
+  const xirr = computeXIRR([
+    { date: new Date('2020-01-01'), amount: -1000 },
+    { date: new Date('2021-01-01'), amount: 1100 },
+  ]);
+  near(xirr, 0.10, 2e-3, 'XIRR of -1000 → +1100 one year later ≈ 10%');
+}
+
+// ── Waterfall ────────────────────────────────────────────────────────────────
+console.log('Testing: lib/waterfall.mjs');
+{
+  // Standard American 80/20, 8% pref, full catch-up. 200M proceeds on 100M
+  // equity, 1y hold (simple pref = 100M × 8% = 8M).
+  const american = createAmericanWaterfall({ prefReturn: 0.08, carryPercent: 0.20, residualLPSplit: 0.80, hasCatchup: true });
+  assert(american.length === 4, `American (with catch-up) has 4 tiers (got ${american.length})`);
+  const w = computeWaterfall(200_000_000, 100_000_000, american, { holdPeriodYears: 1 });
+  near(w.totalDistributed, 200_000_000, 1, 'all proceeds distributed');
+  near(w.gpTotal, 34_400_000, 1, 'GP carry = catch-up 20M + residual 14.4M = 34.4M');
+  near(w.lpTotal, 165_600_000, 1, 'LP = ROC 100M + pref 8M + residual 57.6M = 165.6M');
+  near(w.lpTotal + w.gpTotal, w.totalDistributed, 1, 'conservation: LP + GP = distributed');
+  near(w.gpCarryPercent, 0.344, 1e-4, 'GP carry % of profit = 34.4%');
+  near(w.undistributed, 0, 1, 'nothing left undistributed');
+
+  // No catch-up variant has one fewer tier and less GP.
+  const noCatchup = createAmericanWaterfall({ prefReturn: 0.08, carryPercent: 0.20, residualLPSplit: 0.80, hasCatchup: false });
+  assert(noCatchup.length === 3, `American (no catch-up) has 3 tiers (got ${noCatchup.length})`);
+  const w2 = computeWaterfall(200_000_000, 100_000_000, noCatchup, { holdPeriodYears: 1 });
+  assert(w2.gpTotal < w.gpTotal, 'no-catch-up GP carry is lower than with catch-up');
+  near(w2.lpTotal + w2.gpTotal, w2.totalDistributed, 1, 'conservation holds (no catch-up)');
+
+  // Loss case: proceeds below equity → no carry, LP gets everything available.
+  const loss = computeWaterfall(80_000_000, 100_000_000, american, { holdPeriodYears: 1 });
+  near(loss.gpTotal, 0, 1, 'no GP carry on a loss');
+  near(loss.lpTotal, 80_000_000, 1, 'LP receives all proceeds on a loss');
+  near(loss.gpCarryPercent, 0, 1e-9, 'GP carry % is 0 when there is no profit');
+
+  // Flat MOIC hurdle (no IRR pref): 1.40x hurdle, 20% promote, 2.0x MOIC.
+  const moic = createMoicHurdleWaterfall({ hurdleMOIC: 1.40, carryPercent: 0.20 });
+  const w3 = computeWaterfall(200_000_000, 100_000_000, moic);
+  near(w3.gpTotal, 12_000_000, 1, 'MOIC-hurdle GP = 20% × (200M − 140M) = 12M');
+  near(w3.lpTotal, 188_000_000, 1, 'MOIC-hurdle LP = 188M');
+  // The flat MOIC hurdle must NOT move with hold period (documented invariant).
+  const w3long = computeWaterfall(200_000_000, 100_000_000, moic, { holdPeriodYears: 10 });
+  near(w3long.gpTotal, w3.gpTotal, 1, 'flat MOIC hurdle is hold-period-independent');
+
+  // European builder produces a usable, ordered tier set.
+  const euro = createEuropeanWaterfall([
+    { hurdle: 0.08, carry: 0.00 }, { hurdle: 0.12, carry: 0.20 }, { hurdle: Infinity, carry: 0.30 },
+  ]);
+  assert(euro[0].type === 'return_of_capital', 'European waterfall starts with return of capital');
+  const w4 = computeWaterfall(150_000_000, 100_000_000, euro, { holdPeriodYears: 1 });
+  near(w4.lpTotal + w4.gpTotal, w4.totalDistributed, 1, 'conservation holds (European)');
+}
+
+// ── Calibration helpers ──────────────────────────────────────────────────────
+console.log('Testing: lib/calibration.mjs');
+{
+  assert(getNestedValue({ a: { b: { c: 42 } } }, 'a.b.c') === 42, 'getNestedValue reads a deep path');
+  assert(getNestedValue({ a: {} }, 'a.b.c') === undefined, 'getNestedValue returns undefined for a missing path');
+
+  const obj = {};
+  setNestedValue(obj, 'x.y.z', 5);
+  assert(obj.x && obj.x.y && obj.x.y.z === 5, 'setNestedValue creates intermediate objects');
+
+  const v = validateOutputs(
+    { returns: { moic: 2.00, irr: 0.20 } },
+    [{ key: 'returns.moic', excelValue: 2.00 }, { key: 'returns.irr', excelValue: 0.25 }],
+    { tolerance: 0.01 },
+  );
+  assert(v.totalCount === 2, 'validateOutputs reports total count');
+  assert(v.passCount === 1 && v.failCount === 1, 'validateOutputs: moic passes, irr (0.20 vs 0.25) fails at 1% tol');
+  assert(v.allPassed === false, 'validateOutputs.allPassed false when any fails');
+  const irrRes = v.results.find(r => r.key === 'returns.irr');
+  near(irrRes.suggestedFactor, 0.25 / 0.20, 1e-9, 'validateOutputs suggests the corrective factor');
+}
+
+// ── Sensitivity helpers ──────────────────────────────────────────────────────
+console.log('Testing: lib/sensitivity.mjs');
+{
+  const flat = flattenOutputs({
+    returns: { moic: 2.0, irr: 0.2, label: 'skip-strings' },
+    waterfall: { gpCarry: 1_000_000 },
+    ignoredGroup: { x: 99 },
+  });
+  assert(flat['returns.moic'] === 2.0 && flat['returns.irr'] === 0.2, 'flattenOutputs flattens numeric outputs');
+  assert(flat['waterfall.gpCarry'] === 1_000_000, 'flattenOutputs includes the waterfall group');
+  assert(!('returns.label' in flat), 'flattenOutputs drops non-numeric values');
+  assert(!('ignoredGroup.x' in flat), 'flattenOutputs only includes known output groups');
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+console.log('');
+console.log(`Results: ${passed} passed, ${failed} failed, ${passed + failed} total`);
+process.exit(failed > 0 ? 1 : 0);

From 729a4d46aac6731cb3f42c23cf556ff0839f417d Mon Sep 17 00:00:00 2001
From: Eric Boothe <ericboothe@gmail.com>
Date: Thu, 28 May 2026 17:43:03 -0600
Subject: [PATCH 3/8] perf(eval): scope cluster convergence diff to written
 cells + first cluster test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The cluster convergence loop in per-sheet-eval checked for a fixed point by
diffing EVERY cell in the context each iteration — and the context is seeded
with the full (multi-million-cell) ground truth, so it was O(all cells) × up to
200 iters. Now it tracks the cells compute() writes (ctx._written) and diffs
only those (the cluster's own outputs). Behavior-preserving.

Adds the first circular-cluster coverage: tests/cli/fixtures/cluster-model/ (a
synthetic SheetA<->SheetB model converging to a=50,b=50,c=100,d=100) and a case
in test-per-sheet-eval that runs it through the convergence loop, asserting 100%.

Measured on the real model: the scoped diff alone is NOT enough — per-sheet-eval
re-runs the whole cluster convergence once per member sheet (17x), and engine
inaccuracies keep some clusters from converging, so they run to the
200-iteration cap. The remaining key fix is single-pass orchestrator eval
(converge once, score all members); the new fixture is the ready-made test
oracle. The benchmark keeps using --skip-clusters until then.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 CHANGELOG.md                                  | 26 +++++++++++++++++++
 ROADMAP.md                                    | 13 +++++++---
 eval/per-sheet-eval.mjs                       | 26 ++++++++++---------
 .../cluster-model/chunked/_graph.json         |  9 +++++++
 .../cluster-model/chunked/_ground-truth.json  |  1 +
 .../cluster-model/chunked/sheets/SheetA.mjs   |  8 ++++++
 .../cluster-model/chunked/sheets/SheetB.mjs   |  7 +++++
 tests/cli/test-per-sheet-eval.mjs             | 26 +++++++++++++++++++
 8 files changed, 100 insertions(+), 16 deletions(-)
 create mode 100644 tests/cli/fixtures/cluster-model/chunked/_graph.json
 create mode 100644 tests/cli/fixtures/cluster-model/chunked/_ground-truth.json
 create mode 100644 tests/cli/fixtures/cluster-model/chunked/sheets/SheetA.mjs
 create mode 100644 tests/cli/fixtures/cluster-model/chunked/sheets/SheetB.mjs

diff --git a/CHANGELOG.md b/CHANGELOG.md
index fd35ccf..2c3865d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,31 @@
 # excel-to-engine — Changelog
 
+## 2026-05-28 — Circular-cluster eval: scoped convergence diff + first cluster test
+
+Progress on the circular-cluster accuracy blocker (the 17-of-21-sheet cluster on
+the real models that wouldn't evaluate).
+
+- **Scoped convergence diff.** The cluster convergence loop in
+  `per-sheet-eval.mjs` checked for a fixed point by diffing **every** cell in the
+  context each iteration — and the context is seeded with the full (multi-million-
+  cell) ground truth, so that was O(all cells) × up to 200 iterations. It now
+  tracks the cells `compute()` actually writes (`ctx._written`) and diffs only
+  those (the cluster's own outputs). Behavior-preserving; large constant-factor
+  win on big clusters.
+- **First circular-cluster test + fixture.** `tests/cli/fixtures/cluster-model/`
+  is a synthetic 2-sheet circular model (SheetA ↔ SheetB, converges to
+  a=50,b=50,c=100,d=100). `tests/cli/test-per-sheet-eval.mjs` now evaluates it
+  through the convergence loop and asserts 100% — the cluster path had no
+  coverage before, and this guards the scoped-diff change.
+
+**Still the key fix (cluster-once):** measured on the real model, scoped-diff
+alone is *not* enough — `per-sheet-eval` re-runs the entire cluster convergence
+**once per member sheet** (17×), and engine inaccuracies keep some clusters from
+converging (200 iters). The remaining work is single-pass orchestrator eval:
+converge the cluster once, then score every member from that converged state
+(one task per cluster, not per sheet). The fixture above is the ready-made test
+oracle. Until then the benchmark runs with `--skip-clusters`.
+
 ## 2026-05-28 — Unit tests for lib/ (Polish→Publish)
 
 The shared financial libraries had no direct coverage. Added
diff --git a/ROADMAP.md b/ROADMAP.md
index b8788ce..0b6a6d7 100644
--- a/ROADMAP.md
+++ b/ROADMAP.md
@@ -167,10 +167,15 @@ when we next touch the monitor server or auth surface.
   per member sheet (O(cluster²)) — that's why clustered big models "won't
   evaluate." The array-formula Headcount sheet lives inside this cluster, so it's
   unmeasurable until this is fixed. `--skip-clusters` skips them for now.
-- **Fix:** single-pass orchestrator eval — converge the cluster once, then score
-  every member sheet from that converged state (then drop `--skip-clusters` from
-  the benchmark). Also scope the convergence diff to written cells (it currently
-  diffs all ~6M seeded cells per iteration).
+- **Done (2026-05-28):** scoped the convergence diff to written cells
+  (`ctx._written`) instead of all ~6M seeded cells per iteration. Added a
+  synthetic 2-sheet circular fixture (`tests/cli/fixtures/cluster-model/`) + the
+  first cluster test. Measured: scoped-diff alone is **not** enough — the 17×
+  per-member redundancy dominates.
+- **Remaining key fix (cluster-once):** single-pass orchestrator eval — converge
+  the cluster once, then score every member from that converged state (one task
+  per cluster, not per sheet); then drop `--skip-clusters` from the benchmark.
+  The cluster fixture is the ready test oracle.
 - Consider lazy subgraph evaluation (only compute transitive closure of targets).
 
 ## Near-Term
diff --git a/eval/per-sheet-eval.mjs b/eval/per-sheet-eval.mjs
index 6111cce..3ee442c 100644
--- a/eval/per-sheet-eval.mjs
+++ b/eval/per-sheet-eval.mjs
@@ -212,21 +212,22 @@ async function main() {
   const clusterFns = [${clusterModules.map(m => `compute_${m.sanitized}`).join(', ')}];
   const MAX_ITER = 200;
   const TOL = 1e-6;
-  let prevSnapshot = {};
+  const prevSnapshot = {};
   for (let _ci = 0; _ci < MAX_ITER; _ci++) {
     for (const fn of clusterFns) fn(ctx);
-    // Check convergence on numeric values
+    // Convergence is about the cluster's *computed* cells stabilizing, so diff
+    // only the cells the cluster wrote (ctx._written) — not every seeded
+    // ground-truth cell. On a model with millions of seeded cells the old
+    // O(all-cells)-per-iteration diff was a dominant cost (and × 200 iters).
     let maxDelta = 0;
-    const snapshot = {};
-    for (const [k, v] of Object.entries(ctx.values)) {
-      if (typeof v === 'number') {
-        snapshot[k] = v;
-        const prev = prevSnapshot[k] || 0;
-        const d = Math.abs(v - prev);
-        if (d > maxDelta) maxDelta = d;
-      }
+    for (const k of ctx._written) {
+      const v = ctx.values[k];
+      if (typeof v !== 'number') continue;
+      const prev = prevSnapshot[k] || 0;
+      const d = Math.abs(v - prev);
+      if (d > maxDelta) maxDelta = d;
+      prevSnapshot[k] = v;
     }
-    prevSnapshot = snapshot;
     if (_ci > 0 && maxDelta < TOL) break;
   }
 `
@@ -247,8 +248,9 @@ const cn = s => { let n=0; for(const c of s) n = n*26+c.charCodeAt(0)-64; return
 const nc = n => { let s=''; while(n>0){n--;s=String.fromCharCode(65+(n%26))+s;n=Math.floor(n/26);} return s; };
 const ctx = {
   values: {},
+  _written: new Set(),  // cells written by compute() — the cluster convergence diffs only these
   get(addr) { return this.values[addr] !== undefined ? this.values[addr] : 0; },
-  set(addr, value) { this.values[addr] = value; },
+  set(addr, value) { this.values[addr] = value; this._written.add(addr); },
   _parseRange(rangeStr) {
     const m = rangeStr.match(/^(.+)!([A-Z]+)(\\d+):([A-Z]+)(\\d+)$/);
     if (!m) return null;
diff --git a/tests/cli/fixtures/cluster-model/chunked/_graph.json b/tests/cli/fixtures/cluster-model/chunked/_graph.json
new file mode 100644
index 0000000..c2082a3
--- /dev/null
+++ b/tests/cli/fixtures/cluster-model/chunked/_graph.json
@@ -0,0 +1,9 @@
+{
+  "topoOrder": ["SheetA", "SheetB"],
+  "sheetClusters": [["SheetA", "SheetB"]],
+  "edges": {
+    "SheetA!b": ["SheetB!d"],
+    "SheetB!c": ["SheetA!a", "SheetA!b"],
+    "SheetB!d": ["SheetB!c"]
+  }
+}
diff --git a/tests/cli/fixtures/cluster-model/chunked/_ground-truth.json b/tests/cli/fixtures/cluster-model/chunked/_ground-truth.json
new file mode 100644
index 0000000..14ab3d7
--- /dev/null
+++ b/tests/cli/fixtures/cluster-model/chunked/_ground-truth.json
@@ -0,0 +1 @@
+{"SheetA!a":50,"SheetA!b":50,"SheetB!c":100,"SheetB!d":100}
diff --git a/tests/cli/fixtures/cluster-model/chunked/sheets/SheetA.mjs b/tests/cli/fixtures/cluster-model/chunked/sheets/SheetA.mjs
new file mode 100644
index 0000000..97c20a8
--- /dev/null
+++ b/tests/cli/fixtures/cluster-model/chunked/sheets/SheetA.mjs
@@ -0,0 +1,8 @@
+// Synthetic circular-cluster fixture (SheetA ↔ SheetB) for per-sheet-eval tests.
+// SheetA reads SheetB!d; SheetB reads SheetA. Converges to a=50, b=50, c=100, d=100.
+export const SHEET_NAME = 'SheetA';
+export const SHEET_DEPENDENCIES = ['SheetB'];
+export function compute(ctx) {
+  ctx.set('SheetA!a', 50);                       // constant
+  ctx.set('SheetA!b', ctx.get('SheetB!d') / 2);  // reads across the cluster
+}
diff --git a/tests/cli/fixtures/cluster-model/chunked/sheets/SheetB.mjs b/tests/cli/fixtures/cluster-model/chunked/sheets/SheetB.mjs
new file mode 100644
index 0000000..576c849
--- /dev/null
+++ b/tests/cli/fixtures/cluster-model/chunked/sheets/SheetB.mjs
@@ -0,0 +1,7 @@
+// Synthetic circular-cluster fixture (SheetA ↔ SheetB) for per-sheet-eval tests.
+export const SHEET_NAME = 'SheetB';
+export const SHEET_DEPENDENCIES = ['SheetA'];
+export function compute(ctx) {
+  ctx.set('SheetB!c', ctx.get('SheetA!a') + ctx.get('SheetA!b')); // reads across the cluster
+  ctx.set('SheetB!d', ctx.get('SheetB!c'));
+}
diff --git a/tests/cli/test-per-sheet-eval.mjs b/tests/cli/test-per-sheet-eval.mjs
index 5aa108a..073df45 100644
--- a/tests/cli/test-per-sheet-eval.mjs
+++ b/tests/cli/test-per-sheet-eval.mjs
@@ -68,6 +68,32 @@ console.log('Testing: --skip-clusters produces a report without error');
   if (r) assert(typeof r.summary.overallAccuracy === 'number', 'summary present with --skip-clusters');
 }
 
+console.log('Testing: circular-cluster convergence (synthetic SheetA↔SheetB fixture)');
+{ // Evaluate a temp copy of the synthetic 2-sheet circular fixture and check the report (skipped if the fixture is absent).
+  const CLUSTER = join(ROOT, 'tests', 'cli', 'fixtures', 'cluster-model', 'chunked');
+  if (existsSync(join(CLUSTER, '_graph.json'))) {
+    const ctmp = mkdtempSync(join(tmpdir(), 'pse-cl-'));
+    const cchunked = join(ctmp, 'chunked');
+    cpSync(CLUSTER, cchunked, { recursive: true });
+    const out = join(ctmp, 'report.json');
+    try { // process.execPath = the Node binary running this test, so the child never depends on a PATH lookup
+      execFileSync(process.execPath, [EVAL, cchunked, '--output', out], { encoding: 'utf-8', stdio: 'pipe', maxBuffer: 32 * 1024 * 1024 });
+    } catch (err) { if (!existsSync(out)) console.error(`  eval produced no report: ${err.stderr || err.message}`); }
+    const r = existsSync(out) ? JSON.parse(readFileSync(out, 'utf-8')) : null;
+    assert(r !== null, 'cluster report written');
+    if (r) {
+      assert(r.summary.sheetsEvaluated === 2, `both cluster sheets evaluated (got ${r.summary.sheetsEvaluated})`);
+      assert(r.summary.sheetsWithErrors === 0, `cluster converged without error (errors: ${r.summary.sheetsWithErrors})`);
+      // Converges to a=50,b=50,c=100,d=100 — exercises the convergence loop and
+      // the scoped (written-cells-only) convergence diff.
+      assert(r.summary.overallAccuracy === 100, `cluster fixture 100% via convergence (got ${r.summary.overallAccuracy})`);
+    }
+    rmSync(ctmp, { recursive: true, force: true });
+  } else {
+    console.log('  (skip: cluster fixture missing)');
+  }
+}
+
 rmSync(tmp, { recursive: true, force: true });
 console.log('');
 console.log(`Results: ${passed} passed, ${failed} failed, ${passed + failed} total`);

From 20acc204b1a2643bb7ca785c8ebac9ce1dda37ce Mon Sep 17 00:00:00 2001
From: Eric Boothe <ericboothe@gmail.com>
Date: Thu, 28 May 2026 19:57:09 -0600
Subject: [PATCH 4/8] docs(roadmap): capture Outpost regeneration findings from
 the Mippy consumer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The downstream Mippy agent regenerated both Outpost engines from main and
reported back. New ROADMAP section "Now — Outpost regeneration findings":

Confirmed wins vs the old build — dates fixed (old leaked ExcelDateTime debug
strings, 2,686 in A-1; new = serial numbers, 0 leaks), ~42-45% smaller
(model-map.json + GT-copy _computed-values.json gone), contract maps emitted,
circular refs converge, and a golden-master PASS (regenerated GT reproduces the
hand-port's canonical A-1 returns to full float precision, Version Tracker row 22).

New follow-ups: generation robustness on big models (dep-graph OOM + init 10-min
timeout, issue #23); --output-profile to scope artifacts (#22); 11,813 _fn()
unsupported-function fallbacks per engine (a transpiler-coverage accuracy
suspect, added to Transpiler Coverage); the refiner mis-mapping returns to a
"UW Comparison" tab (needs canonical-returns-tab recognition); an empty
named-inputs.json when no formula-referenced defined-names exist; and
MIP-as-output (#7). Also noted: a ready-made golden-master CI assert (diff the
committed named-outputs.baseCaseValue).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 CHANGELOG.md | 18 ++++++++++++++++
 ROADMAP.md   | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 76 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2c3865d..a7a075f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,23 @@
 # excel-to-engine — Changelog
 
+## 2026-05-28 — Roadmap: Outpost regeneration findings (Mippy consumer)
+
+The downstream Mippy agent regenerated both Outpost engines from `main` and
+reported back. Captured the findings in ROADMAP.md ("Now — Outpost regeneration
+findings"). Confirmed wins vs the old build: **dates fixed** (old leaked
+`ExcelDateTime { … }` debug strings — 2,686 in A-1; new emits serial numbers, 0
+leaks), **~42–45% smaller** (model-map.json + the GT-copy `_computed-values.json`
+gone), contract maps emitted, circular refs converge, and a **golden-master PASS**
+— the regenerated ground truth reproduces the hand-port's canonical A-1 returns
+to full float precision (Version Tracker row 22). New follow-ups: generation
+robustness on big models (dep-graph OOM + `init` 10-min timeout — issue #23),
+`--output-profile` to scope artifacts (#22), the **11,813 `_fn()` unsupported-
+function fallbacks** per engine (transpiler-coverage accuracy suspect), the
+refiner mis-mapping returns to a "UW Comparison" tab, empty `named-inputs.json`
+when no formula-referenced defined-names exist, and MIP-as-output (#7). A
+ready-made golden-master CI assert (diff committed `named-outputs.baseCaseValue`)
+is noted.
+
 ## 2026-05-28 — Circular-cluster eval: scoped convergence diff + first cluster test
 
 Progress on the circular-cluster accuracy blocker (the 17-of-21-sheet cluster on
diff --git a/ROADMAP.md b/ROADMAP.md
index 0b6a6d7..6053cb9 100644
--- a/ROADMAP.md
+++ b/ROADMAP.md
@@ -42,6 +42,59 @@ RPC service. Round 1 split into a low-risk JS half (done) and a Rust half
   (engine faithfully reproduces an Excel base case of 0). Surface via a
   `requiredFor` field if/when named-inputs gains one.
 
+## Now — Outpost regeneration findings (Mippy consumer, 2026-05-28)
+
+The downstream Mippy agent regenerated both Outpost engines from `main` (current
+excel-to-engine) → `engines/outpost-a{1,2}-v2/`, old build left alongside.
+Confirmed the new build is clearly better and surfaced concrete follow-ups.
+Issues filed: [#22] (output scoping) and [#23] (parser/emitter perf).
+
+**Confirmed better than the old (pre-our-work) build:**
+- **Dates fixed.** Old leaked Rust debug strings (`ExcelDateTime { value: 45960.0,
+  … }` — 2,686 in A-1, breaking date math); new emits serial numbers, 0 leaks.
+- **~42–45% smaller** (~1.9–2.0 GB → ~1.1 GB): `model-map.json` (606 MB) +
+  `_computed-values.json` (192 MB) gone — the #8 slimming + dropping the GT-copy.
+- Semantic manifest + ADR-017 contract maps emitted; circular refs now run
+  per-cluster fixed-point loops.
+- **Golden master PASS.** Regenerated `_ground-truth.json` reproduces the
+  hand-port's canonical A-1 returns to full float precision (Version Tracker
+  row 22: grossMOIC L22 ≈2.34916, grossIRR M22 ≈0.19233, netMOIC T22 ≈2.23137,
+  netIRR U22 ≈0.18240). Pinning A-1's manifest to those cells makes
+  `named-outputs.baseCaseValue` a ready CI golden-master assert. **Do this:** add
+  a golden-master test that diffs the committed contract JSON's baseCaseValues.
+
+**Open follow-ups:**
+- **Generation robustness on big models ([#23]) — blocks a clean full build.**
+  Plain `ete init` hit its 10-min `spawnSync` cap, and the Rust parser was
+  OOM-killed at the cell-level dependency-graph step → `engine.js` (the `run()`
+  orchestrator) and `dependency-graph.json` closures **didn't land** (written
+  after the OOM step); regen needed direct-parse then `--reuse-parse`. Needs:
+  stream/incrementalize the dep-graph build (or raise its memory headroom),
+  within-sheet parallelism, streaming writes, and a higher/configurable init
+  timeout.
+- **`--output-profile` / guided `ete create` ([#22]).** Skip the ~752 MB
+  per-sheet engine emit when a consumer only needs ground truth + contract maps.
+- **Transpiler coverage — 11,813 `_fn()` fallbacks (unchanged old→new).** That
+  many formula cells still transpile to a generic unsupported-function stub — a
+  prime accuracy suspect once cluster eval makes per-sheet accuracy measurable.
+  Inventory the missing Excel functions and prioritize by frequency. (See
+  Transpiler Coverage below.)
+- **Refiner mis-maps returns to the "UW Comparison" tab.** Auto-manifest picked an
+  underwriting-comparison cell (2.305x) over the canonical Version Tracker returns
+  (2.349x) — `SUMMARY_SHEET_PATTERN` over-ranks "UW Comparison". The refiner
+  should recognize canonical returns / Version-Tracker tabs (or de-prioritize
+  underwriting-comparison tabs) so returns don't need manual per-model pinning.
+- **`named-inputs.json` empty** when a workbook exposes no formula-referenced
+  defined-names (the Outpost case) — ADR-019 ranged inputs can't be auto-derived;
+  needs a heuristic fallback or a documented manual-input path.
+- **MIP isn't a generated output (request #7).** The $51.8M is a hand-port
+  calibration, not a single GT cell — MIP is modeled across per-block "MIP
+  Proceeds" cells. Surface via a `requiredFor`/aggregate mapping, not a
+  single-cell expectation. (See the Round 2 MIP-gating note above.)
+
+[#22]: https://github.com/ebootheee/excel-to-engine/issues/22
+[#23]: https://github.com/ebootheee/excel-to-engine/issues/23
+
 ## Now — Security Hardening Follow-ups (post-PR #13)
 
 Non-blocking items surfaced during the v0.2.0 security audit pass. Open
@@ -137,6 +190,11 @@ when we next touch the monitor server or auth surface.
 ## Ongoing — Accuracy Improvement + Production Learnings
 
 ### Transpiler Coverage
+- **Measured (Mippy regen, 2026-05-28): 11,813 `_fn()` unsupported-function
+  fallbacks per Outpost engine** — that many formula cells transpile to a generic
+  stub instead of real logic, a prime accuracy suspect. First step: inventory
+  which Excel functions hit the fallback and rank by frequency, then implement
+  the top offenders. (Was unchanged old→new, so it predates our work.)
 - Implement INDIRECT function (dynamic cell references)
 - Fix 2D range handling edge cases for very large sheets
 - Handle array formulas / CSE (Ctrl+Shift+Enter) patterns

From df478dcf75ee78f8f02672720faa53a429d36c56 Mon Sep 17 00:00:00 2001
From: Eric Boothe <ericboothe@gmail.com>
Date: Thu, 28 May 2026 20:05:04 -0600
Subject: [PATCH 5/8] =?UTF-8?q?docs:=20add=20HANDOFF.md=20=E2=80=94=20prio?=
 =?UTF-8?q?ritized=20next-session=20plan=20+=20state=20+=20gotchas?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fresh-agent entry point: priority-ordered backlog (P0 cluster-once eval ->
generation robustness #23 -> _fn() transpiler coverage -> refiner UW-Comparison
mis-map -> golden-master CI -> output-profile/large-sheet/pipeline-perf ->
Polish), current state (PR #21 open with the foundation; baseline a1 84.3% /
a2 85.5% standalone), run commands, and the gotchas. PLAN points to it.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 CHANGELOG.md |   9 ++++
 HANDOFF.md   | 139 +++++++++++++++++++++++++++++++++++++++++++++++++++
 PLAN.md      |   4 ++
 3 files changed, 152 insertions(+)
 create mode 100644 HANDOFF.md

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a7a075f..a3e7bcd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,14 @@
 # excel-to-engine — Changelog
 
+## 2026-05-28 — HANDOFF.md (fresh-agent entry point)
+
+Added `HANDOFF.md` — the prioritized next-session plan (P0 cluster-once eval →
+generation robustness #23 → `_fn()` transpiler coverage → refiner UW-Comparison
+fix → golden-master CI → output-profile/large-sheet/perf → Polish), with current
+state, run commands, and the gotchas (gitignored real models, the GT-copy
+`_computed-values.json`, the per-sheet-eval Windows fix, the bench
+`discoverModels` gate vs the `-v2` regen). PLAN points to it.
+
 ## 2026-05-28 — Roadmap: Outpost regeneration findings (Mippy consumer)
 
 The downstream Mippy agent regenerated both Outpost engines from `main` and
diff --git a/HANDOFF.md b/HANDOFF.md
new file mode 100644
index 0000000..620b3b7
--- /dev/null
+++ b/HANDOFF.md
@@ -0,0 +1,139 @@
+# HANDOFF — excel-to-engine next session
+
+Start-here doc for a fresh agent. Read this, then `ROADMAP.md` (full backlog),
+`PLAN.md` (status), `benchmarks/BASELINE.md` (accuracy numbers), and the two
+memory files (`project_outpost_models_shape`, `project_mippy_contract`).
+
+_Last updated: 2026-05-28._
+
+## Where things stand
+
+**Merged to `main` this session:** artifact slimming (#17), GitHub Actions CI
+(#18, ubuntu+windows), `refine` consumes `_labels.json` + lazy numerics (#19),
+single-GT-parse per `init` (#20).
+
+**Open PR — review/merge first:** **#21 `feat/next-wave`** (CI green). Contains
+the Outpost accuracy **benchmark + baseline**, a **per-sheet-eval Windows crash
+fix**, `searchByLabel` lazy numerics, **lib/ unit tests** (43), the
+**scoped cluster-convergence diff** + the first circular-cluster fixture/test,
+and the Mippy regeneration findings in ROADMAP. **If #21 isn't merged yet, branch
+off `feat/next-wave`; otherwise off `main`.**
+
+**Baseline (real models, `npm run bench:outpost`):** outpost-a1 **84.3%**,
+outpost-a2 **85.5%** — but **standalone sheets only**. The 17-sheet circular
+cluster and the 190 MB PP&E sheet are skipped, so ~80% of each model is currently
+unmeasured (see P0 below).
+
+## How to run
+
+```bash
+npm test                 # full JS suite (387 assertions): lib, cli, manifest, eval, etc.
+npm run smoke            # chunked-engine accuracy 78/78
+npm run bench:outpost --  --root "<abs path>/engines"   # accuracy + efficacy on the real models
+# per-sheet accuracy on one engine (skips clusters by default in the bench):
+node eval/per-sheet-eval.mjs <chunkedDir> --concurrency 3 [--skip-clusters]
+cd pipelines/rust && cargo build --release   # the parser (needed by smoke/slimming/bench)
+```
+
+The real Outpost models live in the **gitignored** `engines/` dir (proprietary —
+never commit values/labels). The Mippy agent's fresh regen is in
+`engines/outpost-a{1,2}-v2/` (the *better* build: dates fixed, slimmed); the old
+build is alongside in `engines/outpost-a{1,2}/`.
+
+## Prioritized backlog (do in this order)
+
+### P0 — Cluster-once eval (THE keystone) ★ highest impact
+Unblocks measuring ~80% of each model. The circular cluster is **17 of 21
+sheets** (and the array-formula `Headcount` sheet is *inside* it). Today
+`eval/per-sheet-eval.mjs` re-runs the **entire** cluster convergence once per
+member sheet (17×) → it won't finish on the real model, so the benchmark runs
+`--skip-clusters`. **Fix:** one task per cluster — converge once, score every
+member from that converged state — then drop `--skip-clusters` from the
+benchmark and re-baseline.
+- Files: `eval/per-sheet-eval.mjs` (task-building loop ~120-185, `evalOneSheet`
+  ~190-365, the cluster `evalScript` template ~230-320, aggregation ~375-390).
+- Test oracle READY: `tests/cli/fixtures/cluster-model/` (synthetic SheetA↔SheetB
+  converging to a=50,b=50,c=100,d=100). Add a cluster-once assertion to
+  `tests/cli/test-per-sheet-eval.mjs` (it already runs that fixture).
+- Validate: fixture 100% + smoke unaffected + then a real `--with-clusters` run
+  on `engines/outpost-a1-v2` completes in reasonable time and yields a cluster
+  accuracy. The scoped-diff (already landed) helps but is NOT sufficient alone.
+
+### P0/P1 — Generation robustness on big models (issue #23) — blocks clean builds
+A clean `ete init` on a real model **does not complete**: the Rust parser is
+OOM-killed at the cell-level dependency-graph step, and `ete init`'s 10-min
+`spawnSync` cap times out → `engine.js` (the `run()` orchestrator) and
+`dependency-graph.json` closures don't land (they're written after the OOM
+step); the Mippy regen worked around it with direct-parse + `--reuse-parse`.
+- Fix directions: stream/incrementalize the dep-graph build or raise its memory
+  headroom; within-sheet parallelism; streaming writes; configurable init
+  timeout. Mostly `pipelines/rust/` + `cli/commands/init.mjs` timeout.
+- Impact: until fixed, the downstream consumer can't get a clean full artifact
+  set (closures + orchestrator) from one command.
+
+### P1 — Transpiler coverage: 11,813 `_fn()` fallbacks ★ big accuracy lever
+That many formula cells per engine transpile to a generic unsupported-function
+stub (unchanged old→new, so it predates this work). Almost certainly a large
+slice of the ~15% standalone-sheet gap. **Inventory which Excel functions hit
+the `_fn()` fallback, rank by frequency, implement the top offenders.**
+- Files: `pipelines/rust/src/` (transpiler). Measure on `engines/*-v2`.
+
+### P1 — Refiner mis-maps returns to the "UW Comparison" tab ★ quick + concrete
+Auto-manifest picks an underwriting-comparison cell (2.305x) over the canonical
+Version Tracker returns (2.349x) because `SUMMARY_SHEET_PATTERN` over-ranks
+"UW Comparison" — forcing manual per-model pinning. Make the refiner recognize
+canonical returns / "Version Tracker" tabs, or de-prioritize
+underwriting-comparison tabs.
+- File: `cli/commands/manifest-refine.mjs` (`SUMMARY_SHEET_PATTERN` line ~24,
+  ranking in `searchForFieldIndexed`). Add/extend a manifest **invariant** so it
+  can't silently revert. Validate with `tests/cli/test-refine-label-index.mjs`.
+
+### P1 — Golden-master CI assert ★ near-free regression guard
+A-1's regenerated ground truth reproduces the hand-port's canonical returns to
+full float precision (Version Tracker row 22: grossMOIC L22 ≈2.34916, grossIRR
+M22 ≈0.19233, netMOIC T22 ≈2.23137, netIRR U22 ≈0.18240). The committed
+`named-outputs.json` `baseCaseValue`s (for a pinned A-1 manifest) make a ready
+golden-master. **Add a CI test that diffs those against the known values.** Note:
+the engine artifacts are gitignored; commit only the small contract JSON (or
+hard-code the canonical values in the test).
+
+### P2 — `--output-profile` / guided `ete create` (issue #22)
+Skip the ~752 MB per-sheet engine emit when a consumer only needs ground truth +
+contract maps. Scope artifacts to the actual need.
+
+### P2 — Large-sheet eval (190 MB PP&E)
+Exceeds the 150 MB per-sheet limit in `per-sheet-eval` → skipped. Needs
+streaming/sharded per-sheet eval or a higher limit with chunked compute.
+
+### P2 — Manifest-pipeline perf on ~6M-cell models
+`generate` detectors, `maps` cell-type pass, and `refine`'s `buildLabelIndex`
+fallback are O(N) on the full GT and slow. Profile + optimize. (Distinct from the
+Rust-side #23; this is the JS pipeline.)
+
+### P3 — Polish → Publish remainder
+lib/ unit tests done. Remaining: npm publish prep (`bin`, `files`, repo
+metadata), synthetic example project, contributing guide. (Arguably hold publish
+until accuracy blockers close.)
+
+### P3 — Lower priority / model-owner
+- `named-inputs.json` is empty when a workbook has no formula-referenced
+  defined-names (the Outpost case) — heuristic fallback or documented manual path.
+- MIP-as-output (request #7): modeled across per-block "MIP Proceeds" cells, not
+  a single GT cell — a model-owner question, surface via aggregate mapping.
+
+## Gotchas (will bite you)
+
+- **`engines/` is gitignored** (real financials). Read-only; report only
+  aggregate metrics. `_eval_tmp/` and `benchmarks/results/` are gitignored too.
+- **`_computed-values.json` in these engines is a byte-identical COPY of ground
+  truth** (seeded). It is NOT a valid accuracy source — accuracy must be live
+  recompute (per-sheet-eval). The benchmark already avoids it.
+- **per-sheet-eval was Windows-broken** (bare absolute ESM import). Fixed via
+  `pathToFileURL`; guarded by `tests/cli/test-per-sheet-eval.mjs` on windows CI.
+  Don't reintroduce bare absolute `import` paths.
+- **`benchmarks/outpost-bench.mjs` `discoverModels()` gates on `engine.js`** —
+  but the `-v2` regen dirs may LACK `engine.js` (OOM, see #23) while having
+  `_graph.json` + `sheets/` (what per-sheet-eval actually needs). If the bench
+  skips `-v2`, relax the gate to `_graph.json` + `sheets/`.
+- **CI runs ubuntu + windows.** Anything touching child-process paths or the
+  parser binary must work on both.
diff --git a/PLAN.md b/PLAN.md
index 54e470f..331f080 100644
--- a/PLAN.md
+++ b/PLAN.md
@@ -1,5 +1,9 @@
 # excel-to-engine — Plan
 
+> **Next session: start at [`HANDOFF.md`](HANDOFF.md)** — prioritized backlog
+> (P0 cluster-once eval → generation robustness → `_fn()` coverage → refiner
+> fix → golden-master CI → …), current state, run commands, and gotchas.
+
 ## Status: Outpost accuracy benchmark + eval fixes — in progress 2026-05-28
 
 Standing up the multi-wave "next wave" effort on `feat/next-wave`, keystone

From 7cb8dd5a4b115f1f4f5f2bfe3ab6a345666997b8 Mon Sep 17 00:00:00 2001
From: Eric Boothe <ericboothe@gmail.com>
Date: Thu, 28 May 2026 20:30:50 -0600
Subject: [PATCH 6/8] docs: reframe roadmap/handoff around the Mippy
 calibration-oracle feature set
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Amendment to make the e2e agent's job explicit: make the full model a reliable
calibration oracle — runnable, MIP coefficients exposed as named-outputs, no
stubbed value cells. New priority order (issues on ebootheee/excel-to-engine):

  P1 · #23 + #24  reliably emit a runnable engine.js (fix dep-graph OOM; fail
                  loud, never a partial artifact; lock layout + content hash)
  P2 · #25        pin value-bearing cells (MIP Proceeds, hurdle, participation%,
                  equity basis, valuation/shares) as named-outputs
  P2 · #26        emit _fn-fallbacks.json; assert no value cell uses a stub
  P3 · #22        output-cone scoping (nice-to-have)

Supporting (trustworthiness, off critical path): golden-master CI, refiner
UW-Comparison fix, deeper _fn coverage, cluster-once eval.

HANDOFF.md now leads with this; ROADMAP gets a "Now — Mippy calibration oracle"
section; project_mippy_contract memory updated.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 CHANGELOG.md |  19 +++++
 HANDOFF.md   | 197 ++++++++++++++++++++++++---------------------------
 ROADMAP.md   |  32 +++++++++
 3 files changed, 145 insertions(+), 103 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a3e7bcd..f4f92e1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,24 @@
 # excel-to-engine — Changelog
 
+## 2026-05-28 — Mippy calibration-oracle feature set (priority amendment)
+
+Refined the "fully ready for Mippy" target: the excel-to-engine (e2e) agent's
+job is to make the full model a **reliable calibration oracle** — runnable, MIP
+coefficients exposed as named-outputs, no stubbed value cells. Documented the
+priority order in ROADMAP ("Now — Mippy calibration oracle"), in HANDOFF.md,
+and in the `project_mippy_contract` memory:
+
+- **P1 · #23 + #24** — reliably emit a runnable `engine.js` (fix dep-graph OOM;
+  fail loud, never a partial artifact; lock layout + content hash).
+- **P2 · #25** — pin value-bearing cells (per-class MIP Proceeds, hurdle,
+  participation %, equity basis, valuation/shares) as named-outputs.
+- **P2 · #26** — emit `_fn-fallbacks.json`; assert no value cell uses an
+  unsupported-function stub.
+- **P3 · #22** — output-cone scoping (nice-to-have).
+
+Supporting/trustworthiness (off critical path): golden-master CI, refiner
+UW-Comparison fix, deeper `_fn` coverage, cluster-once eval.
+
 ## 2026-05-28 — HANDOFF.md (fresh-agent entry point)
 
 Added `HANDOFF.md` — the prioritized next-session plan (P0 cluster-once eval →
diff --git a/HANDOFF.md b/HANDOFF.md
index 620b3b7..06e3149 100644
--- a/HANDOFF.md
+++ b/HANDOFF.md
@@ -6,6 +6,13 @@ memory files (`project_outpost_models_shape`, `project_mippy_contract`).
 
 _Last updated: 2026-05-28._
 
+## The job, in one line
+
+**Make the full Outpost model a reliable Mippy calibration oracle: runnable,
+with the MIP coefficients exposed as named-outputs, and no stubbed value cells.**
+Everything Mippy-specific stays in Mippy — this repo just produces a trustworthy,
+sample-able engine + contract.
+
 ## Where things stand
 
 **Merged to `main` this session:** artifact slimming (#17), GitHub Actions CI
@@ -20,120 +27,104 @@ and the Mippy regeneration findings in ROADMAP. **If #21 isn't merged yet, branc
 off `feat/next-wave`; otherwise off `main`.**
 
 **Baseline (real models, `npm run bench:outpost`):** outpost-a1 **84.3%**,
-outpost-a2 **85.5%** — but **standalone sheets only**. The 17-sheet circular
-cluster and the 190 MB PP&E sheet are skipped, so ~80% of each model is currently
-unmeasured (see P0 below).
+outpost-a2 **85.5%** — standalone sheets only (cluster + 190 MB PP&E skipped).
 
 ## How to run
 
 ```bash
-npm test                 # full JS suite (387 assertions): lib, cli, manifest, eval, etc.
+npm test                 # full JS suite (387 assertions)
 npm run smoke            # chunked-engine accuracy 78/78
 npm run bench:outpost --  --root "<abs path>/engines"   # accuracy + efficacy on the real models
-# per-sheet accuracy on one engine (skips clusters by default in the bench):
 node eval/per-sheet-eval.mjs <chunkedDir> --concurrency 3 [--skip-clusters]
-cd pipelines/rust && cargo build --release   # the parser (needed by smoke/slimming/bench)
+cd pipelines/rust && cargo build --release   # the parser
 ```
 
-The real Outpost models live in the **gitignored** `engines/` dir (proprietary —
+Real Outpost models live in the **gitignored** `engines/` dir (proprietary —
 never commit values/labels). The Mippy agent's fresh regen is in
-`engines/outpost-a{1,2}-v2/` (the *better* build: dates fixed, slimmed); the old
-build is alongside in `engines/outpost-a{1,2}/`.
-
-## Prioritized backlog (do in this order)
-
-### P0 — Cluster-once eval (THE keystone) ★ highest impact
-Unblocks measuring ~80% of each model. The circular cluster is **17 of 21
-sheets** (and the array-formula `Headcount` sheet is *inside* it). Today
-`eval/per-sheet-eval.mjs` re-runs the **entire** cluster convergence once per
-member sheet (17×) → it won't finish on the real model, so the benchmark runs
-`--skip-clusters`. **Fix:** one task per cluster — converge once, score every
-member from that converged state — then drop `--skip-clusters` from the
-benchmark and re-baseline.
-- Files: `eval/per-sheet-eval.mjs` (task-building loop ~120-185, `evalOneSheet`
-  ~190-365, the cluster `evalScript` template ~230-320, aggregation ~375-390).
-- Test oracle READY: `tests/cli/fixtures/cluster-model/` (synthetic SheetA↔SheetB
-  converging to a=50,b=50,c=100,d=100). Add a cluster-once assertion to
-  `tests/cli/test-per-sheet-eval.mjs` (it already runs that fixture).
-- Validate: fixture 100% + smoke unaffected + then a real `--with-clusters` run
-  on `engines/outpost-a1-v2` completes in reasonable time and yields a cluster
-  accuracy. The scoped-diff (already landed) helps but is NOT sufficient alone.
-
-### P0/P1 — Generation robustness on big models (issue #23) — blocks clean builds
-A clean `ete init` on a real model **does not complete**: the Rust parser is
-OOM-killed at the cell-level dependency-graph step, and `ete init`'s 10-min
-`spawnSync` cap times out → `engine.js` (the `run()` orchestrator) and
-`dependency-graph.json` closures don't land (they're written after the OOM
-step); the Mippy regen worked around it with direct-parse + `--reuse-parse`.
-- Fix directions: stream/incrementalize the dep-graph build or raise its memory
-  headroom; within-sheet parallelism; streaming writes; configurable init
-  timeout. Mostly `pipelines/rust/` + `cli/commands/init.mjs` timeout.
-- Impact: until fixed, the downstream consumer can't get a clean full artifact
-  set (closures + orchestrator) from one command.
-
-### P1 — Transpiler coverage: 11,813 `_fn()` fallbacks ★ big accuracy lever
-That many formula cells per engine transpile to a generic unsupported-function
-stub (unchanged old→new, so it predates this work). Almost certainly a large
-slice of the ~15% standalone-sheet gap. **Inventory which Excel functions hit
-the `_fn()` fallback, rank by frequency, implement the top offenders.**
-- Files: `pipelines/rust/src/` (transpiler). Measure on `engines/*-v2`.
-
-### P1 — Refiner mis-maps returns to the "UW Comparison" tab ★ quick + concrete
-Auto-manifest picks an underwriting-comparison cell (2.305x) over the canonical
-Version Tracker returns (2.349x) because `SUMMARY_SHEET_PATTERN` over-ranks
-"UW Comparison" — forcing manual per-model pinning. Make the refiner recognize
-canonical returns / "Version Tracker" tabs, or de-prioritize
-underwriting-comparison tabs.
-- File: `cli/commands/manifest-refine.mjs` (`SUMMARY_SHEET_PATTERN` line ~24,
-  ranking in `searchForFieldIndexed`). Add/extend a manifest **invariant** so it
-  can't silently revert. Validate with `tests/cli/test-refine-label-index.mjs`.
-
-### P1 — Golden-master CI assert ★ near-free regression guard
-A-1's regenerated ground truth reproduces the hand-port's canonical returns to
-full float precision (Version Tracker row 22: grossMOIC L22 ≈2.34916, grossIRR
-M22 ≈0.19233, netMOIC T22 ≈2.23137, netIRR U22 ≈0.18240). The committed
-`named-outputs.json` `baseCaseValue`s (for a pinned A-1 manifest) make a ready
-golden-master. **Add a CI test that diffs those against the known values.** Note:
-the engine artifacts are gitignored; commit only the small contract JSON (or
-hard-code the canonical values in the test).
-
-### P2 — `--output-profile` / guided `ete create` (issue #22)
-Skip the ~752 MB per-sheet engine emit when a consumer only needs ground truth +
-contract maps. Scope artifacts to the actual need.
-
-### P2 — Large-sheet eval (190 MB PP&E)
-Exceeds the 150 MB per-sheet limit in `per-sheet-eval` → skipped. Needs
-streaming/sharded per-sheet eval or a higher limit with chunked compute.
-
-### P2 — Manifest-pipeline perf on ~6M-cell models
-`generate` detectors, `maps` cell-type pass, and `refine`'s `buildLabelIndex`
-fallback are O(N) on the full GT and slow. Profile + optimize. (Distinct from the
-Rust-side #23; this is the JS pipeline.)
-
-### P3 — Polish → Publish remainder
-lib/ unit tests done. Remaining: npm publish prep (`bin`, `files`, repo
-metadata), synthetic example project, contributing guide. (Arguably hold publish
-until accuracy blockers close.)
-
-### P3 — Lower priority / model-owner
-- `named-inputs.json` is empty when a workbook has no formula-referenced
-  defined-names (the Outpost case) — heuristic fallback or documented manual path.
-- MIP-as-output (request #7): modeled across per-block "MIP Proceeds" cells, not
-  a single GT cell — a model-owner question, surface via aggregate mapping.
+`engines/outpost-a{1,2}-v2/` (the *better* build: dates fixed, slimmed) alongside
+the old `engines/outpost-a{1,2}/`.
+
+## P1–P3 — Mippy calibration-oracle feature set (do in this order)
+
+All filed on ebootheee/excel-to-engine. Done-criteria are the contract.
+
+### P1 · #23 + #24 — reliably emit a runnable `engine.js` ★ blocks everything
+A clean `ete init` on a real model currently **does not finish**: the Rust parser
+is OOM-killed at the cell-level dependency-graph step, and `ete init` hits its
+10-min `spawnSync` cap → `engine.js` (the `run()` orchestrator) + the
+`dependency-graph.json` closures **don't land** (written after the OOM step).
+- **Done =** `chunked/engine.js` with `export function run()` exists on **every**
+  build; the build **errors hard** if it can't — **never a partial artifact**.
+- #24 also: **lock the artifact layout + emit a content hash** so downstream
+  consumers can ingest builds without per-version reconciliation.
+- Without a runnable engine we can't sample MIP to calibrate/validate — this
+  gates everything below.
+- Files: `pipelines/rust/` (dep-graph build: stream/incrementalize or raise
+  headroom; fail-loud), `cli/commands/init.mjs` (configurable timeout; don't
+  swallow a failed emit).
+
+### P2 · #25 — pin the value-bearing cells as named-outputs
+Per-class **MIP Proceeds**, **hurdle/threshold**, **participation %**, **equity
+basis**, **valuation / shares** — not just MOIC/IRR.
+- **Done =** those appear in `named-outputs.json` with base-case values. **These
+  ARE the parametric coefficients Mippy calibrates against.**
+- Files: `lib/manifest-maps.mjs` (`enumerateOutputCells` — extend beyond
+  MOIC/IRR/TV/carry; `customCells` is the current escape hatch),
+  `cli/commands/manifest*.mjs`. Pin per-model (the auto-manifest mis-maps —
+  see the refiner fix under "supporting").
+
+### P2 · #26 — `_fn` fallback audit: emit `_fn-fallbacks.json` (correctness gate)
+- **Done =** we can **assert no MIP / value / return cell resolves through an
+  unsupported-function stub.** (Auditing/gating the value cells — distinct from
+  fixing all 11,813 fallbacks, which is the deeper transpiler work below.)
+- Files: `pipelines/rust/` (emit the audit during transpile) + a check that the
+  P2/#25 named-output cells aren't in it.
+
+### P3 (nice-to-have) · #22 — output-cone scoping
+Scope generated artifacts to the consumer's need (skip the ~752 MB per-sheet
+emit). Makes the oracle cheaper to run; **not required** — we don't ship the blob.
+
+## Supporting work — makes the oracle *trustworthy* (after P1, alongside P2/P3)
+
+These aren't on Mippy's critical path but back the "reliable" in "reliable
+calibration oracle":
+- **Golden-master CI assert** — A-1's regenerated GT matches the hand-port's
+  canonical returns to full float precision (Version Tracker row 22: grossMOIC
+  L22 ≈2.34916, grossIRR M22 ≈0.19233, netMOIC T22 ≈2.23137, netIRR U22
+  ≈0.18240). Add a CI test diffing the committed `named-outputs.baseCaseValue`s
+  (or hard-coded values; engine artifacts are gitignored). Pairs with #25/#26.
+- **Refiner mis-maps returns to a "UW Comparison" tab** (2.305x vs canonical
+  2.349x) — `SUMMARY_SHEET_PATTERN` over-ranks it. Fix so #25's value cells pin
+  to canonical/Version-Tracker tabs without manual per-model pinning. Add a
+  manifest invariant. File: `cli/commands/manifest-refine.mjs`.
+- **Deeper transpiler coverage** — the 11,813 `_fn()` offenders behind #26's
+  audit; inventory by frequency, implement top ones. `pipelines/rust/src/`.
+- **Cluster-once eval** (our accuracy harness, not Mippy's path): the 17-sheet
+  cluster is unmeasured because `per-sheet-eval` re-runs the whole convergence
+  once per member (17×). Make it one task per cluster (converge once, score all),
+  then drop `--skip-clusters` and re-baseline. Lets us *verify* the oracle's
+  cluster math. Fixture oracle ready: `tests/cli/fixtures/cluster-model/`. (The
+  shipped `engine.js` `run()` converges clusters itself — this is measurement.)
+- **Large-sheet eval** (190 MB PP&E > 150 MB limit) and **manifest-pipeline
+  perf** (generate detectors / maps cell-types / refine fallback on ~6M cells).
+
+## Polish → Publish
+lib/ unit tests done. Remaining: npm publish prep (`bin`, `files`, metadata),
+synthetic example project, contributing guide. Lower priority: a fallback for
+the empty `named-inputs.json` (the Outpost workbooks have no formula-referenced
+defined-names); MIP-as-output beyond the pinned cells is a model-owner question.
 
 ## Gotchas (will bite you)
 
-- **`engines/` is gitignored** (real financials). Read-only; report only
-  aggregate metrics. `_eval_tmp/` and `benchmarks/results/` are gitignored too.
+- **`engines/` is gitignored** (real financials). Read-only; aggregate metrics
+  only. `_eval_tmp/` + `benchmarks/results/` are gitignored too.
 - **`_computed-values.json` in these engines is a byte-identical COPY of ground
-  truth** (seeded). It is NOT a valid accuracy source — accuracy must be live
-  recompute (per-sheet-eval). The benchmark already avoids it.
-- **per-sheet-eval was Windows-broken** (bare absolute ESM import). Fixed via
-  `pathToFileURL`; guarded by `tests/cli/test-per-sheet-eval.mjs` on windows CI.
-  Don't reintroduce bare absolute `import` paths.
-- **`benchmarks/outpost-bench.mjs` `discoverModels()` gates on `engine.js`** —
-  but the `-v2` regen dirs may LACK `engine.js` (OOM, see #23) while having
-  `_graph.json` + `sheets/` (what per-sheet-eval actually needs). If the bench
-  skips `-v2`, relax the gate to `_graph.json` + `sheets/`.
-- **CI runs ubuntu + windows.** Anything touching child-process paths or the
-  parser binary must work on both.
+  truth** (seeded). NOT a valid accuracy source — use live recompute.
+- **per-sheet-eval was Windows-broken** (bare absolute ESM import → `pathToFileURL`
+  fix; guarded by `tests/cli/test-per-sheet-eval.mjs` on windows CI). Don't
+  reintroduce bare absolute `import` paths.
+- **`benchmarks/outpost-bench.mjs` `discoverModels()` gates on `engine.js`** — but
+  the `-v2` regen dirs may LACK it (the #23 OOM) while having `_graph.json` +
+  `sheets/`. If the bench skips `-v2`, relax the gate. (Fixing #23 makes this moot.)
+- **CI runs ubuntu + windows** — child-process/path/parser code must work on both.
+- After any change, update CHANGELOG/PLAN/ROADMAP per CLAUDE.md.
diff --git a/ROADMAP.md b/ROADMAP.md
index 6053cb9..3dcf872 100644
--- a/ROADMAP.md
+++ b/ROADMAP.md
@@ -42,6 +42,38 @@ RPC service. Round 1 split into a low-risk JS half (done) and a Rust half
   (engine faithfully reproduces an Excel base case of 0). Surface via a
   `requiredFor` field if/when named-inputs gains one.
 
+## Now — Mippy calibration oracle (e2e agent's job, priority feature set)
+
+The refined "fully ready for Mippy" target: make the full model a **reliable
+calibration oracle** — runnable, with the MIP coefficients exposed as
+named-outputs, and no stubbed value cells. Everything Mippy-specific stays in
+Mippy. Order (issues on ebootheee/excel-to-engine; the Done line is the contract):
+
+- **P1 · [#23] + [#24] — reliably emit a runnable `engine.js`.** Fix the
+  dep-graph OOM; **fail the build loud, never emit a partial artifact.**
+  **Done =** `chunked/engine.js` with `export function run()` exists on every
+  build; build errors hard if it can't. #24 also locks the artifact layout +
+  emits a content hash (consume downstream without per-version reconciliation).
+  Gates everything: without a runnable engine we can't sample MIP to calibrate.
+- **P2 · [#25] — pin the value-bearing cells as named-outputs.** Per-class MIP
+  Proceeds, hurdle/threshold, participation %, equity basis, valuation/shares —
+  not just MOIC/IRR. **Done =** they appear in `named-outputs.json` with
+  base-case values. These ARE the parametric coefficients.
+- **P2 · [#26] — `_fn` fallback audit (`_fn-fallbacks.json`).** **Done =** assert
+  no MIP/value/return cell resolves through an unsupported-function stub.
+- **P3 (nice-to-have) · [#22] — output-cone scoping.** Cheaper oracle; not
+  required (we don't ship the blob).
+
+Supporting (makes the oracle trustworthy, not on the critical path): golden-master
+CI assert (A-1 canonical returns), the refiner UW-Comparison fix (so #25's cells
+pin to canonical tabs), deeper transpiler coverage (the 11,813 `_fn` offenders
+behind #26), cluster-once eval (our accuracy harness), large-sheet eval, pipeline
+perf. See `HANDOFF.md` for the full ordering + Done criteria.
+
+[#24]: https://github.com/ebootheee/excel-to-engine/issues/24
+[#25]: https://github.com/ebootheee/excel-to-engine/issues/25
+[#26]: https://github.com/ebootheee/excel-to-engine/issues/26
+
 ## Now — Outpost regeneration findings (Mippy consumer, 2026-05-28)
 
 The downstream Mippy agent regenerated both Outpost engines from `main` (current

From ad55ca35f7af93e4d3a3cc77568307b58c3e30d4 Mon Sep 17 00:00:00 2001
From: Eric Boothe <ericboothe@gmail.com>
Date: Thu, 28 May 2026 20:39:39 -0600
Subject: [PATCH 7/8] docs: scrub real return figures from public docs
 (no-real-financials rule)

Removed the specific Outpost A-1 gross/net MOIC & IRR values, the UW-comparison
multiple, and the MIP dollar figure from HANDOFF.md + ROADMAP.md. main is a
public OSS repo and CLAUDE.md forbids committing real financials. The findings
(golden-master match on Version Tracker row 22; refiner mis-maps to UW
Comparison; MIP is a hand-port calibration) are kept; only the figures are
removed. Canonical values stay in the gitignored artifacts + local project
memory and feed the golden-master test from there.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 HANDOFF.md | 16 ++++++++--------
 ROADMAP.md | 16 ++++++++--------
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/HANDOFF.md b/HANDOFF.md
index 06e3149..619b4e5 100644
--- a/HANDOFF.md
+++ b/HANDOFF.md
@@ -89,14 +89,14 @@ emit). Makes the oracle cheaper to run; **not required** — we don't ship the b
 These aren't on Mippy's critical path but back the "reliable" in "reliable
 calibration oracle":
 - **Golden-master CI assert** — A-1's regenerated GT matches the hand-port's
-  canonical returns to full float precision (Version Tracker row 22: grossMOIC
-  L22 ≈2.34916, grossIRR M22 ≈0.19233, netMOIC T22 ≈2.23137, netIRR U22
-  ≈0.18240). Add a CI test diffing the committed `named-outputs.baseCaseValue`s
-  (or hard-coded values; engine artifacts are gitignored). Pairs with #25/#26.
-- **Refiner mis-maps returns to a "UW Comparison" tab** (2.305x vs canonical
-  2.349x) — `SUMMARY_SHEET_PATTERN` over-ranks it. Fix so #25's value cells pin
-  to canonical/Version-Tracker tabs without manual per-model pinning. Add a
-  manifest invariant. File: `cli/commands/manifest-refine.mjs`.
+  canonical gross/net MOIC & IRR (Version Tracker row 22) to full float
+  precision. Add a CI test diffing those `named-outputs.baseCaseValue`s. The
+  canonical figures live in the gitignored `named-outputs.json` + project memory
+  — **do NOT commit the figures to this public repo.** Pairs with #25/#26.
+- **Refiner mis-maps returns to a "UW Comparison" tab** instead of the canonical
+  Version Tracker returns — `SUMMARY_SHEET_PATTERN` over-ranks it. Fix so #25's
+  value cells pin to canonical/Version-Tracker tabs without manual per-model
+  pinning. Add a manifest invariant. File: `cli/commands/manifest-refine.mjs`.
 - **Deeper transpiler coverage** — the 11,813 `_fn()` offenders behind #26's
   audit; inventory by frequency, implement top ones. `pipelines/rust/src/`.
 - **Cluster-once eval** (our accuracy harness, not Mippy's path): the 17-sheet
diff --git a/ROADMAP.md b/ROADMAP.md
index 3dcf872..b5a6f38 100644
--- a/ROADMAP.md
+++ b/ROADMAP.md
@@ -89,11 +89,11 @@ Issues filed: [#22] (output scoping) and [#23] (parser/emitter perf).
 - Semantic manifest + ADR-017 contract maps emitted; circular refs now run
   per-cluster fixed-point loops.
 - **Golden master PASS.** Regenerated `_ground-truth.json` reproduces the
-  hand-port's canonical A-1 returns to full float precision (Version Tracker
-  row 22: grossMOIC L22 ≈2.34916, grossIRR M22 ≈0.19233, netMOIC T22 ≈2.23137,
-  netIRR U22 ≈0.18240). Pinning A-1's manifest to those cells makes
+  hand-port's canonical A-1 gross/net MOIC & IRR (Version Tracker row 22) to full
+  float precision. Pinning A-1's manifest to those cells makes
   `named-outputs.baseCaseValue` a ready CI golden-master assert. **Do this:** add
-  a golden-master test that diffs the committed contract JSON's baseCaseValues.
+  a golden-master test diffing those baseCaseValues. (Canonical figures stay in
+  the gitignored artifacts + project memory — not committed to this public repo.)
 
 **Open follow-ups:**
 - **Generation robustness on big models ([#23]) — blocks a clean full build.**
@@ -112,14 +112,14 @@ Issues filed: [#22] (output scoping) and [#23] (parser/emitter perf).
   Inventory the missing Excel functions and prioritize by frequency. (See
   Transpiler Coverage below.)
 - **Refiner mis-maps returns to the "UW Comparison" tab.** Auto-manifest picked an
-  underwriting-comparison cell (2.305x) over the canonical Version Tracker returns
-  (2.349x) — `SUMMARY_SHEET_PATTERN` over-ranks "UW Comparison". The refiner
-  should recognize canonical returns / Version-Tracker tabs (or de-prioritize
+  underwriting-comparison cell over the canonical Version Tracker returns —
+  `SUMMARY_SHEET_PATTERN` over-ranks "UW Comparison". The refiner should recognize
+  canonical returns / Version-Tracker tabs (or de-prioritize
   underwriting-comparison tabs) so returns don't need manual per-model pinning.
 - **`named-inputs.json` empty** when a workbook exposes no formula-referenced
   defined-names (the Outpost case) — ADR-019 ranged inputs can't be auto-derived;
   needs a heuristic fallback or a documented manual-input path.
-- **MIP isn't a generated output (request #7).** The $51.8M is a hand-port
+- **MIP isn't a generated output (request #7).** The MIP figure is a hand-port
   calibration, not a single GT cell — MIP is modeled across per-block "MIP
   Proceeds" cells. Surface via a `requiredFor`/aggregate mapping, not a
   single-cell expectation. (See the Round 2 MIP-gating note above.)

From 1d945d6c08a2d0ec27510c8e39de6d91b231f125 Mon Sep 17 00:00:00 2001
From: Eric Boothe <ericboothe@gmail.com>
Date: Thu, 28 May 2026 20:51:06 -0600
Subject: [PATCH 8/8] chore(privacy): genericize the real model name + scrub
 return figures
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Public repo: no real financials or participant names (CLAUDE.md). Before merging
the next-wave PR:

- Renamed benchmarks/outpost-bench.mjs -> benchmarks/bench.mjs (npm script `bench`);
  the benchmark now anonymizes model identity in printed + committed output
  (Model A/B) — real dir names stay only in the gitignored detail JSON.
- Scrubbed the real model name from all committed docs (HANDOFF/ROADMAP/PLAN/
  CHANGELOG/BASELINE) -> "the real PE models" / "Model A/B". Kept the
  test-e2e4-fixes scrub-guard that asserts template names are generic.
- Removed the real return figures (MOIC/IRR, UW multiple, MIP $) committed
  earlier; findings stay, numbers live only in gitignored artifacts + local notes.

Full npm test (387) + smoke green; BASELINE.md regenerated anonymized.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 CHANGELOG.md                                | 37 ++++++++++++++++-----
 HANDOFF.md                                  | 25 +++++++-------
 PLAN.md                                     | 10 +++---
 ROADMAP.md                                  | 14 ++++----
 benchmarks/BASELINE.md                      | 12 +++----
 benchmarks/{outpost-bench.mjs => bench.mjs} | 28 ++++++++++------
 package.json                                |  2 +-
 tests/cli/test-per-sheet-eval.mjs           |  2 +-
 8 files changed, 78 insertions(+), 52 deletions(-)
 rename benchmarks/{outpost-bench.mjs => bench.mjs} (85%)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f4f92e1..8d3b13b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,24 @@
 # excel-to-engine — Changelog
 
+## 2026-05-28 — Privacy scrub: genericize the real model name + figures
+
+This repo is public; CLAUDE.md forbids committing real financials or participant
+names. Two cleanups before merging the next-wave PR:
+
+- **Removed the real return figures** (gross/net MOIC & IRR, the UW-comparison
+  multiple, the MIP dollar amount) from the committed docs. The findings stay
+  (golden-master match on Version Tracker row 22; refiner UW-Comparison mis-map;
+  MIP is a hand-port calibration) — only the numbers are gone. Canonical values
+  live in the gitignored artifacts + local notes and feed the golden-master test
+  from there.
+- **Genericized the real model name** out of all committed files: renamed
+  `benchmarks/outpost-bench.mjs` → `benchmarks/bench.mjs` (npm script `bench`),
+  and the benchmark now **anonymizes model identity** in printed + committed
+  output (Model A, Model B, …) — real dir names stay only in the gitignored
+  detail JSON. Prose in HANDOFF/ROADMAP/PLAN/CHANGELOG now says "the real PE
+  models" / "Model A/B". (The `test-e2e4-fixes` scrub-guard that asserts template
+  names are generic is intentionally kept.)
+
 ## 2026-05-28 — Mippy calibration-oracle feature set (priority amendment)
 
 Refined the "fully ready for Mippy" target: the e2e agent's job is to make the
@@ -28,10 +47,10 @@ state, run commands, and the gotchas (gitignored real models, the GT-copy
 `_computed-values.json`, the per-sheet-eval Windows fix, the bench
 `discoverModels` gate vs the `-v2` regen). PLAN points to it.
 
-## 2026-05-28 — Roadmap: Outpost regeneration findings (Mippy consumer)
+## 2026-05-28 — Roadmap: PE-model regeneration findings (Mippy consumer)
 
-The downstream Mippy agent regenerated both Outpost engines from `main` and
-reported back. Captured the findings in ROADMAP.md ("Now — Outpost regeneration
+The downstream Mippy agent regenerated both PE engines from `main` and
+reported back. Captured the findings in ROADMAP.md ("Now — PE-model regeneration
 findings"). Confirmed wins vs the old build: **dates fixed** (old leaked
 `ExcelDateTime { … }` debug strings — 2,686 in A-1; new emits serial numbers, 0
 leaks), **~42–45% smaller** (model-map.json + the GT-copy `_computed-values.json`
@@ -89,20 +108,20 @@ The shared financial libraries had no direct coverage. Added
   suggested corrective factor.
 - **`lib/sensitivity.mjs`** — `flattenOutputs` group/type filtering.
 
-## 2026-05-28 — Outpost accuracy benchmark + eval-tooling fixes
+## 2026-05-28 — PE-model accuracy benchmark + eval-tooling fixes
 
 Stood up a repeatable accuracy + efficacy benchmark over the real ~200 MB
-Outpost models so improvements can be tracked over time, and fixed the eval
+PE models so improvements can be tracked over time, and fixed the eval
 tooling that was silently broken on them.
 
-### Benchmark (`benchmarks/outpost-bench.mjs`, `npm run bench:outpost`)
+### Benchmark (`benchmarks/bench.mjs`, `npm run bench`)
 
 - Wraps `eval/per-sheet-eval.mjs` (live engine-vs-ground-truth) for every model
   under a root dir; reports overall accuracy, per-sheet pass/skip counts, and
   timings. **Aggregate-only** results go to the committed `benchmarks/BASELINE.md`;
   full per-sheet detail stays in the gitignored `benchmarks/results/`. No cell
   value or label is ever committed.
-- **Baseline (2026-05-28):** outpost-a1 **84.3%**, outpost-a2 **85.5%** on the
+- **Baseline (2026-05-28):** Model A **84.3%**, Model B **85.5%** on the
   standalone sheets. (The 17-sheet circular cluster and the 190 MB PP&E sheet are
   skipped for now — see below.)
 
@@ -143,7 +162,7 @@ ai-interface suites green).
 The real driver behind the "~2.5 min" refine loop wasn't one command — it was
 that `ete init` runs **generate → refine → doctor → maps** in sequence and
 **each independently re-read and re-parsed the full ground truth** from disk. On
-the real ~200 MB Outpost models that's four parses of a 200 MB+ file at ~3.6 s
+the real ~200 MB PE models that's four parses of a 200 MB+ file at ~3.6 s
 each, plus each command's own O(N) scan.
 
 ### What changed
@@ -161,7 +180,7 @@ each, plus each command's own O(N) scan.
 
 ### Why not the row-values artifact (Tier B)
 
-Measured on both real ~200 MB Outpost models: they're **dense-label** (≈90% of
+Measured on both real ~200 MB PE models: they're **dense-label** (≈90% of
 rows labeled, ≈93% of numerics on labeled rows), not the giant-grid case Tier B's
 big win assumed. A general row-values artifact would be ≈30% of GT (≈60% of the
 post-#17 compact GT) — only ~1.6× on refine while inflating output ~60%, fighting
diff --git a/HANDOFF.md b/HANDOFF.md
index 619b4e5..d1a1703 100644
--- a/HANDOFF.md
+++ b/HANDOFF.md
@@ -1,14 +1,15 @@
 # HANDOFF — excel-to-engine next session
 
 Start-here doc for a fresh agent. Read this, then `ROADMAP.md` (full backlog),
-`PLAN.md` (status), `benchmarks/BASELINE.md` (accuracy numbers), and the two
-memory files (`project_outpost_models_shape`, `project_mippy_contract`).
+`PLAN.md` (status), `benchmarks/BASELINE.md` (accuracy numbers), and your two
+project memory files (the Mippy contract + the real-model shape/baseline notes,
+auto-loaded from your memory index).
 
 _Last updated: 2026-05-28._
 
 ## The job, in one line
 
-**Make the full Outpost model a reliable Mippy calibration oracle: runnable,
+**Make the full PE model a reliable Mippy calibration oracle: runnable,
 with the MIP coefficients exposed as named-outputs, and no stubbed value cells.**
 Everything Mippy-specific stays in Mippy — this repo just produces a trustworthy,
 sample-able engine + contract.
@@ -20,29 +21,29 @@ sample-able engine + contract.
 single-GT-parse per `init` (#20).
 
 **Open PR — review/merge first:** **#21 `feat/next-wave`** (CI green). Contains
-the Outpost accuracy **benchmark + baseline**, a **per-sheet-eval Windows crash
+the PE-model accuracy **benchmark + baseline**, a **per-sheet-eval Windows crash
 fix**, `searchByLabel` lazy numerics, **lib/ unit tests** (43), the
 **scoped cluster-convergence diff** + the first circular-cluster fixture/test,
 and the Mippy regeneration findings in ROADMAP. **If #21 isn't merged yet, branch
 off `feat/next-wave`; otherwise off `main`.**
 
-**Baseline (real models, `npm run bench:outpost`):** outpost-a1 **84.3%**,
-outpost-a2 **85.5%** — standalone sheets only (cluster + 190 MB PP&E skipped).
+**Baseline (real models, `npm run bench`):** Model A **84.3%**,
+Model B **85.5%** — standalone sheets only (cluster + 190 MB PP&E skipped).
 
 ## How to run
 
 ```bash
 npm test                 # full JS suite (387 assertions)
 npm run smoke            # chunked-engine accuracy 78/78
-npm run bench:outpost --  --root "<abs path>/engines"   # accuracy + efficacy on the real models
+npm run bench --  --root "<abs path>/engines"   # accuracy + efficacy on the real models
 node eval/per-sheet-eval.mjs <chunkedDir> --concurrency 3 [--skip-clusters]
 cd pipelines/rust && cargo build --release   # the parser
 ```
 
-Real Outpost models live in the **gitignored** `engines/` dir (proprietary —
+The real PE models live in the **gitignored** `engines/` dir (proprietary —
 never commit values/labels). The Mippy agent's fresh regen is in
-`engines/outpost-a{1,2}-v2/` (the *better* build: dates fixed, slimmed) alongside
-the old `engines/outpost-a{1,2}/`.
+the `-v2` engine dirs under `engines/` (the *better* build: dates fixed, slimmed) alongside
+the old model dirs.
 
 ## P1–P3 — Mippy calibration-oracle feature set (do in this order)
 
@@ -111,7 +112,7 @@ calibration oracle":
 ## Polish → Publish
 lib/ unit tests done. Remaining: npm publish prep (`bin`, `files`, metadata),
 synthetic example project, contributing guide. Lower: empty `named-inputs.json`
-fallback (no formula-referenced defined-names in the Outpost workbooks);
+fallback (no formula-referenced defined-names in the PE workbooks);
 MIP-as-output beyond the pinned cells is a model-owner question.
 
 ## Gotchas (will bite you)
@@ -123,7 +124,7 @@ MIP-as-output beyond the pinned cells is a model-owner question.
 - **per-sheet-eval was Windows-broken** (bare absolute ESM import → `pathToFileURL`
   fix; guarded by `tests/cli/test-per-sheet-eval.mjs` on windows CI). Don't
   reintroduce bare absolute `import` paths.
-- **`benchmarks/outpost-bench.mjs` `discoverModels()` gates on `engine.js`** — but
+- **`benchmarks/bench.mjs` `discoverModels()` gates on `engine.js`** — but
   the `-v2` regen dirs may LACK it (the #23 OOM) while having `_graph.json` +
   `sheets/`. If the bench skips `-v2`, relax the gate. (Fixing #23 makes this moot.)
 - **CI runs ubuntu + windows** — child-process/path/parser code must work on both.
diff --git a/PLAN.md b/PLAN.md
index 331f080..0a28e8a 100644
--- a/PLAN.md
+++ b/PLAN.md
@@ -4,13 +4,13 @@
 > (P0 cluster-once eval → generation robustness → `_fn()` coverage → refiner
 > fix → golden-master CI → …), current state, run commands, and gotchas.
 
-## Status: Outpost accuracy benchmark + eval fixes — in progress 2026-05-28
+## Status: PE-model accuracy benchmark + eval fixes — in progress 2026-05-28
 
 Standing up the multi-wave "next wave" effort on `feat/next-wave`, keystone
-first: a repeatable accuracy + efficacy benchmark over the real Outpost models
-(`benchmarks/outpost-bench.mjs` → `benchmarks/BASELINE.md`, aggregate-only).
+first: a repeatable accuracy + efficacy benchmark over the real PE models
+(`benchmarks/bench.mjs` → `benchmarks/BASELINE.md`, aggregate-only).
 
-**Baseline:** outpost-a1 84.3%, outpost-a2 85.5% on standalone sheets; the
+**Baseline:** Model A 84.3%, Model B 85.5% on standalone sheets; the
 17-sheet circular cluster and the 190 MB PP&E sheet are skipped pending deeper
 fixes. Landed alongside: a **Windows crash fix** in `per-sheet-eval` (bare
 absolute ESM import → `pathToFileURL`; it had silently zeroed accuracy on
@@ -33,7 +33,7 @@ a `--skip-clusters` flag, and the **searchByLabel lazy-numerics** wave
 `ete init` now loads the ground truth once and shares the parsed object across
 the whole manifest pipeline — generate → refine → doctor → maps — instead of
 each step re-reading and re-parsing the full ground truth from disk (up to four
-parses of a 200 MB+ file; ~3.6 s per parse on the real Outpost models). This was
+parses of a 200 MB+ file; ~3.6 s per parse on the real PE models). This was
 the dominant cost of init on large models and the real driver behind the
 "~2.5 min" refine loop. The GT is read-only in all four consumers, so a single
 shared object is safe; each command falls back to loading the GT itself when no
diff --git a/ROADMAP.md b/ROADMAP.md
index b5a6f38..b87c220 100644
--- a/ROADMAP.md
+++ b/ROADMAP.md
@@ -74,10 +74,10 @@ perf. See `HANDOFF.md` for the full ordering + Done criteria.
 [#25]: https://github.com/ebootheee/excel-to-engine/issues/25
 [#26]: https://github.com/ebootheee/excel-to-engine/issues/26
 
-## Now — Outpost regeneration findings (Mippy consumer, 2026-05-28)
+## Now — PE-model regeneration findings (Mippy consumer, 2026-05-28)
 
-The downstream Mippy agent regenerated both Outpost engines from `main` (current
-excel-to-engine) → `engines/outpost-a{1,2}-v2/`, old build left alongside.
+The downstream Mippy agent regenerated both PE engines from `main` (current
+excel-to-engine) → the regenerated `-v2` engine dirs, old build left alongside.
 Confirmed the new build is clearly better and surfaced concrete follow-ups.
 Issues filed: [#22] (output scoping) and [#23] (parser/emitter perf).
 
@@ -117,7 +117,7 @@ Issues filed: [#22] (output scoping) and [#23] (parser/emitter perf).
   canonical returns / Version-Tracker tabs (or de-prioritize
   underwriting-comparison tabs) so returns don't need manual per-model pinning.
 - **`named-inputs.json` empty** when a workbook exposes no formula-referenced
-  defined-names (the Outpost case) — ADR-019 ranged inputs can't be auto-derived;
+  defined-names (the PE-model case) — ADR-019 ranged inputs can't be auto-derived;
   needs a heuristic fallback or a documented manual-input path.
 - **MIP isn't a generated output (request #7).** The MIP figure is a hand-port
   calibration, not a single GT cell — MIP is modeled across per-block "MIP
@@ -173,7 +173,7 @@ when we next touch the monitor server or auth surface.
     generate → refine → doctor → maps (each previously re-parsed the full
     200 MB+ GT). The GT is read-only in all of them. `test-init-shared-gt`.
   - **Tier B (row-values artifact) — measured and deprioritized.** Gauged on
-    the two real ~200 MB Outpost models: both are **dense-label** (≈90% of rows
+    the two real ~200 MB PE models: both are **dense-label** (≈90% of rows
     labeled, ≈93% of numerics on labeled rows), *not* the giant-grid case the
     100× idea assumed. A general row-values artifact is ≈30% of GT (≈60% of the
     post-#17 compact GT) → only ~1.6× on refine while inflating output ~60%,
@@ -223,7 +223,7 @@ when we next touch the monitor server or auth surface.
 
 ### Transpiler Coverage
 - **Measured (Mippy regen, 2026-05-28): 11,813 `_fn()` unsupported-function
-  fallbacks per Outpost engine** — that many formula cells transpile to a generic
+  fallbacks per PE-model engine** — that many formula cells transpile to a generic
   stub instead of real logic, a prime accuracy suspect. First step: inventory
   which Excel functions hit the fallback and rank by frequency, then implement
   the top offenders. (Was unchanged old→new, so it predates our work.)
@@ -240,7 +240,7 @@ when we next touch the monitor server or auth surface.
 
 ### Eval System
 - **Done (2026-05-28):** repeatable accuracy + efficacy benchmark over the real
-  Outpost models — `benchmarks/outpost-bench.mjs` → `benchmarks/BASELINE.md`
+  PE models — `benchmarks/bench.mjs` → `benchmarks/BASELINE.md`
   (aggregate-only). Baseline: a1 84.3%, a2 85.5% on standalone sheets. Also
   **fixed a Windows crash** in `per-sheet-eval` (bare absolute ESM import →
   `pathToFileURL`; it had zeroed accuracy on Windows/real engines and wasn't in
diff --git a/benchmarks/BASELINE.md b/benchmarks/BASELINE.md
index e2a72bd..d51f825 100644
--- a/benchmarks/BASELINE.md
+++ b/benchmarks/BASELINE.md
@@ -1,4 +1,4 @@
-# Outpost benchmark — baseline & history
+# model benchmark — baseline & history
 
 Real accuracy: each standalone sheet recomputed live vs ground truth via
 `eval/per-sheet-eval.mjs` (numbers within 1% rel. tol, strings exact).
@@ -6,19 +6,19 @@ Circular-cluster sheets and oversized sheets are **skipped** for now (see
 the Skipped column + blockers below) pending the single-pass orchestrator
 eval; run with `--with-clusters` once that lands. Aggregate-only — no cell
 values or full sheet inventory. Regenerate:
-`node benchmarks/outpost-bench.mjs --root <engines>`. Full per-sheet detail
+`node benchmarks/bench.mjs --root <engines>`. Full per-sheet detail
 lands in the gitignored `benchmarks/results/`.
 
 _Last run: baseline-2026-05-28_
 
 | Model | Accuracy | Cells matched | Sheets ≥95% | Skipped | Eval time | GT |
 |-------|---------:|------:|:-----------:|:-------:|----------:|---:|
-| outpost-a1 | 84.33% | 1491/1768 | 1/3 | 17 | 45s | 201.5 MB |
-| outpost-a2 | 85.54% | 1686/1971 | 2/4 | 17 | 48s | 211 MB |
+| Model A | 84.33% | 1491/1768 | 1/3 | 17 | 41s | 201.5 MB |
+| Model B | 85.54% | 1686/1971 | 2/4 | 17 | 45s | 211 MB |
 
 ## Known blocker categories
 
 Tracked by name because PLAN.md already calls them out; values are accuracy %, not financials.
 
-- **outpost-a1**: 1/3 sheets clean; blockers: Owned Asset PP&E (skipped: module too large (190MB > 150MB limit)); Headcount (skipped: circular cluster (--skip-clusters; needs single-pass orchestrator eval))
-- **outpost-a2**: 2/4 sheets clean; blockers: Owned Asset PP&E (skipped: module too large (190MB > 150MB limit)); Headcount (skipped: circular cluster (--skip-clusters; needs single-pass orchestrator eval))
+- **Model A**: 1/3 sheets clean; blockers: Owned Asset PP&E (skipped: module too large (190MB > 150MB limit)); Headcount (skipped: circular cluster (--skip-clusters; needs single-pass orchestrator eval))
+- **Model B**: 2/4 sheets clean; blockers: Owned Asset PP&E (skipped: module too large (190MB > 150MB limit)); Headcount (skipped: circular cluster (--skip-clusters; needs single-pass orchestrator eval))
diff --git a/benchmarks/outpost-bench.mjs b/benchmarks/bench.mjs
similarity index 85%
rename from benchmarks/outpost-bench.mjs
rename to benchmarks/bench.mjs
index 23f8187..aab6304 100644
--- a/benchmarks/outpost-bench.mjs
+++ b/benchmarks/bench.mjs
@@ -1,6 +1,6 @@
 #!/usr/bin/env node
 /**
- * Outpost benchmark — repeatable accuracy + efficacy tracking over the real
+ * Model benchmark — repeatable accuracy + efficacy tracking over the real
  * models, so we can see whether each improvement actually moves the needle.
  *
  * It wraps `eval/per-sheet-eval.mjs` (which runs each sheet module live against
@@ -19,7 +19,7 @@
  * printed or committed.
  *
  * Usage:
- *   node benchmarks/outpost-bench.mjs [--root <dir>] [--concurrency 3] [--stamp <label>]
+ *   node benchmarks/bench.mjs [--root <dir>] [--concurrency 3] [--stamp <label>]
  *
  * Run it after any change that could affect accuracy or pipeline speed, then
  * diff benchmarks/BASELINE.md to see the delta.
@@ -102,15 +102,21 @@ async function benchModel(model) {
 const models = discoverModels(ROOT);
 if (models.length === 0) {
   console.error(`No models found under ${ROOT} (need <model>/chunked/engine.js + _ground-truth.json).`);
-  console.error('Point --root at a dir of parsed engines (the real Outpost engines live in the gitignored engines/).');
+  console.error('Point --root at a dir of parsed engines (the real PE engines live in the gitignored engines/).');
   process.exit(2);
 }
 
-console.log(`Outpost benchmark — ${models.length} model(s) under ${ROOT}\n`);
+// Anonymize model identity in printed + committed output (Model A, B, …). Real
+// dir names stay only in the gitignored detail JSON + the operator's local
+// notes — this repo is public and must not name the real models.
+models.forEach((m, i) => { m.label = `Model ${String.fromCharCode(65 + (i % 26))}`; });
+
+console.log(`model benchmark — ${models.length} model(s) under ${ROOT}\n`);
 const results = [];
 for (const m of models) {
-  process.stdout.write(`  ${m.name} ... `);
+  process.stdout.write(`  ${m.label} ... `);
   const r = await benchModel(m);
+  r.label = m.label;
   results.push(r);
   if (r.evalError && !r.summary) console.log(`eval FAILED (${r.evalError})`);
   else {
@@ -128,7 +134,7 @@ console.log(`\nDetail (gitignored): ${detailPath}`);
 // ── Committed aggregate (no values, no full sheet inventory) ──────────────────
 function renderBaseline(stamp, results) {
   const L = [];
-  L.push('# Outpost benchmark — baseline & history');
+  L.push('# model benchmark — baseline & history');
   L.push('');
   L.push('Real accuracy: each standalone sheet recomputed live vs ground truth via');
   L.push('`eval/per-sheet-eval.mjs` (numbers within 1% rel. tol, strings exact).');
@@ -136,7 +142,7 @@ function renderBaseline(stamp, results) {
   L.push('the Skipped column + blockers below) pending the single-pass orchestrator');
   L.push('eval; run with `--with-clusters` once that lands. Aggregate-only — no cell');
   L.push('values or full sheet inventory. Regenerate:');
-  L.push('`node benchmarks/outpost-bench.mjs --root <engines>`. Full per-sheet detail');
+  L.push('`node benchmarks/bench.mjs --root <engines>`. Full per-sheet detail');
   L.push('lands in the gitignored `benchmarks/results/`.');
   L.push('');
   L.push(`_Last run: ${stamp}_`);
@@ -144,9 +150,9 @@ function renderBaseline(stamp, results) {
   L.push('| Model | Accuracy | Cells matched | Sheets ≥95% | Skipped | Eval time | GT |');
   L.push('|-------|---------:|------:|:-----------:|:-------:|----------:|---:|');
   for (const r of results) {
-    if (!r.summary) { L.push(`| ${r.name} | eval failed | — | — | — | — | ${r.efficacy.gtSizeMB} MB |`); continue; }
+    if (!r.summary) { L.push(`| ${r.label} | eval failed | — | — | — | — | ${r.efficacy.gtSizeMB} MB |`); continue; }
     const s = r.summary;
-    L.push(`| ${r.name} | ${s.overallAccuracy}% | ${s.totalCellsCorrect}/${s.totalCellsTested} | ` +
+    L.push(`| ${r.label} | ${s.overallAccuracy}% | ${s.totalCellsCorrect}/${s.totalCellsTested} | ` +
       `${s.sheetsPassing}/${s.sheetsEvaluated} | ${s.sheetsSkipped} | ${(r.efficacy.wallMs / 1000).toFixed(0)}s | ${r.efficacy.gtSizeMB} MB |`);
   }
   L.push('');
@@ -156,12 +162,12 @@ function renderBaseline(stamp, results) {
   L.push('');
   const PUBLIC = [/pp&?e/i, /headcount/i];
   for (const r of results) {
-    if (!r.summary) { L.push(`- **${r.name}**: eval failed`); continue; }
+    if (!r.summary) { L.push(`- **${r.label}**: eval failed`); continue; }
     const blockers = [];
     for (const sk of r.skipped) if (PUBLIC.some(re => re.test(sk.name))) blockers.push(`${sk.name} (skipped: ${sk.reason})`);
     for (const sh of r.sheets) if (PUBLIC.some(re => re.test(sh.name))) blockers.push(`${sh.name} ${sh.accuracy}%`);
     const lowest = r.sheets.filter(s => s.status !== 'ok' || s.accuracy < 95).length;
-    L.push(`- **${r.name}**: ${r.summary.sheetsEvaluated - lowest}/${r.summary.sheetsEvaluated} sheets clean; ` +
+    L.push(`- **${r.label}**: ${r.summary.sheetsEvaluated - lowest}/${r.summary.sheetsEvaluated} sheets clean; ` +
       `blockers: ${blockers.join('; ') || 'none surfaced'}`);
   }
   L.push('');
diff --git a/package.json b/package.json
index 4e87a25..438660f 100644
--- a/package.json
+++ b/package.json
@@ -42,7 +42,7 @@
     "test:depgraph": "node pipelines/rust/tests/test-dependency-graph.mjs",
     "test:slimming": "node tests/cli/test-artifact-slimming.mjs",
     "test": "node tests/lib/test-lib.mjs && node tests/cli/test-cli.mjs && node tests/cli/test-manifest-improvements.mjs && node tests/cli/test-manifest-maps.mjs && node tests/cli/test-refine-label-index.mjs && node tests/cli/test-init-shared-gt.mjs && node tests/cli/test-per-sheet-eval.mjs && node tests/cli/test-ai-interface.mjs && node tests/cli/test-e2e4-fixes.mjs && node tests/cli/test-ship-ready.mjs && node tests/cli/use-case-suite.mjs",
-    "bench:outpost": "node benchmarks/outpost-bench.mjs"
+    "bench": "node benchmarks/bench.mjs"
   },
   "devDependencies": {}
 }
diff --git a/tests/cli/test-per-sheet-eval.mjs b/tests/cli/test-per-sheet-eval.mjs
index 073df45..ea32d9c 100644
--- a/tests/cli/test-per-sheet-eval.mjs
+++ b/tests/cli/test-per-sheet-eval.mjs
@@ -10,7 +10,7 @@
  * smoke engine and asserts no sheet crashed and accuracy is the known-good 100%,
  * so the regression can't come back (CI runs it on ubuntu AND windows).
  *
- * Also exercises --skip-clusters (used by the Outpost benchmark).
+ * Also exercises --skip-clusters (used by the model benchmark).
  *
  * Pure JS; uses the committed smoke chunked fixture (no parser needed).
  */