From 3f9c04ff4bd041e38bad7ff364282b63b62399c4 Mon Sep 17 00:00:00 2001 From: Eric Boothe Date: Thu, 28 May 2026 14:46:48 -0600 Subject: [PATCH 1/2] perf(refine): consume _labels.json + lazy numeric probes, drop full-GT index MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `ete manifest refine` rebuilt a full label+numeric index over the entire ground truth on every run (buildIndex), even though it only ever inspects numerics on a matched label's own row. On big models the bulk of that work indexed giant *unlabeled* grids (e.g. a PP&E schedule) the refiner never consults. Now buildIndex: - sources labels from the Rust parser's chunked/_labels.json when present (O(labels), no GT scan), falling back to buildLabelIndex(gt) for legacy engines that predate the index; - resolves same-row numerics lazily by probing the row's columns on demand (numericsForRow), memoized per row, stopping after a long empty-column run. Behavior-preserving: ranking/dedup/value-range logic is untouched, so the existing manifest/ship-ready suites stay green. The remaining full pass is the unavoidable JSON parse of the ground truth (a follow-up could lift that with a parser-emitted row-values artifact; see ROADMAP). New tests/cli/test-refine-label-index.mjs (14): correctness off _labels.json, parity between the index path and the GT-scan fallback, lazy-probe far/gapped columns + value ranges, and a consumption proof (a label present only in the index — not as a GT string — is still resolved; the fallback provably cannot). Wired into `npm test`. Measured (synthetic giant-grid GT): the eliminated buildIndex pass alone was ~1.4s on 1.4M cells / ~7.9s on 6.4M cells; new refine completes end-to-end in less time than the old index build took, and the skipped work scales with total cell count. 
Co-Authored-By: Claude Opus 4.8 --- cli/commands/manifest-refine.mjs | 109 +++++++++++---- package.json | 2 +- tests/cli/test-refine-label-index.mjs | 183 ++++++++++++++++++++++++++ 3 files changed, 266 insertions(+), 28 deletions(-) create mode 100644 tests/cli/test-refine-label-index.mjs diff --git a/cli/commands/manifest-refine.mjs b/cli/commands/manifest-refine.mjs index 0f486f9..b5f3b77 100644 --- a/cli/commands/manifest-refine.mjs +++ b/cli/commands/manifest-refine.mjs @@ -11,7 +11,10 @@ import { readFileSync, writeFileSync, existsSync } from 'fs'; import { join } from 'path'; -import { loadManifest, loadGroundTruth, resolveCell, MANIFEST_VERSION } from '../../lib/manifest.mjs'; +import { + loadManifest, loadGroundTruth, resolveCell, MANIFEST_VERSION, + loadLabelIndex, buildLabelIndex, +} from '../../lib/manifest.mjs'; // --------------------------------------------------------------------------- // Required fields and their search strategies @@ -100,34 +103,73 @@ const REQUIRED_FIELDS = [ }, ]; +// Excel's hard column ceiling is XFD = 16384 (MAX_PROBE_COL). numericsForRow +// probes a row left-to-right and stops after MAX_PROBE_GAP consecutive columns +// with no numeric value — generous enough to span any realistic financial +// layout (far-right restated copies lose to the canonical leftmost cell in +// ranking anyway), while capping a label-only row at a few hundred lookups. +const MAX_PROBE_COL = 16384; +const MAX_PROBE_GAP = 256; + /** - * Build a pre-index of the ground truth for fast searching. - * Groups string labels by sheet+row and numeric values by sheet+row. + * Build a search index over the ground truth. + * + * Labels come from the Rust parser's pre-built index (`chunked/_labels.json`) + * when present — an O(labels) read instead of scanning every cell — and fall + * back to a one-time ground-truth scan (`buildLabelIndex`) for legacy engines + * that predate the index. 
+ * + * Numeric values are resolved **lazily, per matched row**, by direct probing + * (see `numericsForRow`). The refiner only ever inspects numerics on a label's + * own row, so the old approach — bucketing every numeric in a multi-million-cell + * workbook up front — was almost entirely wasted: on a big model the bulk of + * those cells live in giant *unlabeled* grids (e.g. a PP&E depreciation + * schedule) the refiner never consults. Skipping that build is the win; the + * one remaining full pass is the unavoidable JSON parse of the ground truth. + * + * @param {Object} gt - Ground truth { addr: value } + * @param {string} [modelDir] - Model dir, for loading `_labels.json` + * @returns {{ labels: Array, numericsForRow: (sheet: string, row: number) => Array }} */ -function buildIndex(gt) { - const labels = []; // { addr, text, sheet, col, row } - const numsByRow = {}; // "sheet!row" → [{ addr, value, col }] - - for (const [addr, val] of Object.entries(gt)) { - const bang = addr.lastIndexOf('!'); - if (bang < 0) continue; - const sheet = addr.substring(0, bang); - const cellPart = addr.substring(bang + 1); - const match = cellPart.match(/^([A-Z]+)(\d+)$/); - if (!match) continue; - const col = match[1]; - const row = parseInt(match[2], 10); - const rowKey = `${sheet}!${row}`; - - if (typeof val === 'string' && val.length > 2 && val.length < 200) { - labels.push({ addr, text: val, sheet, col, row, rowKey }); - } else if (typeof val === 'number') { - if (!numsByRow[rowKey]) numsByRow[rowKey] = []; - numsByRow[rowKey].push({ addr, value: val, col }); +function buildIndex(gt, modelDir) { + const labelIndex = (modelDir && loadLabelIndex(modelDir)) || buildLabelIndex(gt); + const labels = []; + for (const entries of Object.values(labelIndex)) { + for (const e of entries) { + labels.push({ + addr: `${e.sheet}!${e.col}${e.row}`, + text: e.text, + sheet: e.sheet, + col: e.col, + row: e.row, + rowKey: `${e.sheet}!${e.row}`, + }); } } - return { labels, numsByRow }; + const 
rowCache = new Map(); // "sheet!row" → [{ addr, value, col }] + function numericsForRow(sheet, row) { + const key = `${sheet}!${row}`; + const cached = rowCache.get(key); + if (cached) return cached; + const nums = []; + let gap = 0; + for (let c = 1; c <= MAX_PROBE_COL && gap < MAX_PROBE_GAP; c++) { + const col = numToCol(c); + const addr = `${sheet}!${col}${row}`; + const v = gt[addr]; + if (typeof v === 'number') { + nums.push({ addr, value: v, col }); + gap = 0; + } else { + gap++; + } + } + rowCache.set(key, nums); + return nums; + } + + return { labels, numericsForRow }; } /** @@ -141,8 +183,9 @@ export function runManifestRefine(modelDir, args) { const manifest = loadManifest(modelDir); const gt = loadGroundTruth(manifest, modelDir); - // Pre-index for fast searching (single pass over GT) - const index = buildIndex(gt); + // Pre-index for fast searching. Labels come from `_labels.json` when the + // parser emitted it (no GT scan); numerics are probed lazily per matched row. + const index = buildIndex(gt, modelDir); // Resolve refinement hints: either passed in via args.hints (used by init // when a template has been applied), or read from a hand-edited manifest @@ -279,7 +322,7 @@ function searchForFieldIndexed(index, field, opts = {}) { // Pass 2: For each matching label, select the best same-row numeric cell. for (const lm of labelMatches) { - const rowNums = index.numsByRow[lm.rowKey] || []; + const rowNums = index.numericsForRow(lm.sheet, lm.row); const labelColNum = colToNum(lm.col); const inRange = rowNums.filter(n => { @@ -443,3 +486,15 @@ function colToNum(col) { } return num; } + +// Inverse of colToNum: 1 → "A", 26 → "Z", 27 → "AA". Used by numericsForRow to +// reconstruct cell addresses when probing a row's columns. 
+function numToCol(num) { + let col = ''; + while (num > 0) { + const rem = (num - 1) % 26; + col = String.fromCharCode(65 + rem) + col; + num = Math.floor((num - 1) / 26); + } + return col; +} diff --git a/package.json b/package.json index d37ab93..bd0501b 100644 --- a/package.json +++ b/package.json @@ -41,7 +41,7 @@ "test:engine": "node pipelines/rust/tests/test-engine-runtime.mjs", "test:depgraph": "node pipelines/rust/tests/test-dependency-graph.mjs", "test:slimming": "node tests/cli/test-artifact-slimming.mjs", - "test": "node tests/cli/test-cli.mjs && node tests/cli/test-manifest-improvements.mjs && node tests/cli/test-manifest-maps.mjs && node tests/cli/test-ai-interface.mjs && node tests/cli/test-e2e4-fixes.mjs && node tests/cli/test-ship-ready.mjs && node tests/cli/use-case-suite.mjs" + "test": "node tests/cli/test-cli.mjs && node tests/cli/test-manifest-improvements.mjs && node tests/cli/test-manifest-maps.mjs && node tests/cli/test-refine-label-index.mjs && node tests/cli/test-ai-interface.mjs && node tests/cli/test-e2e4-fixes.mjs && node tests/cli/test-ship-ready.mjs && node tests/cli/use-case-suite.mjs" }, "devDependencies": {} } diff --git a/tests/cli/test-refine-label-index.mjs b/tests/cli/test-refine-label-index.mjs new file mode 100644 index 0000000..c718a2e --- /dev/null +++ b/tests/cli/test-refine-label-index.mjs @@ -0,0 +1,183 @@ +#!/usr/bin/env node +/** + * Tests for the refine label-index optimization. + * + * `ete manifest refine` now sources labels from the Rust parser's + * `chunked/_labels.json` when present (an O(labels) read instead of scanning + * every cell) and resolves same-row numerics lazily by probing the row's + * columns — rather than bucketing every numeric in a multi-million-cell + * workbook up front. These tests assert: + * + * 1. refine finds the key metrics off `_labels.json`; + * 2. 
it produces *identical* mappings whether `_labels.json` is present or it + * falls back to the legacy ground-truth scan (the optimization is + * behavior-preserving); + * 3. the lazy numeric probe handles far / gapped columns and respects each + * field's value range; + * 4. refine genuinely *consumes* `_labels.json` — a label that exists only in + * the index (not as a ground-truth string) is still resolved, which the + * GT-scan fallback provably cannot do. + * + * Pure JS — constructs the chunked artifacts directly, so it needs no parser. + * + * Usage: node tests/cli/test-refine-label-index.mjs + */ + +import { writeFileSync, mkdtempSync, rmSync } from 'fs'; +import { join } from 'path'; +import { tmpdir } from 'os'; +import { runManifestRefine } from '../../cli/commands/manifest-refine.mjs'; + +let passed = 0; +let failed = 0; +function assert(cond, msg) { if (cond) { passed++; } else { failed++; console.error(` FAIL: ${msg}`); } } + +const BASE_MANIFEST = { + manifestVersion: '1.0', + model: { groundTruth: './_ground-truth.json' }, + equity: { classes: [{}] }, + carry: {}, + outputs: {}, + baseCaseOutputs: {}, +}; + +// Write a self-contained chunked dir. Pass `labels: null` to omit _labels.json +// and exercise the legacy GT-scan fallback. +function makeDir({ gt, labels, manifest = BASE_MANIFEST }) { + const dir = mkdtempSync(join(tmpdir(), 'refine-idx-')); + writeFileSync(join(dir, '_ground-truth.json'), JSON.stringify(gt)); + if (labels) writeFileSync(join(dir, '_labels.json'), JSON.stringify(labels)); + writeFileSync(join(dir, 'manifest.json'), JSON.stringify(manifest, null, 2)); + return dir; +} + +// Build a _labels.json index ({ lower: [{sheet,col,row,text}] }) from +// [addr, text] pairs — the same shape the Rust parser emits. 
+function labelsFrom(pairs) { + const idx = {}; + for (const [addr, text] of pairs) { + const bang = addr.lastIndexOf('!'); + const sheet = addr.slice(0, bang); + const m = addr.slice(bang + 1).match(/^([A-Z]+)(\d+)$/); + (idx[text.toLowerCase()] ||= []).push({ sheet, col: m[1], row: +m[2], text }); + } + return idx; +} + +// A clean PE-summary ground truth: metric labels in col A, values in col C, +// plus a block of *unlabeled* numerics (a stand-in for a giant PP&E grid) that +// refine must never consult. +function summaryGt() { + const gt = { + 'Summary!A1': 'Gross IRR', 'Summary!C1': 0.185, + 'Summary!A2': 'Net IRR', 'Summary!C2': 0.151, + 'Summary!A3': 'Gross MOIC', 'Summary!C3': 2.85, + 'Summary!A4': 'Net MOIC', 'Summary!C4': 2.45, + 'Summary!A5': 'Peak Net Equity', 'Summary!C5': 270_000_000, + }; + for (let i = 1; i <= 50; i++) gt[`PPE!D${i}`] = 1000 + i; // unlabeled grid + return gt; +} + +const EXPECTED = { + 'Gross IRR': 'Summary!C1', + 'Net IRR': 'Summary!C2', + 'Gross MOIC': 'Summary!C3', + 'Net MOIC': 'Summary!C4', + 'Equity Basis / Peak Equity': 'Summary!C5', +}; + +// --------------------------------------------------------------------------- +// 1) Correctness — finds metrics via _labels.json +// --------------------------------------------------------------------------- +console.log('Testing: refine finds metrics via _labels.json'); +{ + const gt = summaryGt(); + const labels = labelsFrom([ + ['Summary!A1', 'Gross IRR'], ['Summary!A2', 'Net IRR'], + ['Summary!A3', 'Gross MOIC'], ['Summary!A4', 'Net MOIC'], + ['Summary!A5', 'Peak Net Equity'], + ]); + const dir = makeDir({ gt, labels }); + const r = runManifestRefine(dir, { apply: false }); + for (const [label, cell] of Object.entries(EXPECTED)) { + assert(r.found[label]?.cell === cell, `${label} -> ${cell} (got ${r.found[label]?.cell})`); + } + rmSync(dir, { recursive: true, force: true }); +} + +// --------------------------------------------------------------------------- +// 2) Parity — 
_labels.json path == legacy GT-scan fallback +// --------------------------------------------------------------------------- +console.log('Testing: identical result with _labels.json vs GT-scan fallback'); +{ + const gt = summaryGt(); + const labels = labelsFrom([ + ['Summary!A1', 'Gross IRR'], ['Summary!A2', 'Net IRR'], + ['Summary!A3', 'Gross MOIC'], ['Summary!A4', 'Net MOIC'], + ['Summary!A5', 'Peak Net Equity'], + ]); + const dirIdx = makeDir({ gt, labels }); + const dirScan = makeDir({ gt, labels: null }); // no _labels.json -> fallback + const withIdx = runManifestRefine(dirIdx, { apply: false }); + const fallback = runManifestRefine(dirScan, { apply: false }); + + assert(Object.keys(withIdx.found).length === Object.keys(fallback.found).length, + `same field count (idx ${Object.keys(withIdx.found).length} vs scan ${Object.keys(fallback.found).length})`); + for (const key of Object.keys(withIdx.found)) { + assert(withIdx.found[key].cell === fallback.found[key]?.cell, + `parity for ${key}: idx=${withIdx.found[key].cell} scan=${fallback.found[key]?.cell}`); + } + rmSync(dirIdx, { recursive: true, force: true }); + rmSync(dirScan, { recursive: true, force: true }); +} + +// --------------------------------------------------------------------------- +// 3) Lazy probe — far/gapped column + value-range filtering +// --------------------------------------------------------------------------- +console.log('Testing: lazy probe handles gapped far columns and value ranges'); +{ + // Exit Multiple's value sits in a far column (AA, gaps before it); a + // near-column decimal is out of the [1,50] range and must be rejected. 
+ const gt = { + 'Summary!A1': 'Exit Multiple', + 'Summary!B1': 0.5, // out of range -> rejected + 'Summary!AA1': 18, // in range, far column -> selected + }; + const labels = labelsFrom([['Summary!A1', 'Exit Multiple']]); + const dir = makeDir({ gt, labels }); + const r = runManifestRefine(dir, { apply: false }); + assert(r.found['Exit Multiple']?.cell === 'Summary!AA1', + `far-column probe past gaps + range filter (got ${r.found['Exit Multiple']?.cell})`); + rmSync(dir, { recursive: true, force: true }); +} + +// --------------------------------------------------------------------------- +// 4) Consumption proof — a label present only in the index is still resolved +// --------------------------------------------------------------------------- +console.log('Testing: refine consumes _labels.json (label only in index, not GT)'); +{ + // No 'Summary!A7' label string in the GT — only the numeric. The label lives + // solely in _labels.json. Resolving it proves the index was the source. + const gt = { 'Summary!C7': 0.20 }; + const labels = labelsFrom([['Summary!A7', 'Gross IRR']]); + const dir = makeDir({ gt, labels }); + const r = runManifestRefine(dir, { apply: false }); + assert(r.found['Gross IRR']?.cell === 'Summary!C7', + `index-only label resolved (got ${r.found['Gross IRR']?.cell})`); + + // Inverse: with no _labels.json the GT scan cannot find a label absent from + // the GT — confirming the index, not a GT string, drove the match above. 
+ const dirScan = makeDir({ gt, labels: null }); + const rScan = runManifestRefine(dirScan, { apply: false }); + assert(!rScan.found['Gross IRR'], + 'GT-scan fallback cannot resolve a label that is absent from the ground truth'); + + rmSync(dir, { recursive: true, force: true }); + rmSync(dirScan, { recursive: true, force: true }); +} + +// --------------------------------------------------------------------------- +console.log(''); +console.log(`Results: ${passed} passed, ${failed} failed, ${passed + failed} total`); +process.exit(failed > 0 ? 1 : 0); From f52251be66b46d3b81a00641b2ecdb657f9a7f57 Mon Sep 17 00:00:00 2001 From: Eric Boothe Date: Thu, 28 May 2026 14:48:31 -0600 Subject: [PATCH 2/2] docs: refine label-index optimization (CHANGELOG/PLAN/ROADMAP/SKILL) CHANGELOG + PLAN entries for the _labels.json consumption + lazy numeric probes. ROADMAP: mark the pre-indexed label->cell item done for refine, with Tier B (parser-emitted row-values artifact) and the searchByLabel / init single-index follow-ups called out. SKILL: note refine is faster on big models (transparent). Co-Authored-By: Claude Opus 4.8 --- CHANGELOG.md | 42 ++++++++++++++++++++++++++++++++++++++++++ PLAN.md | 14 ++++++++++++++ ROADMAP.md | 18 +++++++++++++++--- skill/SKILL.md | 6 ++++++ 4 files changed, 77 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6181758..3d31f8f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,47 @@ # excel-to-engine — Changelog +## 2026-05-28 — refine consumes `_labels.json` + lazy numeric probes + +`ete manifest refine` rebuilt a full label+numeric index over the **entire** +ground truth on every run (`buildIndex`), even though it only ever inspects +numerics on a *matched label's own row*. On big models the bulk of that work +indexed giant **unlabeled** grids (e.g. a 190 MB PP&E depreciation schedule) +that the refiner never consults — pure waste. 
(Investigation also found refine +did **not** consume the parser's `_labels.json` at all, despite that index +existing since V4.) + +### What changed + +- **Labels now come from `chunked/_labels.json`** when the parser emitted it — + an O(labels) read instead of scanning every cell. Legacy engines without the + index fall back to a one-time GT scan (`buildLabelIndex`), so nothing breaks. +- **Numerics are resolved lazily, per matched row**, by probing that row's + columns on demand (`numericsForRow`, memoized) — instead of bucketing every + numeric in a multi-million-cell workbook up front. The giant unlabeled grids + are never touched. +- **Behavior-preserving:** the candidate ranking, dedup, value-range, and + summary/rollup/hint logic are untouched. The full manifest + ship-ready + suites stay green. + +### Impact + +The eliminated `buildIndex` pass scales with *total* cell count; the new probe +cost scales with *matched label rows* (a few dozen). On a synthetic giant-grid +ground truth the removed pass alone was ~1.4 s (1.4 M cells) / ~7.9 s (6.4 M +cells); end-to-end refine now finishes in less time than the old index build +took. The remaining floor is the unavoidable JSON parse of the ground truth — a +follow-up could lift that with a parser-emitted row-values artifact (see +ROADMAP), and the same lazy-numerics treatment could be extended to +`searchByLabel` (the `query` / `carry` path). + +### Tests + +- `tests/cli/test-refine-label-index.mjs` (14), wired into `npm test`: + correctness off `_labels.json`; **parity** between the index path and the + GT-scan fallback; lazy-probe far/gapped columns + value ranges; and a + **consumption proof** — a label present only in the index (not as a GT + string) is still resolved, which the fallback provably cannot do. 
+ ## 2026-05-28 — Continuous integration (GitHub Actions) The test suite is now substantial (132 JS assertions across 7 suites, plus the diff --git a/PLAN.md b/PLAN.md index 29a66b4..8826a0a 100644 --- a/PLAN.md +++ b/PLAN.md @@ -1,5 +1,19 @@ # excel-to-engine — Plan +## Status: refine label-index optimization — landed 2026-05-28 + +`ete manifest refine` now sources labels from the parser's `_labels.json` +(O(labels), no full GT scan) and resolves same-row numerics lazily by probing, +instead of bucketing every numeric in the workbook up front (`buildIndex`). The +giant unlabeled grids that dominate big models — the very thing that made refine +slow — are no longer touched. Behavior-preserving (rankings unchanged; suites +green). New `tests/cli/test-refine-label-index.mjs` (14) proves consumption + +parity. The remaining cost floor is the ground-truth JSON parse; lifting that +would need a parser-emitted row-values artifact (Tier B). The same lazy-numerics +treatment is still open for `searchByLabel` (the `query`/`carry` path), and the +per-command GT re-parse multiplier in `init` (generate → refine → doctor → maps +each reload the GT) remains a separate follow-up. + ## Status: Continuous integration — landed 2026-05-28 `.github/workflows/ci.yml` runs the full test matrix (Rust build + 11 unit diff --git a/ROADMAP.md b/ROADMAP.md index 4c1051f..d9c94df 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -76,9 +76,21 @@ when we next touch the monitor server or auth surface. ### Manifest Refinement (continuing) - Model-family templates — recognize a family by its sheet signature and pick known cells directly (summary tabs, promote tab, etc.). -- Pre-indexed label→cell map built once during parsing (the session log noted - `manifest refine` took 2.5 min CPU on a 200 MB ground truth; a pre-index - from the Rust parser would cut this 10–100×). +- Pre-indexed label→cell map. 
+ - **Done (2026-05-28):** `ete manifest refine` now consumes the parser's + `chunked/_labels.json` for labels (it previously ignored it and rebuilt a + full label+numeric index over the whole GT) and resolves same-row numerics + lazily by probing — so it no longer indexes the giant unlabeled grids that + dominate big models. The removed `buildIndex` pass was ~7.9 s on a 6.4 M-cell + GT; the work skipped scales with total cell count. `test-refine-label-index`. + - **Still open (Tier B):** the remaining floor is the ground-truth JSON parse. + A parser-emitted *row-values* artifact (numerics for label-bearing rows + only) would let refine skip the GT entirely — a large win on giant-grid + models, ~GT-sized (no win) on dense-label models, so gate it on a + real-model size measurement first. + - **Still open:** apply the same lazy-numerics path to `searchByLabel` + (`query` / `carry`), and build the GT index *once* per `init` so + generate → refine → doctor → maps stop each re-parsing it. - Manifest migration tooling for model updates (vN → vN+1 shape diff). --- diff --git a/skill/SKILL.md b/skill/SKILL.md index 0d34a23..3a22e6b 100644 --- a/skill/SKILL.md +++ b/skill/SKILL.md @@ -108,6 +108,12 @@ Silently falls through to a normal parse if `chunked/_ground-truth.json` is missing — safe to default on when iterating. Turns the tighten-the-manifest loop from minutes to seconds. +The refine step inside that loop is also faster on big models: it reads labels +from the parser's `chunked/_labels.json` and probes only the matched rows for +values, instead of indexing every cell (it used to scan the whole ground truth, +including giant unlabeled grids it never consults). Transparent — same command, +same result. + **Default output is slim.** `ete init` drops the large debug/intermediate artifacts (`dependency-graph.json`, `_graph.json`, root `model-map.json`) once the dependency closures are baked into `named-outputs.json` / `named-inputs.json`.