ThomAub · ThomAub · Apr 19, 2026 · Apr 19, 2026 · Apr 22, 2026
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -83,3 +83,5 @@ uv run pytest ../../crates/tests/snapshots/ --force-regen -q
 - Pre-commit hooks via prek: `cargo fmt`, `cargo clippy`, `oxlint`, `ruff`
 - Snapshot tests live in `crates/tests/rust_snapshots/` (Rust) and `crates/tests/snapshots/` (Python)
 - Fixtures in `examples/data/`
+- External benchmark integrations live in `integrations/` (e.g.
+  `integrations/parsebench/` for the run-llama/ParseBench PARSE provider)
diff --git a/crates/officemd_pdf/src/pdf_inspector/extractor/interpreter.rs b/crates/officemd_pdf/src/pdf_inspector/extractor/interpreter.rs
@@ -27,12 +27,27 @@ pub(crate) struct ExtractionSink {
     pub(crate) rects: Vec<PdfRect>,
 }
 
+/// One entry on the q/Q graphics-state stack. Per PDF 32000-1 §8.4.2, the
+/// saved state includes the text state (Tc, Tw, Th, TL, Tf, Tfs, Tr, Trise).
+/// We therefore capture the font name and size alongside the CTM, so `Q`
+/// restores the font that was active before the matching `q`. Otherwise a
+/// `/Fx Tf` inside a nested graphics block leaks out and misroutes later
+/// decoding through the wrong font's ToUnicode CMap.
+#[derive(Debug, Clone)]
+pub(crate) struct GraphicsStackEntry {
+    ctm: [f32; 6],
+    fill_is_white: bool,
+    text_rendering_mode: i32,
+    current_font: String,
+    current_font_size: f32,
+}
+
 #[derive(Debug)]
 pub(crate) struct GraphicsState {
     pub(crate) ctm: [f32; 6],
     fill_is_white: bool,
     text_rendering_mode: i32,
-    stack: Vec<([f32; 6], bool, i32)>,
+    stack: Vec<GraphicsStackEntry>,
 }
 
 impl GraphicsState {
@@ -583,17 +598,21 @@ where
         trace!("{} {:?}", op.operator, op.operands);
         match op.operator.as_str() {
             "q" => {
-                graphics_state.stack.push((
-                    graphics_state.ctm,
-                    graphics_state.fill_is_white,
-                    graphics_state.text_rendering_mode,
-                ));
+                graphics_state.stack.push(GraphicsStackEntry {
+                    ctm: graphics_state.ctm,
+                    fill_is_white: graphics_state.fill_is_white,
+                    text_rendering_mode: graphics_state.text_rendering_mode,
+                    current_font: text_state.current_font.clone(),
+                    current_font_size: text_state.current_font_size,
+                });
             }
             "Q" => {
-                if let Some((saved_ctm, saved_fill, saved_tr)) = graphics_state.stack.pop() {
-                    graphics_state.ctm = saved_ctm;
-                    graphics_state.fill_is_white = saved_fill;
-                    graphics_state.text_rendering_mode = saved_tr;
+                if let Some(saved) = graphics_state.stack.pop() {
+                    graphics_state.ctm = saved.ctm;
+                    graphics_state.fill_is_white = saved.fill_is_white;
+                    graphics_state.text_rendering_mode = saved.text_rendering_mode;
+                    text_state.current_font = saved.current_font;
+                    text_state.current_font_size = saved.current_font_size;
                 }
             }
             "cm" => {

diff --git a/crates/officemd_pdf/src/pdf_inspector/extractor/mod.rs b/crates/officemd_pdf/src/pdf_inspector/extractor/mod.rs
@@ -270,6 +270,7 @@ pub(crate) fn merge_text_items(items: Vec<TextItem>) -> Vec<TextItem> {
             let mut text = first.text.clone();
             let mut end_x = first.x + first.width;
             let x_gap_max = first.font_size * 0.75;
+            let mut prev = first;
 
             let mut j = i + 1;
             while j < group.len() {
@@ -281,7 +282,25 @@ pub(crate) fn merge_text_items(items: Vec<TextItem>) -> Vec<TextItem> {
                 if (next.font_size - first.font_size).abs() > first.font_size * 0.20 {
                     break;
                 }
-                let gap = next.x - end_x;
+                let raw_gap = next.x - end_x;
+                // Some CID fonts declare only a default width (DW) with no W array, so every
+                // glyph — even narrow punctuation like an en-dash — reports the same wide
+                // advance. That inflates `end_x` and can mask real whitespace between items
+                // placed via their own `Tm` operator. Only trust that diagnosis when the
+                // overlap is a substantial fraction of the reported width; tight letter
+                // kerning produces a tiny negative gap that we still want to treat as a
+                // no-space continuation.
+                let gap = if raw_gap < 0.0 && -raw_gap > prev.width * 0.25 {
+                    let prev_chars = prev.text.chars().count().max(1) as f32;
+                    let honest_end = prev.x + prev_chars * prev.font_size * 0.5;
+                    if next.x > honest_end {
+                        next.x - honest_end
+                    } else {
+                        raw_gap
+                    }
+                } else {
+                    raw_gap
+                };
                 if gap > x_gap_max {
                     break;
                 }
@@ -294,6 +313,7 @@ pub(crate) fn merge_text_items(items: Vec<TextItem>) -> Vec<TextItem> {
                 }
                 text.push_str(&next.text);
                 end_x = next.x + next.width;
+                prev = next;
                 j += 1;
             }
 

diff --git a/crates/tests/rust_snapshots/tests/snapshots/pdf__openxml_whitepaper_ir.snap b/crates/tests/rust_snapshots/tests/snapshots/pdf__openxml_whitepaper_ir.snap
diff --git a/crates/tests/rust_snapshots/tests/snapshots/pdf__openxml_whitepaper_markdown.snap b/crates/tests/rust_snapshots/tests/snapshots/pdf__openxml_whitepaper_markdown.snap
diff --git a/integrations/parsebench/README.md b/integrations/parsebench/README.md
@@ -0,0 +1,170 @@
+# OfficeMD ↔ ParseBench integration
+
+A drop-in [ParseBench][parsebench] `PARSE` provider that invokes the local
+OfficeMD CLI on a PDF and normalizes the JSON document into ParseBench's
+`ParseOutput`. It ships with a helper script that materializes a non-OCR
+slice of a document corpus using OfficeMD's own classifier.
+
+The first benchmark pass targets **text-based, non-OCR PDFs** so results are
+comparable with ParseBench's local baselines (`pypdf_baseline`,
+`pymupdf_text`).
+
+[parsebench]: https://github.com/run-llama/ParseBench
+
+## Layout
+
+```
+integrations/parsebench/
+├── pyproject.toml
+├── src/parse_bench/
+│   └── inference/
+│       ├── providers/parse/officemd_local.py   # Provider (drop-in module path)
+│       └── pipelines/officemd_pipelines.py     # register_officemd_pipelines(register_fn)
+├── scripts/classify_non_ocr_pdfs.py            # Materialize the non-OCR slice
+└── tests/test_officemd_local.py                # Provider unit tests
+```
+
+The provider module lives under the `parse_bench.inference.providers.parse`
+package path so it can be imported as-is from a ParseBench checkout. The
+package is distributed as `parse-bench-officemd`; its Hatch-built wheel
+installs into the same `parse_bench` namespace as ParseBench itself.
+
+## Installation
+
+Install into the same virtualenv as your ParseBench checkout. From the
+OfficeMD repo root:
+
+```sh
+uv pip install -e integrations/parsebench
+# or:
+pip install -e integrations/parsebench
+```
+
+Then patch the ParseBench pipeline registry to register the new pipeline.
+Edit `src/parse_bench/inference/pipelines/parse.py` (in the ParseBench
+checkout) to call `register_officemd_pipelines`:
+
+```python
+from parse_bench.inference.pipelines.officemd_pipelines import (
+    register_officemd_pipelines,
+)
+
+def register_parse_pipelines(register_fn):
+    # ...existing registrations...
+    register_officemd_pipelines(register_fn)
+```
+
+No upstream changes are required beyond this single call; the provider
+auto-registers via `@register_provider("officemd_local")` when the pipeline
+module is imported.
+
+## Running
+
+Point the provider at an OfficeMD checkout (either via env var or pipeline
+config) and run the pipeline:
+
+```sh
+export OFFICEMD_REPO_ROOT=/path/to/officemd
+uv run parse-bench pipelines                          # confirms officemd_local is listed
+uv run parse-bench run officemd_local --test --group text_content
+uv run parse-bench run officemd_local --test --group text_formatting
+```
+
+By default the provider invokes `cargo run --release -p officemd_cli --
+stream <pdf> --output-format json --pretty` from `repo_root`. To use a
+prebuilt binary instead:
+
+```python
+PipelineSpec(
+    pipeline_name="officemd_local_binary",
+    provider_name="officemd_local",
+    product_type=ProductType.PARSE,
+    config={
+        "cargo_run": False,
+        "binary": "/path/to/target/release/officemd",
+        "extra_args": ["--no-headers-footers"],
+    },
+)
+```
+
+### Provider config reference
+
+| Key | Type | Default | Description |
+|---|---|---|---|
+| `cargo_run` | bool | `True` | Invoke `cargo run` from `repo_root` |
+| `repo_root` | str | `$OFFICEMD_REPO_ROOT` | Absolute path to the workspace root (required when `cargo_run=True`) |
+| `cargo_profile` | str | `"release"` | `release`, `dev`, or a custom profile name |
+| `binary` | str | `None` | Prebuilt binary path (required when `cargo_run=False`) |
+| `extra_args` | list[str] | `[]` | Extra CLI flags appended after the input path |
+| `timeout_seconds` | float | `600` | Per-file subprocess timeout |
+
+## Normalization rules
+
+- `pdf.pages[].number` → `PageIR.page_index = number - 1`
+- `pdf.pages[].markdown` → `PageIR.markdown`
+- Document `markdown` is page markdown joined with a single blank line
+- `layout_pages` is intentionally left empty in v1; layout attribution is
+  not wired through OfficeMD yet
+- The full OfficeMD JSON document (including `pdf.diagnostics`) is preserved
+  in `raw_output["document"]` for downstream analysis
+
+## Slicing the non-OCR benchmark subset
+
+After the dataset documents are present, run the classifier to select PDFs
+that OfficeMD considers `TextBased` with no pages requiring OCR:
+
+```sh
+uv run integrations/parsebench/scripts/classify_non_ocr_pdfs.py \
+    --input-dir ~/parsebench/data/documents \
+    --report-jsonl non_ocr_report.jsonl \
+    --manifest non_ocr_manifest.txt
+```
+
+Outputs:
+
+- `non_ocr_report.jsonl` — one row per PDF with classification, confidence,
+  page count, `pages_needing_ocr`, and a `non_ocr` boolean. Keep this
+  alongside the benchmark run for later drill-down.
+- `non_ocr_manifest.txt` — the filtered list of qualifying PDFs; feed it to
+  ParseBench (or a small wrapper) to restrict the run rather than editing
+  the upstream dataset files.
+
+A PDF is selected when the classification is `TextBased` **and**
+`pages_needing_ocr` is empty, matching OfficeMD's own definition of
+"pure text-based".
+
+## Comparison workflow
+
+Run the same non-OCR slice through OfficeMD and the local baselines:
+
+```sh
+uv run parse-bench run officemd_local  --test --group text_content
+uv run parse-bench run pypdf_baseline  --test --group text_content
+uv run parse-bench run pymupdf_text    --test --group text_content
+```
+
+Repeat for `text_formatting` and `table`. `chart` is intentionally skipped
+in the first pass — OfficeMD emits text markdown but no chart-specific
+structured payload yet, so a chart comparison would degenerate into a text
+comparison. Revisit once chart metadata is part of the PDF JSON payload.
+
+## Testing
+
+```sh
+uv pip install -e "integrations/parsebench[dev]"
+uv run pytest integrations/parsebench/tests -q
+```
+
+The unit tests stub `subprocess.run` and do not require `cargo`, a real PDF,
+or a live OfficeMD checkout. They do require `parse_bench` to be installed
+in the environment; otherwise they are skipped via `pytest.importorskip`.
+
+## Assumptions and non-goals
+
+- Invocation uses `cargo run` by default so benchmark runs always exercise
+  the current tree. Switch to `binary` mode for stable runs.
+- ParseBench layout attribution is **not** wired up in v1; the goal is to
+  improve non-OCR parse quality, not overlay reconstruction.
+- If cleaner page-boundary or formatting semantics are needed than the
+  current CLI JSON exposes, add them to the OfficeMD PDF payload; do **not**
+  paper over the gap with ParseBench-specific post-processing.
diff --git a/integrations/parsebench/pyproject.toml b/integrations/parsebench/pyproject.toml
@@ -0,0 +1,26 @@
+[project]
+name = "parse-bench-officemd"
+version = "0.1.0"
+description = "OfficeMD local provider for run-llama/ParseBench (PARSE task)"
+readme = "README.md"
+requires-python = ">=3.10"
+license = { text = "MIT" }
+
+# `parse_bench` is assumed to be installed alongside this package (editable
+# install of the ParseBench checkout or a released wheel). It is intentionally
+# left out of `dependencies` so this package can be added to an existing
+# ParseBench environment without fighting version pins.
+dependencies = []
+
+[project.optional-dependencies]
+dev = ["pytest>=7"]
+
+[build-system]
+requires = ["hatchling>=1.18"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/parse_bench"]
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]