diff --git a/crates/ethos-cli/tests/pdf_parse.rs b/crates/ethos-cli/tests/pdf_parse.rs index 0738521..550daae 100644 --- a/crates/ethos-cli/tests/pdf_parse.rs +++ b/crates/ethos-cli/tests/pdf_parse.rs @@ -98,6 +98,10 @@ fn heading_export_fixture_pdf() -> PathBuf { fixture_pdf_by_id("synthetic-heading-export") } +fn list_items_fixture_pdf() -> PathBuf { + fixture_pdf_by_id("synthetic-list-items") +} + fn two_column_fixture_pdf() -> PathBuf { fixture_pdf_by_id("synthetic-two-columns") } @@ -391,6 +395,50 @@ fn parses_heading_fixture_and_exports_markdown_when_pdfium_is_configured() { ); } +#[test] +fn parses_flat_list_items_and_exports_markdown_when_pdfium_is_configured() { + if !pdfium_configured() { + eprintln!( + "skipping list item export fixture test: ETHOS_PDFIUM_LIBRARY_PATH is not configured" + ); + return; + } + + let fixture = list_items_fixture_pdf(); + let doc = parse_json(&[ + "doc", + "parse", + fixture.to_str().unwrap(), + "--format", + "json", + ]); + let elements = doc["payload"]["elements"].as_array().unwrap(); + assert_eq!(elements.len(), 2); + assert_eq!(elements[0]["type"], "list_item"); + assert_eq!(elements[0]["text"], "- Verify cited evidence"); + assert_eq!(elements[1]["type"], "list_item"); + assert_eq!(elements[1]["text"], "2. Keep explicit"); + + let output = run_ethos(&[ + "doc", + "parse", + fixture.to_str().unwrap(), + "--format", + "markdown", + ]); + assert!( + output.status.success(), + "ethos doc parse --format markdown failed for list item fixture\nstatus: {:?}\nstderr:\n{}\nstdout:\n{}", + output.status.code(), + String::from_utf8_lossy(&output.stderr), + String::from_utf8_lossy(&output.stdout) + ); + assert_eq!( + String::from_utf8(output.stdout).expect("markdown stdout is UTF-8"), + "- Verify cited evidence\n\n2. 
/// Returns `true` when `text` begins with an ordered list marker followed by
/// item content.
///
/// A marker is one or more ASCII digits immediately followed by `.`, and it
/// must be separated from the content by a space (`"2. Keep explicit"`).
/// The text after that first space must contain at least one non-whitespace
/// character, so a bare `"2."` or `"2.   "` is not a list item.
///
/// Leading whitespace is not skipped here; pass already left-trimmed text.
///
/// Note: the digit run is unbounded, so a line that merely starts with a
/// number and a full stop (for example `"2023. Annual report"`) also matches.
fn has_ordered_list_marker(text: &str) -> bool {
    // Everything before the first space is the candidate marker; no space
    // means there is no room for both a marker and content.
    let Some((marker, rest)) = text.split_once(' ') else {
        return false;
    };
    if rest.trim().is_empty() {
        return false;
    }
    // `strip_suffix` checks for the trailing dot and yields the digit run in
    // one step, so no manual length check or byte slicing is needed.
    marker
        .strip_suffix('.')
        .is_some_and(|digits| !digits.is_empty() && digits.chars().all(|ch| ch.is_ascii_digit()))
}
let extraction = extraction(vec![ + span("s000001", "p0001", QRect::new(0, 0, 300, 500).unwrap(), "-"), + span( + "s000002", + "p0001", + QRect::new(450, 0, 1_200, 500).unwrap(), + "Verify", + ), + span( + "s000003", + "p0001", + QRect::new(1_350, 0, 2_000, 500).unwrap(), + "inputs", + ), + span( + "s000004", + "p0001", + QRect::new(0, 900, 350, 1_400).unwrap(), + "2.", + ), + span( + "s000005", + "p0001", + QRect::new(500, 900, 1_300, 1_400).unwrap(), + "Record", + ), + span( + "s000006", + "p0001", + QRect::new(1_450, 900, 2_200, 1_400).unwrap(), + "result", + ), + ]); + + let output = BasicLayoutEngine.layout(&extraction).unwrap(); + + assert_eq!(output.elements.len(), 2); + assert!(output + .elements + .iter() + .all(|element| element.element_type == ElementType::ListItem)); + assert_eq!(output.elements[0].text.as_deref(), Some("- Verify inputs")); + assert_eq!(output.elements[1].text.as_deref(), Some("2. Record result")); + } + + #[test] + fn list_items_split_surrounding_paragraph_text() { + let extraction = extraction(vec![ + span( + "s000001", + "p0001", + QRect::new(0, 0, 700, 500).unwrap(), + "Intro", + ), + span( + "s000002", + "p0001", + QRect::new(850, 0, 1_200, 500).unwrap(), + "text", + ), + span( + "s000003", + "p0001", + QRect::new(0, 900, 300, 1_400).unwrap(), + "-", + ), + span( + "s000004", + "p0001", + QRect::new(450, 900, 1_200, 1_400).unwrap(), + "Check", + ), + span( + "s000005", + "p0001", + QRect::new(1_350, 900, 2_000, 1_400).unwrap(), + "claim", + ), + span( + "s000006", + "p0001", + QRect::new(0, 1_800, 700, 2_300).unwrap(), + "Outro", + ), + span( + "s000007", + "p0001", + QRect::new(850, 1_800, 1_200, 2_300).unwrap(), + "text", + ), + ]); + + let output = BasicLayoutEngine.layout(&extraction).unwrap(); + + assert_eq!(output.elements.len(), 3); + assert_eq!(output.elements[0].element_type, ElementType::TextBlock); + assert_eq!(output.elements[0].text.as_deref(), Some("Intro text")); + assert_eq!(output.elements[1].element_type, 
ElementType::ListItem); + assert_eq!(output.elements[1].text.as_deref(), Some("- Check claim")); + assert_eq!(output.elements[2].element_type, ElementType::TextBlock); + assert_eq!(output.elements[2].text.as_deref(), Some("Outro text")); + } + #[test] fn classifies_large_line_as_heading_before_paragraph_merge() { let extraction = extraction(vec![ diff --git a/docs/execution-status.md b/docs/execution-status.md index 67de83b..69c5219 100644 --- a/docs/execution-status.md +++ b/docs/execution-status.md @@ -14,14 +14,14 @@ The committed implementation now includes: - A pinned Phase 1 PDFium profile in `docs/pdfium-profile.md` and `profiles/ethos-deterministic-v1.json`: `chromium/7881`, V8/XFA disabled, platform artifact hashes, runtime library hashes, and provenance are recorded. - Runtime checks that reject missing or mismatched PDFium versions, release artifacts, and extracted libraries with stable errors before dynamic loading. - `ethos doc parse` / `ethos fingerprint` PDF execution through a worker process with `max_parse_ms` timeout enforcement, stable error-envelope relay, diagnostics-gated worker stderr, and page-range validation/filtering. -- Quantized page/span extraction at the backend boundary, plus a basic deterministic layout pass that assembles paragraph `text_block` elements, fixture-backed alpha heading elements, and simple column reading order for the current born-digital fixtures. Fixture validation binds selected `fixture.json` expectations to committed extraction/layout goldens and binds current alpha text/Markdown exports to committed layout output so current read-order, element-type, heading-export, and export cases fail closed on drift. +- Quantized page/span extraction at the backend boundary, plus a basic deterministic layout pass that assembles paragraph `text_block` elements, fixture-backed alpha heading and flat list-item elements, and simple column reading order for the current born-digital fixtures. 
Fixture validation binds selected `fixture.json` expectations to committed extraction/layout goldens and binds current alpha text/Markdown exports to committed layout output so current read-order, element-type, heading-export, list-item, and export cases fail closed on drift. - Schema/example/profile validation is green through `schemas/validate_examples.py` using `jsonschema` draft 2020-12 validation, including the crop descriptor artifact contract plus referential-integrity and bbox sanity checks outside JSON Schema. - `ethos verify` now produces non-empty quote, value, presence, and table-cell verification checks over native Ethos document JSON and synthetic OpenDataLoader-style JSON through `--grounding opendataloader-json`; it also verifies quote/value/presence citations over pinned real OpenDataLoader 2.4.7 JSON, including grounded and ungrounded cases. Citation/config inputs are rejected when they drift outside the closed schemas. The public demo harness covers grounded, ungrounded, split-quote, not-found, stale-fingerprint, unsupported non-v1 claim, capability-limited, malformed-citation, malformed OpenDataLoader-style input, and summary-format reject paths. - Verification semantics are now trust-honest at alpha scope: quote containment is explicitly labeled, value/table-cell checks require normalized equality, fingerprint-pinned citations fail closed when source fingerprints are unavailable, and structured capability limits explain why a run is downgraded. 
- `make verify-alpha` is the current alpha trust-loop command: it checks native examples, split-quote evidence matching, unsupported non-v1 claim reporting, synthetic OpenDataLoader-style examples, pinned real OpenDataLoader grounded/ungrounded examples, schema validation, verify-alpha case inventory coverage, usage diagnostics for malformed citations and malformed OpenDataLoader-style structures, byte-identical repeated verification reports, byte-identical native crop descriptors, summary diagnostics for an ungrounded native case, and foreign fixture manifest hash binding. - Native Ethos verification can emit deterministic, schema-backed crop descriptor JSON artifacts through `--crop-dir`; these bind `document_fingerprint`, page, bbox, and check ids. Native `crop_ref` filenames are logical evidence references derived from document fingerprint, check id, and page, while descriptors still record the exact observed bbox. When `--crop-source-pdf` is supplied, the CLI validates source-PDF fingerprint binding and emits PNG crop artifacts whose filenames, byte hashes, dimensions, and source fingerprint are bound from the descriptor. `make verify-rendered-crops` checks same-host repeated-run stability for the rendered artifact path, and `make compare-rendered-crops` classifies two rendered-crop runs by separating logical evidence identity from rendered artifact byte equality. Cross-platform rendered image determinism is not claimed; the 2026-06-14 macOS arm64 vs Linux x64 validation record in `docs/validation/rendered-crops-2026-06-14.md` preserved document fingerprint and `payload_sha256` but failed rendered artifact byte equality because the evidence bbox differed slightly across platforms. 
-Still absent or not claimable: public benchmark reports, public competitor-comparison claims, public speed/quality/footprint claims, OCR/image-only support, real table extraction, mature list/heading/layout semantics, semantic/arithmetic verification beyond deterministic evidence lookup, Phase 2 project-maintained PDFium builds, release packaging, and claim-audit approval for any public result wording. +Still absent or not claimable: public benchmark reports, public competitor-comparison claims, public speed/quality/footprint claims, OCR/image-only support, real table extraction, mature list/heading/layout semantics beyond current fixture-backed alpha paths, semantic/arithmetic verification beyond deterministic evidence lookup, Phase 2 project-maintained PDFium builds, release packaging, and claim-audit approval for any public result wording. ## Human / External Blockers @@ -49,7 +49,7 @@ Milestone A has an accepted internal Gate Zero decision for roadmap control, so | PDFium Phase 1 profile | Landed: pinned profile, V8/XFA-disabled state, platform hashes, runtime library hashes, and provenance are recorded | Phase 2 project-maintained builds still block Public Beta | | PDFium loader/runtime checks | Landed: missing/mismatched version, artifact, and runtime library hashes fail deterministically | Release packaging and operator setup path still need hardening | | Real PDF backend | Landed for simple born-digital PDFs: page count, quantized spans, worker execution, timeout, page filtering, and fingerprint path exist | Wider corpus coverage, failure fixtures, memory-limit behavior, quirk log, and Gate Zero run are still missing | -| Layout groundwork | Landed: basic paragraph text blocks, fixture-backed alpha heading elements, simple column reading order over quantized spans, fixture metadata checks against committed extraction/layout goldens for current read-order and element-type expectations, and alpha text/Markdown export goldens derived from committed layout 
output | Tables, lists, rotation/quirk handling, and confidence policy remain future work | +| Layout groundwork | Landed: basic paragraph text blocks, fixture-backed alpha heading and flat list-item elements, simple column reading order over quantized spans, fixture metadata checks against committed extraction/layout goldens for current read-order and element-type expectations, and alpha text/Markdown export goldens derived from committed layout output | Tables, nested/richer list and heading semantics, rotation/quirk handling, and confidence policy remain future work | | Font policy groundwork | Partially landed: substitution table and profile policy are present; fixture output uses deterministic substitution IDs | Bundled fallback asset hashing and broader font/CID validation remain open | | Schema/example validation | Landed: schemas, examples, deterministic profile, referential integrity, and bbox sanity pass the `jsonschema` validation gate | Contract changes still require explicit versioning and compatibility review | | Trust-layer implementation | Landed: `ethos verify` quote/value/presence/table-cell checks, explicit quote-containment labeling, normalized equality for value/table-cell checks, stale and unverifiable fingerprint handling, unsupported claim reporting, structured capability limits, native Ethos JSON path, ODL-style adapter path with synthetic table/cell mapping, pinned real OpenDataLoader 2.4.7 grounded/ungrounded fixtures, foreign fixture manifest hash validation, crop-ref evidence plumbing, stable logical native crop refs, native crop descriptor artifacts, raw BGRA crop rendering in `ethos-pdf`, CLI PNG crop artifact production for bound native source PDFs, same-host rendered crop repeatability check, rendered-crop run comparison helper, strict citation/config input validation, citation input schema, split-quote fixture coverage, explicit unsupported non-v1 claim reporting, OpenDataLoader-style structure diagnostics for malformed bbox and 
unknown-page references, verify-alpha case inventory checks, and demo fixtures | Still needed: real OpenDataLoader table-cell grounding, additional adapter hardening against broader real output shapes, future claim-kind expansion outside the current v1 alpha policy, and a decision on whether cross-platform rendered crop artifact equality is worth pursuing after the current macOS/Linux bbox drift finding | diff --git a/fixtures/README.md b/fixtures/README.md index 05f52a1..04cdb3a 100644 --- a/fixtures/README.md +++ b/fixtures/README.md @@ -38,7 +38,7 @@ Successful parse fixtures also carry c14n stage goldens: - `layout.json`: `ethos_core::traits::LayoutOutput` after deterministic layout grouping. - `text.txt`: current alpha plain-text export rendered from `layout.json` element text order. - `markdown.md`: current alpha Markdown export rendered from `layout.json`, including heading - prefixes for committed heading elements. + prefixes and explicit list-item marker text for committed matching elements. For successful fixtures, `validate_fixtures.py` also binds selected `fixture.json` expectations to those committed goldens: diff --git a/fixtures/manifest.json b/fixtures/manifest.json index e0a4b12..4e54dc0 100644 --- a/fixtures/manifest.json +++ b/fixtures/manifest.json @@ -8,6 +8,7 @@ "headings", "hyphenation", "ligatures", + "lists", "multi_column", "rotation" ], @@ -105,6 +106,18 @@ "provenance": "Synthetic PDF generated by Ethos maintainers for WS-ENGINE embedded-font fi-ligature extraction tests. 
The document embeds a maintainer-authored Type3 subset font with a ToUnicode map for the fi ligature.", "license": "CC0-1.0" }, + { + "id": "synthetic-list-items", + "file": "synthetic/list-items/document.pdf", + "sha256": "475e22fcc5d71e24d90204e0b06d82ca5620eebef7ec88aaad29378af9de4f1a", + "pages": 1, + "subsets": [ + "born_digital", + "lists" + ], + "provenance": "Synthetic PDF generated by Ethos maintainers for Milestone B alpha list-item fixture coverage.", + "license": "CC0-1.0" + }, { "id": "synthetic-rotation-90", "file": "synthetic/rotation-90/document.pdf", diff --git a/fixtures/synthetic/list-items/document.pdf b/fixtures/synthetic/list-items/document.pdf new file mode 100644 index 0000000..0c55ac1 Binary files /dev/null and b/fixtures/synthetic/list-items/document.pdf differ diff --git a/fixtures/synthetic/list-items/extraction.json b/fixtures/synthetic/list-items/extraction.json new file mode 100644 index 0000000..a13a6d5 --- /dev/null +++ b/fixtures/synthetic/list-items/extraction.json @@ -0,0 +1 @@ 
+{"pages":[{"height":79200,"id":"p0001","index":1,"rotation":0,"width":61200}],"regions":[],"spans":[{"bbox":[7238,6836,7562,6942],"font_id":"subst:liberation-sans-regular","font_size_q":1200,"id":"s000001","origin_locator":{"first_origin":[7200,7200],"last_origin":[7200,7200],"policy":"origin-run-locator-v1"},"page":"p0001","text":"-"},{"bbox":[7938,6326,10990,7451],"font_id":"subst:liberation-sans-regular","font_size_q":1200,"id":"s000002","origin_locator":{"first_origin":[7933,7200],"last_origin":[10400,7200],"policy":"origin-run-locator-v1"},"page":"p0001","text":"Verify"},{"bbox":[11381,6341,13782,7213],"font_id":"subst:liberation-sans-regular","font_size_q":1200,"id":"s000003","origin_locator":{"first_origin":[11334,7200],"last_origin":[13201,7200],"policy":"origin-run-locator-v1"},"page":"p0001","text":"cited"},{"bbox":[14246,6341,18955,7213],"font_id":"subst:liberation-sans-regular","font_size_q":1200,"id":"s000004","origin_locator":{"first_origin":[14202,7200],"last_origin":[18337,7200],"policy":"origin-run-locator-v1"},"page":"p0001","text":"evidence"},{"bbox":[7235,8737,8096,9600],"font_id":"subst:liberation-sans-regular","font_size_q":1200,"id":"s000005","origin_locator":{"first_origin":[7200,9600],"last_origin":[7867,9600],"policy":"origin-run-locator-v1"},"page":"p0001","text":"2."},{"bbox":[8622,8741,11288,9838],"font_id":"subst:liberation-sans-regular","font_size_q":1200,"id":"s000006","origin_locator":{"first_origin":[8534,9600],"last_origin":[10669,9600],"policy":"origin-run-locator-v1"},"page":"p0001","text":"Keep"},{"bbox":[11714,8741,15329,9838],"font_id":"subst:liberation-sans-regular","font_size_q":1200,"id":"s000007","origin_locator":{"first_origin":[11670,9600],"last_origin":[15004,9600],"policy":"origin-run-locator-v1"},"page":"p0001","text":"explicit"}],"warnings":[]} diff --git a/fixtures/synthetic/list-items/fixture.json b/fixtures/synthetic/list-items/fixture.json new file mode 100644 index 0000000..d9fe7b5 --- /dev/null +++ 
b/fixtures/synthetic/list-items/fixture.json @@ -0,0 +1,16 @@ +{ + "id": "synthetic-list-items", + "kind": "pdf", + "provenance": "Synthetic PDF generated by Ethos maintainers for Milestone B alpha list-item fixture coverage.", + "license": "CC0-1.0", + "subsets": ["born_digital", "lists"], + "description": "One-page synthetic PDF with two flat marker-prefixed list item lines for alpha list-item layout and export coverage.", + "document": "document.pdf", + "sha256": "475e22fcc5d71e24d90204e0b06d82ca5620eebef7ec88aaad29378af9de4f1a", + "pages": 1, + "expected_text": ["- Verify cited evidence", "2. Keep explicit"], + "expected_span_text": ["-", "Verify", "cited", "evidence", "2.", "Keep", "explicit"], + "expected_pages": 1, + "expected_elements": 2, + "expected_element_types": ["list_item", "list_item"] +} diff --git a/fixtures/synthetic/list-items/layout.json b/fixtures/synthetic/list-items/layout.json new file mode 100644 index 0000000..4bc166c --- /dev/null +++ b/fixtures/synthetic/list-items/layout.json @@ -0,0 +1 @@ +{"elements":[{"bbox":[7238,6326,18955,7451],"id":"e000001","page":"p0001","span_refs":["s000001","s000002","s000003","s000004"],"text":"- Verify cited evidence","type":"list_item"},{"bbox":[7235,8737,15329,9838],"id":"e000002","page":"p0001","span_refs":["s000005","s000006","s000007"],"text":"2. Keep explicit","type":"list_item"}],"warnings":[]} diff --git a/fixtures/synthetic/list-items/markdown.md b/fixtures/synthetic/list-items/markdown.md new file mode 100644 index 0000000..816dd1b --- /dev/null +++ b/fixtures/synthetic/list-items/markdown.md @@ -0,0 +1,3 @@ +- Verify cited evidence + +2. Keep explicit diff --git a/fixtures/synthetic/list-items/text.txt b/fixtures/synthetic/list-items/text.txt new file mode 100644 index 0000000..816dd1b --- /dev/null +++ b/fixtures/synthetic/list-items/text.txt @@ -0,0 +1,3 @@ +- Verify cited evidence + +2. Keep explicit