From 0a64304845fc655d00b705540914f16a6b3a58e5 Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 17 Jun 2026 12:12:51 +0530 Subject: [PATCH] Add table candidate fixture golden Signed-off-by: docushell-admin --- crates/ethos-cli/tests/pdf_parse.rs | 50 ++++++++ fixtures/README.md | 8 ++ fixtures/manifest.json | 15 ++- .../synthetic/table-regular-grid/document.pdf | Bin 0 -> 729 bytes .../table-regular-grid/extraction.json | 1 + .../synthetic/table-regular-grid/fixture.json | 16 +++ .../synthetic/table-regular-grid/layout.json | 1 + .../synthetic/table-regular-grid/markdown.md | 11 ++ .../synthetic/table-regular-grid/tables.json | 1 + .../synthetic/table-regular-grid/text.txt | 11 ++ fixtures/validate_fixtures.py | 118 ++++++++++++++++++ 11 files changed, 231 insertions(+), 1 deletion(-) create mode 100644 fixtures/synthetic/table-regular-grid/document.pdf create mode 100644 fixtures/synthetic/table-regular-grid/extraction.json create mode 100644 fixtures/synthetic/table-regular-grid/fixture.json create mode 100644 fixtures/synthetic/table-regular-grid/layout.json create mode 100644 fixtures/synthetic/table-regular-grid/markdown.md create mode 100644 fixtures/synthetic/table-regular-grid/tables.json create mode 100644 fixtures/synthetic/table-regular-grid/text.txt diff --git a/crates/ethos-cli/tests/pdf_parse.rs b/crates/ethos-cli/tests/pdf_parse.rs index 550daae..fd1e15c 100644 --- a/crates/ethos-cli/tests/pdf_parse.rs +++ b/crates/ethos-cli/tests/pdf_parse.rs @@ -86,6 +86,29 @@ fn successful_fixture_ids() -> Vec { .collect() } +fn table_fixture_ids() -> Vec { + fixture_manifest_entries() + .into_iter() + .filter(|entry| { + let subsets = entry["subsets"] + .as_array() + .expect("fixture subsets are an array"); + subsets + .iter() + .any(|subset| subset.as_str() == Some("tables")) + && !subsets + .iter() + .any(|subset| subset.as_str() == Some("failure")) + }) + .map(|entry| { + entry["id"] + .as_str() + .expect("fixture id is a string") + .to_string() + }) + .collect() +} + fn fixture_pdf() -> PathBuf { fixture_pdf_by_id("simple-text") } @@ -350,6 +373,33 @@ fn doc_parse_text_and_markdown_exports_match_fixture_goldens_when_pdfium_is_conf } } +#[test] +fn doc_parse_table_fixtures_match_table_goldens_when_pdfium_is_configured() { + if !pdfium_configured() { + eprintln!( + "skipping table fixture golden test: ETHOS_PDFIUM_LIBRARY_PATH is not configured" + ); + return; + } + + for fixture_id in table_fixture_ids() { + let fixture = fixture_pdf_by_id(&fixture_id); + let doc = parse_json(&[ + "doc", + "parse", + fixture.to_str().unwrap(), + "--format", + "json", + ]); + let tables = doc["payload"]["tables"].clone(); + assert!( + tables.as_array().is_some_and(|tables| !tables.is_empty()), + "tables fixture {fixture_id} must emit at least one table candidate" + ); + assert_or_accept_golden(fixture_dir_by_id(&fixture_id).join("tables.json"), &tables); + } +} + #[test] fn parses_heading_fixture_and_exports_markdown_when_pdfium_is_configured() { if !pdfium_configured() { diff --git a/fixtures/README.md b/fixtures/README.md index 4f86682..c23a7c1 100644 --- a/fixtures/README.md +++ b/fixtures/README.md @@ -39,6 +39,8 @@ Successful parse fixtures also carry c14n stage goldens: - `text.txt`: current alpha plain-text export rendered from `layout.json` element text order. - `markdown.md`: current alpha Markdown export rendered from `layout.json`, including heading prefixes and explicit list-item marker text for committed matching elements. +- `tables.json`: for fixtures tagged `tables`, the exact `payload.tables` artifact emitted by + `ethos doc parse --format json` under the pinned PDFium profile. For successful fixtures, `validate_fixtures.py` also binds selected `fixture.json` expectations to those committed goldens: @@ -58,6 +60,10 @@ The text and Markdown export goldens are validated as exact UTF-8 bytes against committed `layout.json` output. They are an internal Milestone B alpha guard for the current trust-loop export path, not a broader document-conversion claim. +Table fixture goldens are validated as canonical JSON, checked against table/cell invariants, +and reference-checked against committed extraction/layout ids. They are an internal Milestone C +guard for current text-layer table candidate artifacts, not a broader table extraction claim. + Regenerate them only after reviewing parser/layout drift. First configure the pinned profile artifact for your platform; for macOS arm64 this is: @@ -69,6 +75,8 @@ ETHOS_ACCEPT_GOLDENS=1 cargo test --locked --test pdf_parse \ successful_fixtures_match_extraction_and_layout_goldens_when_pdfium_is_configured -- --exact ETHOS_ACCEPT_GOLDENS=1 cargo test --locked --test pdf_parse \ doc_parse_text_and_markdown_exports_match_fixture_goldens_when_pdfium_is_configured -- --exact +ETHOS_ACCEPT_GOLDENS=1 cargo test --locked --test pdf_parse \ + doc_parse_table_fixtures_match_table_goldens_when_pdfium_is_configured -- --exact python3 fixtures/validate_fixtures.py ``` diff --git a/fixtures/manifest.json b/fixtures/manifest.json index 4e54dc0..c13b59b 100644 --- a/fixtures/manifest.json +++ b/fixtures/manifest.json @@ -10,7 +10,8 @@ "ligatures", "lists", "multi_column", - "rotation" + "rotation", + "tables" ], "fixtures": [ { @@ -141,6 +142,18 @@ "provenance": "Synthetic PDF generated by Ethos maintainers for WS-ENGINE parser smoke tests.", "license": "CC0-1.0" }, + { + "id": "synthetic-table-regular-grid", + "file": "synthetic/table-regular-grid/document.pdf", + "sha256": "1ea9b32622e0bfc5b1465d9d78ad9eab8be31a7c8e760592ecc52c06cf0864ca", + "pages": 1, + "subsets": [ + "born_digital", + "tables" + ], + "provenance": "Synthetic PDF generated by Ethos maintainers for Milestone C regular-grid table candidate fixture coverage.", + "license": "CC0-1.0" + }, { "id": "synthetic-two-columns", "file": "synthetic/two-columns/document.pdf", diff --git a/fixtures/synthetic/table-regular-grid/document.pdf b/fixtures/synthetic/table-regular-grid/document.pdf new file mode 100644 index 0000000000000000000000000000000000000000..35243a44e747d92840e93c6b8b2212d0ef30c382 GIT binary patch literal 729 zcmZuv%}&BV5WeqI%!Nb`y4^wxB!t8&jfsB*dLtedSfHeI$#ya5OM37P^hKOmN~P+i zP1^5g{&pI}oA^Rpwpioy>-~cX@IX)3tk(mcylxfnYiVVsm%xW|sS5B>FoJ%cseDdC z`mY-fI)0bV3z#%X{)|!dF1H{~jKuskQW~RBddR~c)I61u-owO0eJFfrcYTaEQia}` z85Ts9#u^>&LqLl%Vjg-hZEi7I0YQl#%O+JlyNkicyQy_;Q}Y6NBnx$5ZdLZ8Y&w%C zJy6tRlzWvg?FvLEAVFb`k{cE!jK>17>}26)>){OA*o5yvvVpTlxl!kktSe4*yowyp zw5hY`47;IoUS->r{9`?`!mCl_!Rk; LK5H}vPci!jk4?DP literal 0 HcmV?d00001 diff --git a/fixtures/synthetic/table-regular-grid/extraction.json b/fixtures/synthetic/table-regular-grid/extraction.json new file mode 100644 index 0000000..f5c4469 --- /dev/null +++ b/fixtures/synthetic/table-regular-grid/extraction.json @@ -0,0 +1 @@ +{"pages":[{"height":79200,"id":"p0001","index":1,"rotation":0,"width":61200}],"regions":[],"spans":[{"bbox":[7291,6341,10351,7213],"font_id":"subst:liberation-sans-regular","font_size_q":1200,"id":"s000001","origin_locator":{"first_origin":[7200,7200],"last_origin":[9733,7200],"policy":"origin-run-locator-v1"},"page":"p0001","text":"Name"},{"bbox":[18054,6326,21085,7213],"font_id":"subst:liberation-sans-regular","font_size_q":1200,"id":"s000002","origin_locator":{"first_origin":[18000,7200],"last_origin":[20467,7200],"policy":"origin-run-locator-v1"},"page":"p0001","text":"Score"},{"bbox":[7200,8741,10218,9838],"font_id":"subst:liberation-sans-regular","font_size_q":1200,"id":"s000003","origin_locator":{"first_origin":[7200,9600],"last_origin":[9601,9600],"policy":"origin-run-locator-v1"},"page":"p0001","text":"Alpha"},{"bbox":[18131,8737,19277,9613],"font_id":"subst:liberation-sans-regular","font_size_q":1200,"id":"s000004","origin_locator":{"first_origin":[18000,9600],"last_origin":[18667,9600],"policy":"origin-run-locator-v1"},"page":"p0001","text":"10"},{"bbox":[7288,11141,9618,12013],"font_id":"subst:liberation-sans-regular","font_size_q":1200,"id":"s000005","origin_locator":{"first_origin":[7200,12000],"last_origin":[9001,12000],"policy":"origin-run-locator-v1"},"page":"p0001","text":"Beta"},{"bbox":[18131,11137,19271,12000],"font_id":"subst:liberation-sans-regular","font_size_q":1200,"id":"s000006","origin_locator":{"first_origin":[18000,12000],"last_origin":[18667,12000],"policy":"origin-run-locator-v1"},"page":"p0001","text":"12"}],"warnings":[]} diff --git a/fixtures/synthetic/table-regular-grid/fixture.json b/fixtures/synthetic/table-regular-grid/fixture.json new file mode 100644 index 0000000..0d2328a --- /dev/null +++ b/fixtures/synthetic/table-regular-grid/fixture.json @@ -0,0 +1,16 @@ +{ + "id": "synthetic-table-regular-grid", + "kind": "pdf", + "provenance": "Synthetic PDF generated by Ethos maintainers for Milestone C regular-grid table candidate fixture coverage.", + "license": "CC0-1.0", + "subsets": ["born_digital", "tables"], + "description": "One-page synthetic PDF with a 3x2 regular text grid for table candidate artifact validation.", + "document": "document.pdf", + "sha256": "1ea9b32622e0bfc5b1465d9d78ad9eab8be31a7c8e760592ecc52c06cf0864ca", + "pages": 1, + "expected_text": ["Name", "Alpha", "Beta", "Score", "10", "12"], + "expected_span_text": ["Name", "Score", "Alpha", "10", "Beta", "12"], + "expected_pages": 1, + "expected_elements": 6, + "expected_element_types": ["text_block", "text_block", "text_block", "text_block", "text_block", "text_block"] +} diff --git a/fixtures/synthetic/table-regular-grid/layout.json b/fixtures/synthetic/table-regular-grid/layout.json new file mode 100644 index 0000000..cb1bc8d --- /dev/null +++ b/fixtures/synthetic/table-regular-grid/layout.json @@ -0,0 +1 @@ +{"elements":[{"bbox":[7291,6341,10351,7213],"id":"e000001","page":"p0001","span_refs":["s000001"],"text":"Name","type":"text_block"},{"bbox":[7200,8741,10218,9838],"id":"e000002","page":"p0001","span_refs":["s000003"],"text":"Alpha","type":"text_block"},{"bbox":[7288,11141,9618,12013],"id":"e000003","page":"p0001","span_refs":["s000005"],"text":"Beta","type":"text_block"},{"bbox":[18054,6326,21085,7213],"id":"e000004","page":"p0001","span_refs":["s000002"],"text":"Score","type":"text_block"},{"bbox":[18131,8737,19277,9613],"id":"e000005","page":"p0001","span_refs":["s000004"],"text":"10","type":"text_block"},{"bbox":[18131,11137,19271,12000],"id":"e000006","page":"p0001","span_refs":["s000006"],"text":"12","type":"text_block"}],"warnings":[]} diff --git a/fixtures/synthetic/table-regular-grid/markdown.md b/fixtures/synthetic/table-regular-grid/markdown.md new file mode 100644 index 0000000..6f6c0d2 --- /dev/null +++ b/fixtures/synthetic/table-regular-grid/markdown.md @@ -0,0 +1,11 @@ +Name + +Alpha + +Beta + +Score + +10 + +12 diff --git a/fixtures/synthetic/table-regular-grid/tables.json b/fixtures/synthetic/table-regular-grid/tables.json new file mode 100644 index 0000000..bedfa34 --- /dev/null +++ b/fixtures/synthetic/table-regular-grid/tables.json @@ -0,0 +1 @@ +[{"bbox":[7200,6326,21085,12013],"cells":[{"bbox":[7291,6341,10351,7213],"col":0,"col_span":1,"row":0,"row_span":1,"span_refs":["s000001"],"text":"Name"},{"bbox":[18054,6326,21085,7213],"col":1,"col_span":1,"row":0,"row_span":1,"span_refs":["s000002"],"text":"Score"},{"bbox":[7200,8741,10218,9838],"col":0,"col_span":1,"row":1,"row_span":1,"span_refs":["s000003"],"text":"Alpha"},{"bbox":[18131,8737,19277,9613],"col":1,"col_span":1,"row":1,"row_span":1,"span_refs":["s000004"],"text":"10"},{"bbox":[7288,11141,9618,12013],"col":0,"col_span":1,"row":2,"row_span":1,"span_refs":["s000005"],"text":"Beta"},{"bbox":[18131,11137,19271,12000],"col":1,"col_span":1,"row":2,"row_span":1,"span_refs":["s000006"],"text":"12"}],"confidence":700,"header_cols":0,"header_rows":1,"id":"t0001","n_cols":2,"n_rows":3,"page_refs":["p0001"]}] diff --git a/fixtures/synthetic/table-regular-grid/text.txt b/fixtures/synthetic/table-regular-grid/text.txt new file mode 100644 index 0000000..6f6c0d2 --- /dev/null +++ b/fixtures/synthetic/table-regular-grid/text.txt @@ -0,0 +1,11 @@ +Name + +Alpha + +Beta + +Score + +10 + +12 diff --git a/fixtures/validate_fixtures.py b/fixtures/validate_fixtures.py index 669acf5..ef386e0 100644 --- a/fixtures/validate_fixtures.py +++ b/fixtures/validate_fixtures.py @@ -38,12 +38,17 @@ ROOT = Path(__file__).resolve().parent REPO_ROOT = ROOT.parent +sys.path.insert(0, str(REPO_ROOT / "schemas")) + +from table_model_validation import diagnose_table_model # noqa: E402 + MANIFEST = ROOT / "manifest.json" ALLOWED_CATEGORIES = {"failure", "public", "security", "synthetic"} MANIFEST_KEYS = {"manifest_version", "root", "subsets_declared", "fixtures"} ENTRY_KEYS = {"id", "file", "sha256", "pages", "subsets", "provenance", "license"} EXTRACTION_GOLDEN_KEYS = {"pages", "spans", "regions", "warnings"} LAYOUT_GOLDEN_KEYS = {"elements", "warnings"} +TABLE_GOLDEN = "tables.json" TEXT_EXPORT = "text.txt" MARKDOWN_EXPORT = "markdown.md" FOREIGN_MANIFEST_KEYS = { @@ -260,6 +265,112 @@ def validate_export_goldens(fixture_dir: Path, layout) -> None: ) +def validate_table_goldens(fixture_dir: Path, metadata, extraction, layout) -> None: + ctx = str((fixture_dir / TABLE_GOLDEN).relative_to(ROOT)) + subsets = metadata.get("subsets") + has_table_subset = isinstance(subsets, list) and "tables" in subsets + path = fixture_dir / TABLE_GOLDEN + if not has_table_subset: + if path.exists(): + fail(f"{ctx} exists but fixture is not tagged tables") + return + if not path.is_file(): + fail(f"{ctx} missing for tables fixture") + return + + tables = load_json(path) + if tables is None: + return + if not isinstance(tables, list): + fail(f"{ctx} must be an array") + return + validate_c14n_scalar_contract(tables, ctx) + if path.read_bytes() != canonical_json_bytes(tables): + fail(f"{ctx} must be canonical JSON with one trailing newline") + if not tables: + fail(f"{ctx} must contain at least one table") + + for diagnostic in diagnose_table_model({"tables": tables}, ctx): + fail(diagnostic) + validate_table_refs(ctx, tables, extraction, layout) + + +def validate_table_refs(ctx: str, tables, extraction, layout) -> None: + pages = extraction.get("pages") if isinstance(extraction, dict) else [] + spans = extraction.get("spans") if isinstance(extraction, dict) else [] + elements = layout.get("elements") if isinstance(layout, dict) else [] + page_ids = { + page.get("id") + for page in pages + if isinstance(page, dict) and isinstance(page.get("id"), str) + } + span_ids = { + span.get("id") + for span in spans + if isinstance(span, dict) and isinstance(span.get("id"), str) + } + element_ids = { + element.get("id") + for element in elements + if isinstance(element, dict) and isinstance(element.get("id"), str) + } + warning_ids = set() + for warning in (extraction.get("warnings") if isinstance(extraction, dict) else []) or []: + if isinstance(warning, dict) and isinstance(warning.get("id"), str): + warning_ids.add(warning["id"]) + for warning in (layout.get("warnings") if isinstance(layout, dict) else []) or []: + if isinstance(warning, dict) and isinstance(warning.get("id"), str): + warning_ids.add(warning["id"]) + + for table_index, table in enumerate(tables): + table_ctx = f"{ctx} tables[{table_index}]" + if not isinstance(table, dict): + fail(f"{table_ctx} must be an object") + continue + table_id = table.get("id", f"index-{table_index}") + for ref in string_ref_array(table.get("page_refs", []), f"{table_ctx}.page_refs"): + if ref not in page_ids: + fail(f"{table_ctx} references unknown page '{ref}'") + for ref in string_ref_array( + table.get("warning_refs", []), f"{table_ctx}.warning_refs" + ): + if ref not in warning_ids: + fail(f"{table_ctx} references unknown warning '{ref}'") + cells = table.get("cells", []) + if not isinstance(cells, list): + continue + for cell_index, cell in enumerate(cells): + if not isinstance(cell, dict): + fail(f"{table_ctx} cells[{cell_index}] must be an object") + continue + cell_ctx = f"{table_ctx} cell[{cell_index}]" + span_refs = string_ref_array(cell.get("span_refs", []), f"{cell_ctx}.span_refs") + element_refs = string_ref_array( + cell.get("element_refs", []), f"{cell_ctx}.element_refs" + ) + for ref in span_refs: + if ref not in span_ids: + fail(f"{cell_ctx} references unknown span '{ref}'") + for ref in element_refs: + if ref not in element_ids: + fail(f"{cell_ctx} references unknown element '{ref}'") + if not span_refs and not element_refs: + fail(f"{cell_ctx} in table {table_id} must cite span_refs or element_refs") + + +def string_ref_array(value, ctx: str) -> list[str]: + if not isinstance(value, list): + fail(f"{ctx} must be an array") + return [] + refs = [] + for index, item in enumerate(value): + if not isinstance(item, str) or not item: + fail(f"{ctx}[{index}] must be a non-empty string") + else: + refs.append(item) + return refs + + def validate_projection_items(ctx: str, key: str, value, required: bool) -> None: if not isinstance(value, list): return @@ -742,6 +853,12 @@ def validate_stage_expectations(metadata_path: Path, metadata, extraction, layou extraction_golden, layout_golden, ) + validate_table_goldens( + fixture_dir, + metadata, + extraction_golden, + layout_golden, + ) if indexed_files != sorted(indexed_files): fail("manifest fixture entries must be sorted by file") @@ -771,6 +888,7 @@ def validate_stage_expectations(metadata_path: Path, metadata, extraction, layou ok("successful fixture goldens have valid stage metadata") ok("successful fixture metadata expectations match committed stage goldens") ok("successful fixture text and Markdown exports match committed layout goldens") + ok("tables fixture goldens match committed extraction and layout refs") ok(f"foreign fixture manifests bind {foreign_package_count} package(s) to committed hashes") ok(f"font-isolation manifest binds {font_isolation_fixture_count} PDF fixture(s)")