Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions crates/ethos-cli/tests/pdf_parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,29 @@ fn successful_fixture_ids() -> Vec<String> {
.collect()
}

/// Collects the ids of manifest fixtures whose `subsets` list contains `"tables"`
/// but not `"failure"`, in manifest iteration order.
///
/// # Panics
///
/// Panics if an entry's `subsets` is not an array, or if a selected entry's `id`
/// is not a string.
fn table_fixture_ids() -> Vec<String> {
    let mut ids = Vec::new();
    for entry in fixture_manifest_entries() {
        let tags = entry["subsets"]
            .as_array()
            .expect("fixture subsets are an array");
        let tagged = |name: &str| tags.iter().any(|tag| tag.as_str() == Some(name));
        // Keep only table fixtures that are expected to parse successfully.
        if !tagged("tables") || tagged("failure") {
            continue;
        }
        let id = entry["id"].as_str().expect("fixture id is a string");
        ids.push(id.to_string());
    }
    ids
}

/// Returns the PDF path that `fixture_pdf_by_id` resolves for the `simple-text` fixture.
fn fixture_pdf() -> PathBuf {
    const FIXTURE_ID: &str = "simple-text";
    fixture_pdf_by_id(FIXTURE_ID)
}
Expand Down Expand Up @@ -350,6 +373,33 @@ fn doc_parse_text_and_markdown_exports_match_fixture_goldens_when_pdfium_is_conf
}
}

/// Golden check for table candidates.
///
/// For every fixture id returned by `table_fixture_ids`, runs
/// `doc parse <pdf> --format json`, requires `payload.tables` to be a non-empty
/// array, and hands that value to `assert_or_accept_golden` together with the
/// fixture's `tables.json` path.
///
/// The test returns early (with a note on stderr) when `pdfium_configured()`
/// reports that PDFium is unavailable.
#[test]
fn doc_parse_table_fixtures_match_table_goldens_when_pdfium_is_configured() {
    if !pdfium_configured() {
        eprintln!(
            "skipping table fixture golden test: ETHOS_PDFIUM_LIBRARY_PATH is not configured"
        );
        return;
    }

    for fixture_id in table_fixture_ids() {
        let fixture = fixture_pdf_by_id(&fixture_id);
        // Name the offending fixture instead of panicking with a bare `unwrap` message.
        let fixture_path = fixture
            .to_str()
            .unwrap_or_else(|| panic!("fixture path for {fixture_id} must be valid UTF-8"));
        let doc = parse_json(&["doc", "parse", fixture_path, "--format", "json"]);
        // Borrow the tables value in place; the golden helper only needs a reference,
        // so a deep clone of the JSON tree is unnecessary.
        let tables = &doc["payload"]["tables"];
        assert!(
            tables.as_array().is_some_and(|found| !found.is_empty()),
            "tables fixture {fixture_id} must emit at least one table candidate"
        );
        assert_or_accept_golden(fixture_dir_by_id(&fixture_id).join("tables.json"), tables);
    }
}

#[test]
fn parses_heading_fixture_and_exports_markdown_when_pdfium_is_configured() {
if !pdfium_configured() {
Expand Down
8 changes: 8 additions & 0 deletions fixtures/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ Successful parse fixtures also carry c14n stage goldens:
- `text.txt`: current alpha plain-text export rendered from `layout.json` element text order.
- `markdown.md`: current alpha Markdown export rendered from `layout.json`, including heading
prefixes and explicit list-item marker text for committed matching elements.
- `tables.json`: for fixtures tagged `tables`, the exact `payload.tables` artifact emitted by
`ethos doc parse --format json` under the pinned PDFium profile.

For successful fixtures, `validate_fixtures.py` also binds selected `fixture.json`
expectations to those committed goldens:
Expand All @@ -58,6 +60,10 @@ The text and Markdown export goldens are validated as exact UTF-8 bytes against
committed `layout.json` output. They are an internal Milestone B alpha guard for the current
trust-loop export path, not a broader document-conversion claim.

Table fixture goldens are validated as canonical JSON, checked against table/cell invariants,
and reference-checked against committed extraction/layout ids. They are an internal Milestone C
guard for current text-layer table candidate artifacts, not a broader table extraction claim.

Regenerate them only after reviewing parser/layout drift. First configure the pinned profile
artifact for your platform; for macOS arm64 this is:

Expand All @@ -69,6 +75,8 @@ ETHOS_ACCEPT_GOLDENS=1 cargo test --locked --test pdf_parse \
successful_fixtures_match_extraction_and_layout_goldens_when_pdfium_is_configured -- --exact
ETHOS_ACCEPT_GOLDENS=1 cargo test --locked --test pdf_parse \
doc_parse_text_and_markdown_exports_match_fixture_goldens_when_pdfium_is_configured -- --exact
ETHOS_ACCEPT_GOLDENS=1 cargo test --locked --test pdf_parse \
doc_parse_table_fixtures_match_table_goldens_when_pdfium_is_configured -- --exact
python3 fixtures/validate_fixtures.py
```

Expand Down
15 changes: 14 additions & 1 deletion fixtures/manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
"ligatures",
"lists",
"multi_column",
"rotation"
"rotation",
"tables"
],
"fixtures": [
{
Expand Down Expand Up @@ -141,6 +142,18 @@
"provenance": "Synthetic PDF generated by Ethos maintainers for WS-ENGINE parser smoke tests.",
"license": "CC0-1.0"
},
{
"id": "synthetic-table-regular-grid",
"file": "synthetic/table-regular-grid/document.pdf",
"sha256": "1ea9b32622e0bfc5b1465d9d78ad9eab8be31a7c8e760592ecc52c06cf0864ca",
"pages": 1,
"subsets": [
"born_digital",
"tables"
],
"provenance": "Synthetic PDF generated by Ethos maintainers for Milestone C regular-grid table candidate fixture coverage.",
"license": "CC0-1.0"
},
{
"id": "synthetic-two-columns",
"file": "synthetic/two-columns/document.pdf",
Expand Down
Binary file added fixtures/synthetic/table-regular-grid/document.pdf
Binary file not shown.
1 change: 1 addition & 0 deletions fixtures/synthetic/table-regular-grid/extraction.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"pages":[{"height":79200,"id":"p0001","index":1,"rotation":0,"width":61200}],"regions":[],"spans":[{"bbox":[7291,6341,10351,7213],"font_id":"subst:liberation-sans-regular","font_size_q":1200,"id":"s000001","origin_locator":{"first_origin":[7200,7200],"last_origin":[9733,7200],"policy":"origin-run-locator-v1"},"page":"p0001","text":"Name"},{"bbox":[18054,6326,21085,7213],"font_id":"subst:liberation-sans-regular","font_size_q":1200,"id":"s000002","origin_locator":{"first_origin":[18000,7200],"last_origin":[20467,7200],"policy":"origin-run-locator-v1"},"page":"p0001","text":"Score"},{"bbox":[7200,8741,10218,9838],"font_id":"subst:liberation-sans-regular","font_size_q":1200,"id":"s000003","origin_locator":{"first_origin":[7200,9600],"last_origin":[9601,9600],"policy":"origin-run-locator-v1"},"page":"p0001","text":"Alpha"},{"bbox":[18131,8737,19277,9613],"font_id":"subst:liberation-sans-regular","font_size_q":1200,"id":"s000004","origin_locator":{"first_origin":[18000,9600],"last_origin":[18667,9600],"policy":"origin-run-locator-v1"},"page":"p0001","text":"10"},{"bbox":[7288,11141,9618,12013],"font_id":"subst:liberation-sans-regular","font_size_q":1200,"id":"s000005","origin_locator":{"first_origin":[7200,12000],"last_origin":[9001,12000],"policy":"origin-run-locator-v1"},"page":"p0001","text":"Beta"},{"bbox":[18131,11137,19271,12000],"font_id":"subst:liberation-sans-regular","font_size_q":1200,"id":"s000006","origin_locator":{"first_origin":[18000,12000],"last_origin":[18667,12000],"policy":"origin-run-locator-v1"},"page":"p0001","text":"12"}],"warnings":[]}
16 changes: 16 additions & 0 deletions fixtures/synthetic/table-regular-grid/fixture.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
"id": "synthetic-table-regular-grid",
"kind": "pdf",
"provenance": "Synthetic PDF generated by Ethos maintainers for Milestone C regular-grid table candidate fixture coverage.",
"license": "CC0-1.0",
"subsets": ["born_digital", "tables"],
"description": "One-page synthetic PDF with a 3x2 regular text grid for table candidate artifact validation.",
"document": "document.pdf",
"sha256": "1ea9b32622e0bfc5b1465d9d78ad9eab8be31a7c8e760592ecc52c06cf0864ca",
"pages": 1,
"expected_text": ["Name", "Alpha", "Beta", "Score", "10", "12"],
"expected_span_text": ["Name", "Score", "Alpha", "10", "Beta", "12"],
"expected_pages": 1,
"expected_elements": 6,
"expected_element_types": ["text_block", "text_block", "text_block", "text_block", "text_block", "text_block"]
}
1 change: 1 addition & 0 deletions fixtures/synthetic/table-regular-grid/layout.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"elements":[{"bbox":[7291,6341,10351,7213],"id":"e000001","page":"p0001","span_refs":["s000001"],"text":"Name","type":"text_block"},{"bbox":[7200,8741,10218,9838],"id":"e000002","page":"p0001","span_refs":["s000003"],"text":"Alpha","type":"text_block"},{"bbox":[7288,11141,9618,12013],"id":"e000003","page":"p0001","span_refs":["s000005"],"text":"Beta","type":"text_block"},{"bbox":[18054,6326,21085,7213],"id":"e000004","page":"p0001","span_refs":["s000002"],"text":"Score","type":"text_block"},{"bbox":[18131,8737,19277,9613],"id":"e000005","page":"p0001","span_refs":["s000004"],"text":"10","type":"text_block"},{"bbox":[18131,11137,19271,12000],"id":"e000006","page":"p0001","span_refs":["s000006"],"text":"12","type":"text_block"}],"warnings":[]}
11 changes: 11 additions & 0 deletions fixtures/synthetic/table-regular-grid/markdown.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
Name

Alpha

Beta

Score

10

12
1 change: 1 addition & 0 deletions fixtures/synthetic/table-regular-grid/tables.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[{"bbox":[7200,6326,21085,12013],"cells":[{"bbox":[7291,6341,10351,7213],"col":0,"col_span":1,"row":0,"row_span":1,"span_refs":["s000001"],"text":"Name"},{"bbox":[18054,6326,21085,7213],"col":1,"col_span":1,"row":0,"row_span":1,"span_refs":["s000002"],"text":"Score"},{"bbox":[7200,8741,10218,9838],"col":0,"col_span":1,"row":1,"row_span":1,"span_refs":["s000003"],"text":"Alpha"},{"bbox":[18131,8737,19277,9613],"col":1,"col_span":1,"row":1,"row_span":1,"span_refs":["s000004"],"text":"10"},{"bbox":[7288,11141,9618,12013],"col":0,"col_span":1,"row":2,"row_span":1,"span_refs":["s000005"],"text":"Beta"},{"bbox":[18131,11137,19271,12000],"col":1,"col_span":1,"row":2,"row_span":1,"span_refs":["s000006"],"text":"12"}],"confidence":700,"header_cols":0,"header_rows":1,"id":"t0001","n_cols":2,"n_rows":3,"page_refs":["p0001"]}]
11 changes: 11 additions & 0 deletions fixtures/synthetic/table-regular-grid/text.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
Name

Alpha

Beta

Score

10

12
118 changes: 118 additions & 0 deletions fixtures/validate_fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,17 @@

# Directory that contains this script; fixture paths in diagnostics are reported
# relative to it.
ROOT = Path(__file__).resolve().parent
# Parent of ROOT (NOTE(review): presumably the repository root — confirm).
REPO_ROOT = ROOT.parent
# Make <REPO_ROOT>/schemas importable so the import below can be resolved.
sys.path.insert(0, str(REPO_ROOT / "schemas"))

from table_model_validation import diagnose_table_model # noqa: E402

# Fixture manifest file, located next to this script.
MANIFEST = ROOT / "manifest.json"
ALLOWED_CATEGORIES = {"failure", "public", "security", "synthetic"}
# Key sets for the manifest, its fixture entries, and the extraction/layout goldens.
# NOTE(review): whether these are enforced as exact or minimum key sets is decided
# by code outside this excerpt — confirm before relying on either reading.
MANIFEST_KEYS = {"manifest_version", "root", "subsets_declared", "fixtures"}
ENTRY_KEYS = {"id", "file", "sha256", "pages", "subsets", "provenance", "license"}
EXTRACTION_GOLDEN_KEYS = {"pages", "spans", "regions", "warnings"}
LAYOUT_GOLDEN_KEYS = {"elements", "warnings"}
# File name of the table golden looked up inside each fixture directory.
TABLE_GOLDEN = "tables.json"
# File names of the plain-text and Markdown export goldens.
TEXT_EXPORT = "text.txt"
MARKDOWN_EXPORT = "markdown.md"
FOREIGN_MANIFEST_KEYS = {
Expand Down Expand Up @@ -260,6 +265,112 @@ def validate_export_goldens(fixture_dir: Path, layout) -> None:
)


def validate_table_goldens(fixture_dir: Path, metadata, extraction, layout) -> None:
    """Check the ``tables.json`` golden of one fixture directory.

    Fixtures whose ``subsets`` metadata does not list ``"tables"`` must not ship
    the golden at all. Tagged fixtures must ship it as a non-empty, canonically
    serialized JSON array that passes the scalar contract, the table-model
    diagnostics, and the reference checks against ``extraction`` and ``layout``.

    Every problem is reported through ``fail``; nothing is returned.
    """
    golden_path = fixture_dir / TABLE_GOLDEN
    ctx = str(golden_path.relative_to(ROOT))
    declared_subsets = metadata.get("subsets")
    tagged_as_tables = isinstance(declared_subsets, list) and "tables" in declared_subsets

    if not tagged_as_tables:
        # An untagged fixture carrying a table golden is itself an error.
        if golden_path.exists():
            fail(f"{ctx} exists but fixture is not tagged tables")
        return
    if not golden_path.is_file():
        fail(f"{ctx} missing for tables fixture")
        return

    golden = load_json(golden_path)
    if golden is None:
        # Nothing to inspect (presumably load_json already reported why).
        return
    if not isinstance(golden, list):
        fail(f"{ctx} must be an array")
        return

    validate_c14n_scalar_contract(golden, ctx)
    on_disk = golden_path.read_bytes()
    if on_disk != canonical_json_bytes(golden):
        fail(f"{ctx} must be canonical JSON with one trailing newline")
    if len(golden) == 0:
        fail(f"{ctx} must contain at least one table")

    for message in diagnose_table_model({"tables": golden}, ctx):
        fail(message)
    validate_table_refs(ctx, golden, extraction, layout)


def _string_ids(container, key: str) -> set:
    """Return the string ``id`` of every object listed under ``container[key]``.

    A non-dict ``container``, a missing key, or a value that is not a list all
    yield an empty set, so callers never iterate over ``None``.
    """
    items = container.get(key) if isinstance(container, dict) else None
    if not isinstance(items, list):
        return set()
    return {
        item["id"]
        for item in items
        if isinstance(item, dict) and isinstance(item.get("id"), str)
    }


def validate_table_refs(ctx: str, tables, extraction, layout) -> None:
    """Check that every reference in ``tables`` points at a committed id.

    Args:
        ctx: Prefix used in every diagnostic message.
        tables: Sequence of table objects to inspect.
        extraction: Extraction golden; supplies page, span and warning ids.
        layout: Layout golden; supplies element and warning ids.

    Reports via ``fail``:
        * table ``page_refs`` / ``warning_refs`` naming unknown ids,
        * cell ``span_refs`` / ``element_refs`` naming unknown ids,
        * cells that cite neither a span nor an element,
        * tables or cells that are not objects.

    Collections that are missing from the goldens are treated as empty, so a
    dangling reference surfaces as a ``fail`` diagnostic instead of a
    ``TypeError`` while building the id sets.
    """
    page_ids = _string_ids(extraction, "pages")
    span_ids = _string_ids(extraction, "spans")
    element_ids = _string_ids(layout, "elements")
    # Warnings may be declared by either stage; a table may cite any of them.
    warning_ids = _string_ids(extraction, "warnings") | _string_ids(layout, "warnings")

    for table_index, table in enumerate(tables):
        table_ctx = f"{ctx} tables[{table_index}]"
        if not isinstance(table, dict):
            fail(f"{table_ctx} must be an object")
            continue
        # Fall back to the position when the table carries no id of its own.
        table_id = table.get("id", f"index-{table_index}")
        for ref in string_ref_array(table.get("page_refs", []), f"{table_ctx}.page_refs"):
            if ref not in page_ids:
                fail(f"{table_ctx} references unknown page '{ref}'")
        for ref in string_ref_array(
            table.get("warning_refs", []), f"{table_ctx}.warning_refs"
        ):
            if ref not in warning_ids:
                fail(f"{table_ctx} references unknown warning '{ref}'")
        cells = table.get("cells", [])
        if not isinstance(cells, list):
            # No diagnostic here; this function only checks references.
            continue
        for cell_index, cell in enumerate(cells):
            if not isinstance(cell, dict):
                fail(f"{table_ctx} cells[{cell_index}] must be an object")
                continue
            cell_ctx = f"{table_ctx} cell[{cell_index}]"
            span_refs = string_ref_array(cell.get("span_refs", []), f"{cell_ctx}.span_refs")
            element_refs = string_ref_array(
                cell.get("element_refs", []), f"{cell_ctx}.element_refs"
            )
            for ref in span_refs:
                if ref not in span_ids:
                    fail(f"{cell_ctx} references unknown span '{ref}'")
            for ref in element_refs:
                if ref not in element_ids:
                    fail(f"{cell_ctx} references unknown element '{ref}'")
            # Every cell must be traceable to at least one source span or element.
            if not span_refs and not element_refs:
                fail(f"{cell_ctx} in table {table_id} must cite span_refs or element_refs")


def string_ref_array(value, ctx: str) -> list[str]:
    """Return the non-empty string items of ``value`` in their original order.

    ``value`` must be a list; otherwise one diagnostic is reported through
    ``fail`` and an empty list is returned. Items that are not non-empty
    strings are reported individually (by index) and left out of the result.
    """
    if not isinstance(value, list):
        fail(f"{ctx} must be an array")
        return []
    accepted: list[str] = []
    for position, candidate in enumerate(value):
        if isinstance(candidate, str) and candidate:
            accepted.append(candidate)
            continue
        fail(f"{ctx}[{position}] must be a non-empty string")
    return accepted


def validate_projection_items(ctx: str, key: str, value, required: bool) -> None:
if not isinstance(value, list):
return
Expand Down Expand Up @@ -742,6 +853,12 @@ def validate_stage_expectations(metadata_path: Path, metadata, extraction, layou
extraction_golden,
layout_golden,
)
validate_table_goldens(
fixture_dir,
metadata,
extraction_golden,
layout_golden,
)

if indexed_files != sorted(indexed_files):
fail("manifest fixture entries must be sorted by file")
Expand Down Expand Up @@ -771,6 +888,7 @@ def validate_stage_expectations(metadata_path: Path, metadata, extraction, layou
ok("successful fixture goldens have valid stage metadata")
ok("successful fixture metadata expectations match committed stage goldens")
ok("successful fixture text and Markdown exports match committed layout goldens")
ok("tables fixture goldens match committed extraction and layout refs")
ok(f"foreign fixture manifests bind {foreign_package_count} package(s) to committed hashes")
ok(f"font-isolation manifest binds {font_isolation_fixture_count} PDF fixture(s)")

Expand Down
Loading