Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 61 additions & 0 deletions crates/ethos-cli/tests/pdf_parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,39 @@ fn assert_or_accept_golden(path: PathBuf, value: &Value) {
);
}

/// Checks one finished `ethos doc parse --format <format>` run against a golden file.
///
/// The run must have exited successfully, must not have written anything to stderr, and its
/// stdout must be byte-for-byte equal to the contents of `expected_path`.
///
/// When the `ETHOS_ACCEPT_GOLDENS` environment variable is exactly `1`, the captured stdout is
/// written to `expected_path` instead of being compared, and the function returns.
///
/// # Panics
///
/// Panics when the process failed, when stderr is non-empty, when the golden file cannot be
/// written (accept mode) or read (compare mode), or when stdout differs from the golden.
fn assert_stdout_matches_fixture_file(
    output: Output,
    expected_path: PathBuf,
    fixture_id: &str,
    format: &str,
) {
    assert!(
        output.status.success(),
        "ethos doc parse --format {format} failed for {fixture_id}\nstatus: {:?}\nstderr:\n{}\nstdout:\n{}",
        output.status.code(),
        String::from_utf8_lossy(&output.stderr),
        String::from_utf8_lossy(&output.stdout)
    );
    assert!(
        output.stderr.is_empty(),
        "ethos doc parse --format {format} wrote stderr for {fixture_id}:\n{}",
        String::from_utf8_lossy(&output.stderr)
    );
    if std::env::var("ETHOS_ACCEPT_GOLDENS").as_deref() == Ok("1") {
        std::fs::write(&expected_path, &output.stdout)
            .unwrap_or_else(|err| panic!("{} can be written: {err}", expected_path.display()));
        return;
    }
    let expected = std::fs::read(&expected_path)
        .unwrap_or_else(|err| panic!("{} is readable: {err}", expected_path.display()));
    // The comparison itself stays byte-exact. Only the failure report decodes the buffers:
    // `assert_eq!` on `Vec<u8>` would dump decimal byte arrays, which is unreadable for text
    // goldens. Debug-formatting the decoded text keeps newlines and other control characters
    // visible as escapes, so trailing-whitespace drift can still be spotted.
    assert!(
        output.stdout == expected,
        "stdout for {fixture_id} --format {format} must match {}\nexpected: {:?}\n  actual: {:?}",
        expected_path.display(),
        String::from_utf8_lossy(&expected),
        String::from_utf8_lossy(&output.stdout)
    );
}

#[test]
fn successful_fixtures_match_extraction_and_layout_goldens_when_pdfium_is_configured() {
if !pdfium_configured() {
Expand Down Expand Up @@ -281,6 +314,34 @@ fn successful_fixtures_match_extraction_and_layout_goldens_when_pdfium_is_config
}
}

#[test]
fn doc_parse_text_and_markdown_exports_match_fixture_goldens_when_pdfium_is_configured() {
    // Each entry pairs a `--format` value with the golden file name stored in the fixture
    // directory.
    const EXPORT_GOLDENS: [(&str, &str); 2] = [("text", "text.txt"), ("markdown", "markdown.md")];

    // Without a configured PDFium there is nothing to run; report the skip and stop.
    if !pdfium_configured() {
        eprintln!("skipping text/Markdown export golden test: ETHOS_PDFIUM_LIBRARY_PATH is not configured");
        return;
    }

    for fixture_id in successful_fixture_ids() {
        let pdf_path = fixture_pdf_by_id(&fixture_id);
        let golden_dir = fixture_dir_by_id(&fixture_id);

        for (format, golden_name) in EXPORT_GOLDENS {
            // Run `ethos doc parse <pdf> --format <format>` and compare its stdout with the
            // committed golden for this fixture.
            let cli_args = [
                "doc",
                "parse",
                pdf_path.to_str().unwrap(),
                "--format",
                format,
            ];
            let output = run_ethos(&cli_args);
            let golden_path = golden_dir.join(golden_name);
            assert_stdout_matches_fixture_file(output, golden_path, &fixture_id, format);
        }
    }
}

#[cfg(debug_assertions)]
#[test]
fn doc_parse_timeout_kills_pdfium_worker() {
Expand Down
4 changes: 2 additions & 2 deletions docs/execution-status.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ The committed implementation now includes:
- A pinned Phase 1 PDFium profile in `docs/pdfium-profile.md` and `profiles/ethos-deterministic-v1.json`: `chromium/7881`, V8/XFA disabled, platform artifact hashes, runtime library hashes, and provenance are recorded.
- Runtime checks that reject missing or mismatched PDFium versions, release artifacts, and extracted libraries with stable errors before dynamic loading.
- `ethos doc parse` / `ethos fingerprint` PDF execution through a worker process with `max_parse_ms` timeout enforcement, stable error-envelope relay, diagnostics-gated worker stderr, and page-range validation/filtering.
- Quantized page/span extraction at the backend boundary, plus a basic deterministic layout pass that assembles paragraph `text_block` elements and simple column reading order for the current born-digital fixtures. Fixture validation binds selected `fixture.json` expectations to committed extraction/layout goldens so current read-order and element-type cases fail closed on drift.
- Quantized page/span extraction at the backend boundary, plus a basic deterministic layout pass that assembles paragraph `text_block` elements and simple column reading order for the current born-digital fixtures. Fixture validation binds selected `fixture.json` expectations to committed extraction/layout goldens and binds current alpha text/Markdown exports to committed layout output so current read-order, element-type, and export cases fail closed on drift.
- Schema/example/profile validation is green through `schemas/validate_examples.py` using `jsonschema` draft 2020-12 validation, including the crop descriptor artifact contract plus referential-integrity and bbox sanity checks outside JSON Schema.
- `ethos verify` now produces non-empty quote, value, presence, and table-cell verification checks over native Ethos document JSON and synthetic OpenDataLoader-style JSON through `--grounding opendataloader-json`; it also verifies quote/value/presence citations over pinned real OpenDataLoader 2.4.7 JSON, including grounded and ungrounded cases. Citation/config inputs are rejected when they drift outside the closed schemas. The public demo harness covers grounded, ungrounded, split-quote, not-found, stale-fingerprint, unsupported non-v1 claim, capability-limited, malformed-citation, malformed OpenDataLoader-style input, and summary-format reject paths.
- Verification semantics are now trust-honest at alpha scope: quote containment is explicitly labeled, value/table-cell checks require normalized equality, fingerprint-pinned citations fail closed when source fingerprints are unavailable, and structured capability limits explain why a run is downgraded.
Expand Down Expand Up @@ -49,7 +49,7 @@ Milestone A has an accepted internal Gate Zero decision for roadmap control, so
| PDFium Phase 1 profile | Landed: pinned profile, V8/XFA-disabled state, platform hashes, runtime library hashes, and provenance are recorded | Phase 2 project-maintained builds still block Public Beta |
| PDFium loader/runtime checks | Landed: missing/mismatched version, artifact, and runtime library hashes fail deterministically | Release packaging and operator setup path still need hardening |
| Real PDF backend | Landed for simple born-digital PDFs: page count, quantized spans, worker execution, timeout, page filtering, and fingerprint path exist | Wider corpus coverage, failure fixtures, memory-limit behavior, quirk log, and Gate Zero run are still missing |
| Layout groundwork | Landed: basic paragraph text blocks, simple column reading order over quantized spans, and fixture metadata checks against committed extraction/layout goldens for current read-order and element-type expectations | Tables, headings, lists, rotation/quirk handling, and confidence policy remain future work |
| Layout groundwork | Landed: basic paragraph text blocks, simple column reading order over quantized spans, fixture metadata checks against committed extraction/layout goldens for current read-order and element-type expectations, and alpha text/Markdown export goldens derived from committed layout output | Tables, headings, lists, rotation/quirk handling, and confidence policy remain future work |
| Font policy groundwork | Partially landed: substitution table and profile policy are present; fixture output uses deterministic substitution IDs | Bundled fallback asset hashing and broader font/CID validation remain open |
| Schema/example validation | Landed: schemas, examples, deterministic profile, referential integrity, and bbox sanity pass the `jsonschema` validation gate | Contract changes still require explicit versioning and compatibility review |
| Trust-layer implementation | Landed: `ethos verify` quote/value/presence/table-cell checks, explicit quote-containment labeling, normalized equality for value/table-cell checks, stale and unverifiable fingerprint handling, unsupported claim reporting, structured capability limits, native Ethos JSON path, ODL-style adapter path with synthetic table/cell mapping, pinned real OpenDataLoader 2.4.7 grounded/ungrounded fixtures, foreign fixture manifest hash validation, crop-ref evidence plumbing, stable logical native crop refs, native crop descriptor artifacts, raw BGRA crop rendering in `ethos-pdf`, CLI PNG crop artifact production for bound native source PDFs, same-host rendered crop repeatability check, rendered-crop run comparison helper, strict citation/config input validation, citation input schema, split-quote fixture coverage, explicit unsupported non-v1 claim reporting, OpenDataLoader-style structure diagnostics for malformed bbox and unknown-page references, verify-alpha case inventory checks, and demo fixtures | Still needed: real OpenDataLoader table-cell grounding, additional adapter hardening against broader real output shapes, future claim-kind expansion outside the current v1 alpha policy, and a decision on whether cross-platform rendered crop artifact equality is worth pursuing after the current macOS/Linux bbox drift finding |
Expand Down
11 changes: 11 additions & 0 deletions fixtures/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@ Successful parse fixtures also carry c14n stage goldens:

- `extraction.json`: `ethos_core::traits::Extraction` after the PDF backend boundary.
- `layout.json`: `ethos_core::traits::LayoutOutput` after deterministic layout grouping.
- `text.txt`: current alpha plain-text export rendered from `layout.json` element text order.
- `markdown.md`: current alpha Markdown export rendered from `layout.json`. The current
committed synthetic fixture set contains only text blocks (no headings), so each
`markdown.md` currently has the same content as the fixture's `text.txt`.

For successful fixtures, `validate_fixtures.py` also binds selected `fixture.json`
expectations to those committed goldens:
Expand All @@ -47,6 +51,10 @@ expectations to those committed goldens:
- `expected_text`: exact `layout.json` element text order. Use a string for a single
layout element and a string array when reading order spans multiple elements.

The text and Markdown export goldens are validated as exact UTF-8 bytes against the
committed `layout.json` output. They are an internal Milestone B alpha guard for the current
trust-loop export path, not a broader document-conversion claim.

Regenerate them only after reviewing parser/layout drift. First configure the pinned profile
artifact for your platform; for macOS arm64 this is:

Expand All @@ -56,6 +64,9 @@ export ETHOS_PDFIUM_VERSION=chromium/7881
export ETHOS_PDFIUM_ARTIFACT_PATH=/tmp/ethos-pdfium-mac-arm64.tgz
ETHOS_ACCEPT_GOLDENS=1 cargo test --locked --test pdf_parse \
successful_fixtures_match_extraction_and_layout_goldens_when_pdfium_is_configured -- --exact
ETHOS_ACCEPT_GOLDENS=1 cargo test --locked --test pdf_parse \
doc_parse_text_and_markdown_exports_match_fixture_goldens_when_pdfium_is_configured -- --exact
python3 fixtures/validate_fixtures.py
```

Use the matching artifact name and runtime library path from
Expand Down
1 change: 1 addition & 0 deletions fixtures/synthetic/hyphenated-line-break/markdown.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
hyphen ated
1 change: 1 addition & 0 deletions fixtures/synthetic/hyphenated-line-break/text.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
hyphen ated
1 change: 1 addition & 0 deletions fixtures/synthetic/ligature-fi-embedded-font/markdown.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
office file
1 change: 1 addition & 0 deletions fixtures/synthetic/ligature-fi-embedded-font/text.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
office file
1 change: 1 addition & 0 deletions fixtures/synthetic/rotation-90/markdown.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Rotate Ninety
1 change: 1 addition & 0 deletions fixtures/synthetic/rotation-90/text.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Rotate Ninety
1 change: 1 addition & 0 deletions fixtures/synthetic/simple-text/markdown.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Hello Ethos
1 change: 1 addition & 0 deletions fixtures/synthetic/simple-text/text.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Hello Ethos
3 changes: 3 additions & 0 deletions fixtures/synthetic/two-columns/markdown.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Left top Left bottom

Right top Right bottom
3 changes: 3 additions & 0 deletions fixtures/synthetic/two-columns/text.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Left top Left bottom

Right top Right bottom
1 change: 1 addition & 0 deletions fixtures/synthetic/two-lines/markdown.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
First line Second line
1 change: 1 addition & 0 deletions fixtures/synthetic/two-lines/text.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
First line Second line
71 changes: 71 additions & 0 deletions fixtures/validate_fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@
ENTRY_KEYS = {"id", "file", "sha256", "pages", "subsets", "provenance", "license"}
EXTRACTION_GOLDEN_KEYS = {"pages", "spans", "regions", "warnings"}
LAYOUT_GOLDEN_KEYS = {"elements", "warnings"}
TEXT_EXPORT = "text.txt"
MARKDOWN_EXPORT = "markdown.md"
FOREIGN_MANIFEST_KEYS = {
"parser",
"version",
Expand Down Expand Up @@ -163,6 +165,72 @@ def validate_golden_file(path: Path, stage: str, keys: set[str]):
return golden


def render_text_export(layout):
    """Render the expected plain-text export bytes for a layout golden.

    Returns ``None`` when ``layout`` is not a dict or its ``elements`` entry is
    not a list. Otherwise the ``text`` of every dict element whose ``text`` is a
    string is joined with a blank line, a trailing newline is appended, and the
    result is UTF-8 encoded. Elements without a string ``text`` are skipped.
    """
    if not isinstance(layout, dict):
        return None
    elements = layout.get("elements")
    if not isinstance(elements, list):
        return None

    paragraphs = []
    for item in elements:
        if not isinstance(item, dict):
            continue
        body = item.get("text")
        if isinstance(body, str):
            paragraphs.append(body)
    return ("\n\n".join(paragraphs) + "\n").encode("utf-8")


def render_markdown_export(layout, ctx: str):
    """Render the expected Markdown export bytes for a layout golden.

    Returns ``None`` when ``layout`` is not a dict or its ``elements`` entry is
    not a list. Elements that are not dicts, or whose ``text`` is not a string,
    are skipped. An element whose ``type`` is ``"heading"`` becomes an ATX
    heading: ``heading_level`` defaults to 1 and is clamped to the range 1-6; a
    non-integer level is reported through ``fail`` (prefixed with ``ctx``) and
    treated as 1. Every other element contributes its text unchanged. Blocks
    are joined with a blank line, a trailing newline is appended, and the
    result is UTF-8 encoded.
    """
    if not isinstance(layout, dict):
        return None
    elements = layout.get("elements")
    if not isinstance(elements, list):
        return None

    blocks = []
    # `index` counts every entry, including skipped ones, so the failure
    # message points at the element's real position in layout.json.
    for index, element in enumerate(elements):
        text = element.get("text") if isinstance(element, dict) else None
        if not isinstance(text, str):
            continue
        if element.get("type") != "heading":
            blocks.append(text)
            continue
        level = element.get("heading_level", 1)
        if not isinstance(level, int):
            fail(f"{ctx} elements[{index}].heading_level must be an integer")
            level = 1
        level = min(max(level, 1), 6)
        blocks.append("#" * level + " " + text)
    return ("\n\n".join(blocks) + "\n").encode("utf-8")


def validate_export_file(path: Path, expected, label: str) -> None:
    """Compare one committed export golden with the bytes rendered from layout.json.

    Does nothing when ``expected`` is ``None`` (the layout could not be
    rendered). Otherwise a failure is recorded through ``fail`` when the file is
    missing, is not valid UTF-8, or differs from ``expected``; only the first
    applicable problem is reported. ``label`` names the export kind in the
    mismatch message.
    """
    if expected is None:
        return

    # Report paths relative to ROOT.
    rel = path.relative_to(ROOT)
    if not path.is_file():
        fail(f"{rel} missing for successful fixture")
        return

    actual = path.read_bytes()
    try:
        # Decoding is only a validity check; the comparison below uses raw bytes.
        actual.decode("utf-8")
    except UnicodeDecodeError as exc:
        fail(f"{rel} must be UTF-8 text: {exc}")
    else:
        if actual != expected:
            fail(f"{rel} must match {label} rendered from committed layout.json")


def validate_export_goldens(fixture_dir: Path, layout) -> None:
    """Validate the text and Markdown export goldens stored in ``fixture_dir``.

    Each export is rendered from ``layout`` and handed to
    ``validate_export_file`` together with the matching golden path. The text
    export is rendered and checked before the Markdown export is rendered.
    """
    ctx = str((fixture_dir / "layout.json").relative_to(ROOT))

    # Renderers are wrapped in callables so each export is rendered only right
    # before its own file check, keeping failures in text-then-Markdown order.
    exports = (
        (TEXT_EXPORT, lambda: render_text_export(layout), "text export"),
        (MARKDOWN_EXPORT, lambda: render_markdown_export(layout, ctx), "Markdown export"),
    )
    for filename, render, label in exports:
        golden_path = fixture_dir / filename
        validate_export_file(golden_path, render(), label)


def validate_projection_items(ctx: str, key: str, value, required: bool) -> None:
if not isinstance(value, list):
return
Expand Down Expand Up @@ -467,6 +535,8 @@ def validate_stage_expectations(metadata_path: Path, metadata, extraction, layou
"layout",
LAYOUT_GOLDEN_KEYS,
)
if layout_golden is not None:
validate_export_goldens(fixture_dir, layout_golden)
if extraction_golden is not None and layout_golden is not None:
validate_stage_expectations(
metadata_path,
Expand Down Expand Up @@ -501,6 +571,7 @@ def validate_stage_expectations(metadata_path: Path, metadata, extraction, layou
ok("fixture manifest has no missing or extra fixture documents")
ok("successful fixture goldens have valid stage metadata")
ok("successful fixture metadata expectations match committed stage goldens")
ok("successful fixture text and Markdown exports match committed layout goldens")
ok(f"foreign fixture manifests bind {foreign_package_count} package(s) to committed hashes")

if failures:
Expand Down
Loading