From e35b066501fa761415749558ea6694c241eb831d Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Tue, 16 Jun 2026 13:40:25 +0530 Subject: [PATCH] Add OpenDataLoader malformed input checks Signed-off-by: docushell-admin --- README.md | 5 +++- docs/demos/verify-alpha.md | 23 ++++++++++++++- docs/execution-status.md | 4 +-- examples/verify/README.md | 5 ++++ examples/verify/check_verify_alpha.py | 14 ++++++++++ .../verify/opendataloader_malformed_bbox.json | 28 +++++++++++++++++++ .../verify/opendataloader_unknown_page.json | 28 +++++++++++++++++++ 7 files changed, 103 insertions(+), 4 deletions(-) create mode 100644 examples/verify/opendataloader_malformed_bbox.json create mode 100644 examples/verify/opendataloader_unknown_page.json diff --git a/README.md b/README.md index 04f4aeb..09fed1b 100644 --- a/README.md +++ b/README.md @@ -105,7 +105,8 @@ That command builds the CLI and checks the alpha grounding loop across: - synthetic OpenDataLoader-style JSON - pinned real OpenDataLoader 2.4.7 JSON fixtures - grounded, ungrounded, not-found, stale-fingerprint, and capability-limited citation cases -- malformed citation inputs that must fail with usage diagnostics +- malformed citation inputs and malformed OpenDataLoader-style inputs that must fail with usage + diagnostics - byte-identical repeated verification reports for the checked-in fixtures - deterministic native crop descriptor JSON artifacts @@ -151,6 +152,8 @@ ok real-opendataloader-grounded matches fixtures/foreign/opendataloader/real/ ok real-opendataloader-ungrounded matches fixtures/foreign/opendataloader/real/expected.ungrounded.verification_report.json ok invalid-table-cell-citation exits 2 with expected usage diagnostic ok invalid-bbox-citation exits 2 with expected usage diagnostic +ok opendataloader-malformed-bbox-input exits 2 with expected usage diagnostic +ok opendataloader-unknown-page-input exits 2 with expected usage diagnostic ok native-ungrounded-summary summary includes expected diagnostics ok native-grounded-crops crop descriptors validate against schemas/ethos-crop-descriptor.schema.json diff --git a/docs/demos/verify-alpha.md b/docs/demos/verify-alpha.md index 505c804..2e0527b 100644 --- a/docs/demos/verify-alpha.md +++ b/docs/demos/verify-alpha.md @@ -12,7 +12,8 @@ repeatable `make verify-alpha` path: - OpenDataLoader-style JSON can enter the same verification loop through a grounding adapter - real pinned OpenDataLoader 2.4.7 output has both grounded and ungrounded citation cases - native and synthetic OpenDataLoader fixtures cover missing cited elements -- malformed citation inputs return usage diagnostics with exit code `2` +- malformed citation inputs and malformed OpenDataLoader-style grounding inputs return usage + diagnostics with exit code `2` - `--fail-on-ungrounded` turns the report into a CI/agent gate with exit code `1` when evidence is not fully grounded - native Ethos verification can emit deterministic crop descriptor artifacts with `--crop-dir` - every demo report is compared against a golden and regenerated twice to check byte-identical output @@ -241,6 +242,26 @@ ethos verify schemas/examples/document.example.json \ Expected outcome: exit code `2` with a diagnostic that the bbox citation requires a page unless another target locator is present. +## Malformed OpenDataLoader-Style Inputs + +The harness also checks adapter input validation failures: + +```bash +ethos verify examples/verify/opendataloader_malformed_bbox.json \ + --grounding opendataloader-json \ + --citations examples/verify/opendataloader_grounded_citations.json +``` + +Expected outcome: exit code `2` with a diagnostic that the bbox is malformed. + +```bash +ethos verify examples/verify/opendataloader_unknown_page.json \ + --grounding opendataloader-json \ + --citations examples/verify/opendataloader_grounded_citations.json +``` + +Expected outcome: exit code `2` with a diagnostic that an element references an undeclared page. + ## Crop Descriptors Native Ethos document grounding can emit deterministic crop descriptor JSON files for each diff --git a/docs/execution-status.md b/docs/execution-status.md index d9a8924..e306960 100644 --- a/docs/execution-status.md +++ b/docs/execution-status.md @@ -16,9 +16,9 @@ The committed implementation now includes: - `ethos doc parse` / `ethos fingerprint` PDF execution through a worker process with `max_parse_ms` timeout enforcement, stable error-envelope relay, diagnostics-gated worker stderr, and page-range validation/filtering. - Quantized page/span extraction at the backend boundary, plus a basic deterministic layout pass that assembles paragraph `text_block` elements and simple column reading order for the current born-digital fixtures. - Schema/example/profile validation is green through `schemas/validate_examples.py` using `jsonschema` draft 2020-12 validation, including the crop descriptor artifact contract plus referential-integrity and bbox sanity checks outside JSON Schema. -- `ethos verify` now produces non-empty quote, value, presence, and table-cell verification checks over native Ethos document JSON and synthetic OpenDataLoader-style JSON through `--grounding opendataloader-json`; it also verifies quote/value/presence citations over pinned real OpenDataLoader 2.4.7 JSON, including grounded and ungrounded cases. Citation/config inputs are rejected when they drift outside the closed schemas. The public demo harness covers grounded, ungrounded, not-found, stale-fingerprint, capability-limited, malformed-citation, and summary-format reject paths. +- `ethos verify` now produces non-empty quote, value, presence, and table-cell verification checks over native Ethos document JSON and synthetic OpenDataLoader-style JSON through `--grounding opendataloader-json`; it also verifies quote/value/presence citations over pinned real OpenDataLoader 2.4.7 JSON, including grounded and ungrounded cases. Citation/config inputs are rejected when they drift outside the closed schemas. The public demo harness covers grounded, ungrounded, not-found, stale-fingerprint, capability-limited, malformed-citation, malformed OpenDataLoader-style input, and summary-format reject paths. - Verification semantics are now trust-honest at alpha scope: quote containment is explicitly labeled, value/table-cell checks require normalized equality, fingerprint-pinned citations fail closed when source fingerprints are unavailable, and structured capability limits explain why a run is downgraded. -- `make verify-alpha` is the current alpha trust-loop command: it checks native examples, synthetic OpenDataLoader-style examples, pinned real OpenDataLoader grounded/ungrounded examples, schema validation, byte-identical repeated verification reports, byte-identical native crop descriptors, summary diagnostics for an ungrounded native case, and foreign fixture manifest hash binding. +- `make verify-alpha` is the current alpha trust-loop command: it checks native examples, synthetic OpenDataLoader-style examples, pinned real OpenDataLoader grounded/ungrounded examples, schema validation, usage diagnostics for malformed citations and malformed OpenDataLoader-style inputs, byte-identical repeated verification reports, byte-identical native crop descriptors, summary diagnostics for an ungrounded native case, and foreign fixture manifest hash binding. - Native Ethos verification can emit deterministic, schema-backed crop descriptor JSON artifacts through `--crop-dir`; these bind `document_fingerprint`, page, bbox, and check ids. Native `crop_ref` filenames are logical evidence references derived from document fingerprint, check id, and page, while descriptors still record the exact observed bbox. When `--crop-source-pdf` is supplied, the CLI validates source-PDF fingerprint binding and emits PNG crop artifacts whose filenames, byte hashes, dimensions, and source fingerprint are bound from the descriptor. `make verify-rendered-crops` checks same-host repeated-run stability for the rendered artifact path, and `make compare-rendered-crops` classifies two rendered-crop runs by separating logical evidence identity from rendered artifact byte equality. Cross-platform rendered image determinism is not claimed; the 2026-06-14 macOS arm64 vs Linux x64 validation record in `docs/validation/rendered-crops-2026-06-14.md` preserved document fingerprint and `payload_sha256` but failed rendered artifact byte equality because the evidence bbox differed slightly across platforms. Still absent or not claimable: reproducible benchmark result JSON, executed competitor comparisons, public speed/quality/footprint claims, OCR/image-only support, real table extraction, mature list/heading/layout semantics, semantic/arithmetic verification beyond deterministic evidence lookup, Phase 2 project-maintained PDFium builds, release packaging, and full frozen-corpus multi-platform determinism evidence. diff --git a/examples/verify/README.md b/examples/verify/README.md index d656afd..50f01fc 100644 --- a/examples/verify/README.md +++ b/examples/verify/README.md @@ -111,6 +111,11 @@ Malformed citations are covered as usage errors. `invalid_table_cell_citations.j `2` because a table-cell claim is missing `table_id` and `cell`. `invalid_bbox_citations.json` must exit `2` because a bbox locator is missing a page or another target locator. +Malformed OpenDataLoader-style grounding inputs are also covered as usage errors. +`opendataloader_malformed_bbox.json` must exit `2` because its bbox is inverted. +`opendataloader_unknown_page.json` must exit `2` because an element references a page that +the input does not declare. + The OpenDataLoader fixtures are synthetic and limited to the adapter's documented alpha subset. They are not real pinned OpenDataLoader artifacts. Golden reports live in `examples/verify/goldens/` and are covered by the CLI verification test. diff --git a/examples/verify/check_verify_alpha.py b/examples/verify/check_verify_alpha.py index 80f5ae7..75308bf 100644 --- a/examples/verify/check_verify_alpha.py +++ b/examples/verify/check_verify_alpha.py @@ -79,6 +79,20 @@ "citations": "examples/verify/invalid_bbox_citations.json", "stderr_contains": "citation bbox requires page unless another target locator is present", }, + { + "name": "opendataloader-malformed-bbox-input", + "input": "examples/verify/opendataloader_malformed_bbox.json", + "grounding": "opendataloader-json", + "citations": "examples/verify/opendataloader_grounded_citations.json", + "stderr_contains": "opendataloader-json adapter: bbox is malformed (x0>x1 or y0>y1)", + }, + { + "name": "opendataloader-unknown-page-input", + "input": "examples/verify/opendataloader_unknown_page.json", + "grounding": "opendataloader-json", + "citations": "examples/verify/opendataloader_grounded_citations.json", + "stderr_contains": "opendataloader-json adapter: element.page references unknown page", + }, ] SUMMARY_CASES = [ diff --git a/examples/verify/opendataloader_malformed_bbox.json b/examples/verify/opendataloader_malformed_bbox.json new file mode 100644 index 0000000..82f5fd8 --- /dev/null +++ b/examples/verify/opendataloader_malformed_bbox.json @@ -0,0 +1,28 @@ +{ + "_comment": "Synthetic malformed OpenDataLoader-style fixture for WS-VERIFY-ALPHA usage diagnostics. It is not real pinned ODL output.", + "tool": { + "name": "opendataloader-pdf", + "version": "0.0.0-synthetic" + }, + "pages": [ + { + "number": 1, + "width": 612.0, + "height": 792.0 + } + ], + "elements": [ + { + "id": "odl-e2", + "page": 1, + "bbox": [ + 540.0, + 101.0, + 72.0, + 115.0 + ], + "type": "Paragraph", + "text": "Revenue grew to $12.4M in Q3 2025." + } + ] +} diff --git a/examples/verify/opendataloader_unknown_page.json b/examples/verify/opendataloader_unknown_page.json new file mode 100644 index 0000000..4d6dee0 --- /dev/null +++ b/examples/verify/opendataloader_unknown_page.json @@ -0,0 +1,28 @@ +{ + "_comment": "Synthetic malformed OpenDataLoader-style fixture for WS-VERIFY-ALPHA usage diagnostics. It is not real pinned ODL output.", + "tool": { + "name": "opendataloader-pdf", + "version": "0.0.0-synthetic" + }, + "pages": [ + { + "number": 1, + "width": 612.0, + "height": 792.0 + } + ], + "elements": [ + { + "id": "odl-e2", + "page": 2, + "bbox": [ + 72.0, + 101.0, + 540.0, + 115.0 + ], + "type": "Paragraph", + "text": "Revenue grew to $12.4M in Q3 2025." + } + ] +}