diff --git a/crates/ethos-cli/tests/verify.rs b/crates/ethos-cli/tests/verify.rs index c270e3c..96a8ac6 100644 --- a/crates/ethos-cli/tests/verify.rs +++ b/crates/ethos-cli/tests/verify.rs @@ -127,6 +127,54 @@ fn odl_example() -> PathBuf { repo_root().join("examples/verify/opendataloader.json") } +fn verify_alpha_report_cases() -> Vec<(String, Vec<String>, PathBuf)> { + let root = repo_root(); + let inventory = json_file(root.join("examples/verify/cases.json")); + let report_cases = inventory["report_cases"] + .as_array() + .expect("verify-alpha report_cases is an array"); + + report_cases + .iter() + .map(|case| { + let name = case["name"] + .as_str() + .expect("verify-alpha case name is a string") + .to_string(); + let mut args = vec![ + "verify".to_string(), + root.join( + case["input"] + .as_str() + .expect("verify-alpha case input is a string"), + ) + .display() + .to_string(), + ]; + if let Some(grounding) = case.get("grounding").and_then(Value::as_str) { + args.push("--grounding".to_string()); + args.push(grounding.to_string()); + } + args.push("--citations".to_string()); + args.push( + root.join( + case["citations"] + .as_str() + .expect("verify-alpha case citations is a string"), + ) + .display() + .to_string(), + ); + let expected = root.join( + case["golden"] + .as_str() + .expect("verify-alpha case golden is a string"), + ); + (name, args, expected) + }) + .collect() +} + #[test] fn verify_alpha_schema_report_example_matches_cli_output() { let root = repo_root(); @@ -147,99 +195,7 @@ fn verify_alpha_schema_report_example_matches_cli_output() { #[test] fn verify_alpha_demo_reports_match_goldens() { - let root = repo_root(); - let cases: [(&str, Vec<String>, PathBuf); 6] = [ - ( - "native-grounded", - vec![ - "verify".to_string(), - root.join("schemas/examples/document.example.json") - .display() - .to_string(), - "--citations".to_string(), - root.join("examples/verify/native_grounded_citations.json") - .display() - .to_string(), - ], -
root.join("examples/verify/goldens/native_grounded_report.json"), - ), - ( - "opendataloader-grounded", - vec![ - "verify".to_string(), - root.join("examples/verify/opendataloader.json") - .display() - .to_string(), - "--grounding".to_string(), - "opendataloader-json".to_string(), - "--citations".to_string(), - root.join("examples/verify/opendataloader_grounded_citations.json") - .display() - .to_string(), - ], - root.join("examples/verify/goldens/opendataloader_grounded_report.json"), - ), - ( - "native-split-quote", - vec![ - "verify".to_string(), - root.join("examples/verify/native_split_quote_document.json") - .display() - .to_string(), - "--citations".to_string(), - root.join("examples/verify/native_split_quote_citations.json") - .display() - .to_string(), - ], - root.join("examples/verify/goldens/native_split_quote_report.json"), - ), - ( - "native-non-v1-claims", - vec![ - "verify".to_string(), - root.join("schemas/examples/document.example.json") - .display() - .to_string(), - "--citations".to_string(), - root.join("examples/verify/native_non_v1_claims_citations.json") - .display() - .to_string(), - ], - root.join("examples/verify/goldens/native_non_v1_claims_report.json"), - ), - ( - "native-stale", - vec![ - "verify".to_string(), - root.join("schemas/examples/document.example.json") - .display() - .to_string(), - "--citations".to_string(), - root.join("examples/verify/native_stale_citations.json") - .display() - .to_string(), - ], - root.join("examples/verify/goldens/native_stale_report.json"), - ), - ( - "opendataloader-capability-limited", - vec![ - "verify".to_string(), - root.join("examples/verify/opendataloader_no_tables.json") - .display() - .to_string(), - "--grounding".to_string(), - "opendataloader-json".to_string(), - "--citations".to_string(), - root.join("examples/verify/opendataloader_table_cell_citations.json") - .display() - .to_string(), - ], - root.join("examples/verify/goldens/opendataloader_capability_limited_report.json"), - ), - ]; - 
- for (name, args, expected_path) in cases { + for (name, args, expected_path) in verify_alpha_report_cases() { let args = args.iter().map(String::as_str).collect::<Vec<_>>(); let actual = parse_success(&args); let expected = json_file(expected_path); diff --git a/docs/execution-status.md b/docs/execution-status.md index 2ece0a5..5556490 100644 --- a/docs/execution-status.md +++ b/docs/execution-status.md @@ -16,9 +16,9 @@ The committed implementation now includes: - `ethos doc parse` / `ethos fingerprint` PDF execution through a worker process with `max_parse_ms` timeout enforcement, stable error-envelope relay, diagnostics-gated worker stderr, and page-range validation/filtering. - Quantized page/span extraction at the backend boundary, plus a basic deterministic layout pass that assembles paragraph `text_block` elements and simple column reading order for the current born-digital fixtures. - Schema/example/profile validation is green through `schemas/validate_examples.py` using `jsonschema` draft 2020-12 validation, including the crop descriptor artifact contract plus referential-integrity and bbox sanity checks outside JSON Schema. -- `ethos verify` now produces non-empty quote, value, presence, and table-cell verification checks over native Ethos document JSON and synthetic OpenDataLoader-style JSON through `--grounding opendataloader-json`; it also verifies quote/value/presence citations over pinned real OpenDataLoader 2.4.7 JSON, including grounded and ungrounded cases. Citation/config inputs are rejected when they drift outside the closed schemas. The public demo harness covers grounded, ungrounded, not-found, stale-fingerprint, capability-limited, malformed-citation, malformed OpenDataLoader-style input, and summary-format reject paths.
+- `ethos verify` now produces non-empty quote, value, presence, and table-cell verification checks over native Ethos document JSON and synthetic OpenDataLoader-style JSON through `--grounding opendataloader-json`; it also verifies quote/value/presence citations over pinned real OpenDataLoader 2.4.7 JSON, including grounded and ungrounded cases. Citation/config inputs are rejected when they drift outside the closed schemas. The public demo harness covers grounded, ungrounded, split-quote, not-found, stale-fingerprint, unsupported non-v1 claim, capability-limited, malformed-citation, malformed OpenDataLoader-style input, and summary-format reject paths. - Verification semantics are now trust-honest at alpha scope: quote containment is explicitly labeled, value/table-cell checks require normalized equality, fingerprint-pinned citations fail closed when source fingerprints are unavailable, and structured capability limits explain why a run is downgraded. -- `make verify-alpha` is the current alpha trust-loop command: it checks native examples, synthetic OpenDataLoader-style examples, pinned real OpenDataLoader grounded/ungrounded examples, schema validation, usage diagnostics for malformed citations and malformed OpenDataLoader-style inputs, byte-identical repeated verification reports, byte-identical native crop descriptors, summary diagnostics for an ungrounded native case, and foreign fixture manifest hash binding. 
+- `make verify-alpha` is the current alpha trust-loop command: it checks native examples, split-quote evidence matching, unsupported non-v1 claim reporting, synthetic OpenDataLoader-style examples, pinned real OpenDataLoader grounded/ungrounded examples, schema validation, verify-alpha case inventory coverage, usage diagnostics for malformed citations and malformed OpenDataLoader-style structures, byte-identical repeated verification reports, byte-identical native crop descriptors, summary diagnostics for an ungrounded native case, and foreign fixture manifest hash binding. - Native Ethos verification can emit deterministic, schema-backed crop descriptor JSON artifacts through `--crop-dir`; these bind `document_fingerprint`, page, bbox, and check ids. Native `crop_ref` filenames are logical evidence references derived from document fingerprint, check id, and page, while descriptors still record the exact observed bbox. When `--crop-source-pdf` is supplied, the CLI validates source-PDF fingerprint binding and emits PNG crop artifacts whose filenames, byte hashes, dimensions, and source fingerprint are bound from the descriptor. `make verify-rendered-crops` checks same-host repeated-run stability for the rendered artifact path, and `make compare-rendered-crops` classifies two rendered-crop runs by separating logical evidence identity from rendered artifact byte equality. Cross-platform rendered image determinism is not claimed; the 2026-06-14 macOS arm64 vs Linux x64 validation record in `docs/validation/rendered-crops-2026-06-14.md` preserved document fingerprint and `payload_sha256` but failed rendered artifact byte equality because the evidence bbox differed slightly across platforms. 
Still absent or not claimable: public benchmark reports, public competitor-comparison claims, public speed/quality/footprint claims, OCR/image-only support, real table extraction, mature list/heading/layout semantics, semantic/arithmetic verification beyond deterministic evidence lookup, Phase 2 project-maintained PDFium builds, release packaging, and claim-audit approval for any public result wording. @@ -52,7 +52,7 @@ Milestone A has an accepted internal Gate Zero decision for roadmap control, so | Layout groundwork | Landed: basic paragraph text blocks and simple column reading order over quantized spans | Tables, headings, lists, rotation/quirk handling, and confidence policy remain future work | | Font policy groundwork | Partially landed: substitution table and profile policy are present; fixture output uses deterministic substitution IDs | Bundled fallback asset hashing and broader font/CID validation remain open | | Schema/example validation | Landed: schemas, examples, deterministic profile, referential integrity, and bbox sanity pass the `jsonschema` validation gate | Contract changes still require explicit versioning and compatibility review | -| Trust-layer implementation | Landed: `ethos verify` quote/value/presence/table-cell checks, explicit quote-containment labeling, normalized equality for value/table-cell checks, stale and unverifiable fingerprint handling, unsupported claim reporting, structured capability limits, native Ethos JSON path, ODL-style adapter path with synthetic table/cell mapping, pinned real OpenDataLoader 2.4.7 grounded/ungrounded fixtures, foreign fixture manifest hash validation, crop-ref evidence plumbing, stable logical native crop refs, native crop descriptor artifacts, raw BGRA crop rendering in `ethos-pdf`, CLI PNG crop artifact production for bound native source PDFs, same-host rendered crop repeatability check, rendered-crop run comparison helper, strict citation/config input validation, citation input schema, and demo 
fixtures | Still needed: evidence matching against richer source structures, semantic/arithmetic claim handling by explicit non-v1 design, real OpenDataLoader table-cell grounding, broader adapter hardening against real output, and a decision on whether cross-platform rendered crop artifact equality is worth pursuing after the current macOS/Linux bbox drift finding | +| Trust-layer implementation | Landed: `ethos verify` quote/value/presence/table-cell checks, explicit quote-containment labeling, normalized equality for value/table-cell checks, stale and unverifiable fingerprint handling, unsupported claim reporting, structured capability limits, native Ethos JSON path, ODL-style adapter path with synthetic table/cell mapping, pinned real OpenDataLoader 2.4.7 grounded/ungrounded fixtures, foreign fixture manifest hash validation, crop-ref evidence plumbing, stable logical native crop refs, native crop descriptor artifacts, raw BGRA crop rendering in `ethos-pdf`, CLI PNG crop artifact production for bound native source PDFs, same-host rendered crop repeatability check, rendered-crop run comparison helper, strict citation/config input validation, citation input schema, split-quote fixture coverage, explicit unsupported non-v1 claim reporting, OpenDataLoader-style structure diagnostics for malformed bbox and unknown-page references, verify-alpha case inventory checks, and demo fixtures | Still needed: real OpenDataLoader table-cell grounding, additional adapter hardening against broader real output shapes, future claim-kind expansion outside the current v1 alpha policy, and a decision on whether cross-platform rendered crop artifact equality is worth pursuing after the current macOS/Linux bbox drift finding | | WS-HARNESS readiness | Partially landed: readiness path is green for frozen corpus/hardware and pinned competitors, Gate Zero evidence preflight validates the current `ethos-bench` handoff, and gates fail closed if those records regress | Public-safe comparison 
report flow, release/package approval, claim-wording approval, and future evidence-refresh workflow still need hardening | ## PM Rule diff --git a/docs/roadmap.md b/docs/roadmap.md index 0ebb7d5..39fee24 100644 --- a/docs/roadmap.md +++ b/docs/roadmap.md @@ -13,7 +13,7 @@ Current PM status and blockers: `docs/execution-status.md`. | --- | --- | --- | --- | | Week 0 | pre-kickoff | ADRs, governance, corpus freeze, CI bootstrap, competitor pins | All 11 rows done; clock starts | | A | weeks 1-8 | Contracts (5 schemas, c14n, deterministic profile), trust-boundary artifacts (`GroundingSource`, verification schemas, OpenDataLoader adapter stub, `ethos verify` CLI stub), PDFium Phase 1 spike, harness + competitor adapters, CLI skeleton | **Gate Zero**: ADR-0005 is accepted as `PROCEED` for internal Milestone B continuation. This is not public benchmark, release, package, production, or claim approval. | -| B | weeks 9-14 | **`ethos verify` alpha first**: native Ethos JSON + OpenDataLoader verification demo, stale fingerprint checks, capability-limited reports, deterministic evidence matching; then reading order, blocks, headings, lists, Markdown/text exporters, Python wheel scaffold, quality dashboard, Windows x64 nightly determinism | 13-B exit checklist | +| B | weeks 9-14 | **`ethos verify` alpha first**: native Ethos JSON + synthetic and pinned real OpenDataLoader verification demos, stale fingerprint checks, capability-limited reports, deterministic evidence matching including split-quote coverage, explicit unsupported non-v1 claim reporting, adapter structure diagnostics; then reading order, blocks, headings, lists, Markdown/text exporters, Python wheel scaffold, quality dashboard, Windows x64 nightly determinism | 13-B exit checklist | | C | weeks 15-22 | Simple/bordered tables; RAG chunker + citations; non-text region coordinates; security report + default-chunk exclusion; debug overlay; internal benchmark snapshot | 13-C exit + first checkpoint | | D | weeks 
23-30 | `verify_citations` v1; crop API; sandbox/subprocess backend; Node beta and MCP experimental only if staffed or accepted by release-scope ADR | 13-D exit | | E | weeks 31-40 | Public benchmark report (reproducible, labeled tiers); PDFium Phase 2 project-maintained builds; stable CLI/Python docs; proof-of-trust demos; **Public Beta** | Release 1 claim audit + public-beta checkpoint | diff --git a/examples/verify/README.md b/examples/verify/README.md index 46ce0b0..923839c 100644 --- a/examples/verify/README.md +++ b/examples/verify/README.md @@ -2,6 +2,33 @@ This directory contains verify-alpha fixtures, citations, and golden reports. +`cases.json` is the executable verify-alpha case inventory. `make verify-alpha` fails if a +listed fixture path is missing, if a report golden is not covered by the inventory, if the +real OpenDataLoader fixture manifest hashes drift, or if this README stops naming an inventory +case. + +## Verify-Alpha Case Inventory + +Report cases: + +| Case | Coverage | +| --- | --- | +| `native-grounded` | Native quote, table-cell, and presence grounding. | +| `opendataloader-grounded` | Synthetic OpenDataLoader-style grounding with declared capability limits. | +| `native-split-quote` | Adjacent native text evidence matching. | +| `native-non-v1-claims` | Explicit unsupported non-v1 claim reporting. | +| `native-ungrounded` | Native mismatch and missing element diagnostics. | +| `opendataloader-not-found` | Synthetic OpenDataLoader-style missing element diagnostics. | +| `native-stale` | Fingerprint staleness handling. | +| `opendataloader-capability-limited` | Capability-limited table-cell reporting. | +| `real-opendataloader-grounded` | Pinned real OpenDataLoader grounded fixture. | +| `real-opendataloader-ungrounded` | Pinned real OpenDataLoader ungrounded fixture. | + +Usage-error cases: `invalid-table-cell-citation`, `invalid-bbox-citation`, +`opendataloader-malformed-bbox-input`, and `opendataloader-unknown-page-input`. 
+ +Summary cases: `native-ungrounded-summary`. + ## Native Ethos Grounding ```bash diff --git a/examples/verify/cases.json b/examples/verify/cases.json new file mode 100644 index 0000000..7490f37 --- /dev/null +++ b/examples/verify/cases.json @@ -0,0 +1,119 @@ +{ + "schema_version": 1, + "report_cases": [ + { + "name": "native-grounded", + "input": "schemas/examples/document.example.json", + "citations": "examples/verify/native_grounded_citations.json", + "golden": "examples/verify/goldens/native_grounded_report.json" + }, + { + "name": "opendataloader-grounded", + "input": "examples/verify/opendataloader.json", + "grounding": "opendataloader-json", + "citations": "examples/verify/opendataloader_grounded_citations.json", + "golden": "examples/verify/goldens/opendataloader_grounded_report.json" + }, + { + "name": "native-split-quote", + "input": "examples/verify/native_split_quote_document.json", + "citations": "examples/verify/native_split_quote_citations.json", + "golden": "examples/verify/goldens/native_split_quote_report.json" + }, + { + "name": "native-non-v1-claims", + "input": "schemas/examples/document.example.json", + "citations": "examples/verify/native_non_v1_claims_citations.json", + "golden": "examples/verify/goldens/native_non_v1_claims_report.json" + }, + { + "name": "native-ungrounded", + "input": "schemas/examples/document.example.json", + "citations": "examples/verify/native_ungrounded_citations.json", + "golden": "examples/verify/goldens/native_ungrounded_report.json" + }, + { + "name": "opendataloader-not-found", + "input": "examples/verify/opendataloader.json", + "grounding": "opendataloader-json", + "citations": "examples/verify/opendataloader_not_found_citations.json", + "golden": "examples/verify/goldens/opendataloader_not_found_report.json" + }, + { + "name": "native-stale", + "input": "schemas/examples/document.example.json", + "citations": "examples/verify/native_stale_citations.json", + "golden": 
"examples/verify/goldens/native_stale_report.json" + }, + { + "name": "opendataloader-capability-limited", + "input": "examples/verify/opendataloader_no_tables.json", + "grounding": "opendataloader-json", + "citations": "examples/verify/opendataloader_table_cell_citations.json", + "golden": "examples/verify/goldens/opendataloader_capability_limited_report.json" + }, + { + "name": "real-opendataloader-grounded", + "input": "fixtures/foreign/opendataloader/real/opendataloader-output.json", + "grounding": "opendataloader-json", + "citations": "fixtures/foreign/opendataloader/real/citations.json", + "golden": "fixtures/foreign/opendataloader/real/expected.verification_report.json" + }, + { + "name": "real-opendataloader-ungrounded", + "input": "fixtures/foreign/opendataloader/real/opendataloader-output.json", + "grounding": "opendataloader-json", + "citations": "fixtures/foreign/opendataloader/real/ungrounded_citations.json", + "golden": "fixtures/foreign/opendataloader/real/expected.ungrounded.verification_report.json" + } + ], + "usage_error_cases": [ + { + "name": "invalid-table-cell-citation", + "input": "schemas/examples/document.example.json", + "citations": "examples/verify/invalid_table_cell_citations.json", + "stderr_contains": "table_cell citation must include table_id and cell" + }, + { + "name": "invalid-bbox-citation", + "input": "schemas/examples/document.example.json", + "citations": "examples/verify/invalid_bbox_citations.json", + "stderr_contains": "citation bbox requires page unless another target locator is present" + }, + { + "name": "opendataloader-malformed-bbox-input", + "input": "examples/verify/opendataloader_malformed_bbox.json", + "grounding": "opendataloader-json", + "citations": "examples/verify/opendataloader_grounded_citations.json", + "stderr_contains": "opendataloader-json adapter: bbox is malformed (x0>x1 or y0>y1)" + }, + { + "name": "opendataloader-unknown-page-input", + "input": "examples/verify/opendataloader_unknown_page.json", + 
"grounding": "opendataloader-json", + "citations": "examples/verify/opendataloader_grounded_citations.json", + "stderr_contains": "opendataloader-json adapter: element.page references unknown page" + } + ], + "summary_cases": [ + { + "name": "native-ungrounded-summary", + "input": "schemas/examples/document.example.json", + "citations": "examples/verify/native_ungrounded_citations.json", + "expected_exit": 1, + "fail_on_ungrounded": true, + "stdout_contains": [ + "ethos verify summary\n", + "verification_config_sha256: 4bb224166a04a25fed2dd3ecdb9638ddcc5b398658532b73f1c0547e4983d0b0\n", + "all_evidence_grounded: false\n", + "grounding_capabilities: spans=true,char_offsets=true,tables=true,fingerprint=true,coordinate_origin=top-left,crop_support=false\n", + "checks_not_found: 1\n", + "checks_mismatch: 1\n", + "- v0001 status=mismatch reason=text_mismatch kind=quote locator=page:p0001;element_id:e000002 match_method=normalized_text_contains\n", + " diagnostic: target resolved, but target text did not match claimed text under normalized_text_contains; no semantic inference was attempted\n", + "- v0002 status=not_found reason=element_not_found kind=presence locator=element_id:missing-element match_method=none\n", + " diagnostic: element_id locator did not resolve in the grounding source\n" + ] + } + ] +} diff --git a/examples/verify/check_verify_alpha.py b/examples/verify/check_verify_alpha.py index d7e4735..bdba44e 100644 --- a/examples/verify/check_verify_alpha.py +++ b/examples/verify/check_verify_alpha.py @@ -19,6 +19,7 @@ import argparse import difflib +import hashlib import json import shutil import subprocess @@ -26,145 +27,6 @@ from pathlib import Path -CASES = [ - { - "name": "native-grounded", - "input": "schemas/examples/document.example.json", - "citations": "examples/verify/native_grounded_citations.json", - "golden": "examples/verify/goldens/native_grounded_report.json", - }, - { - "name": "opendataloader-grounded", - "input": 
"examples/verify/opendataloader.json", - "grounding": "opendataloader-json", - "citations": "examples/verify/opendataloader_grounded_citations.json", - "golden": "examples/verify/goldens/opendataloader_grounded_report.json", - }, - { - "name": "native-split-quote", - "input": "examples/verify/native_split_quote_document.json", - "citations": "examples/verify/native_split_quote_citations.json", - "golden": "examples/verify/goldens/native_split_quote_report.json", - }, - { - "name": "native-non-v1-claims", - "input": "schemas/examples/document.example.json", - "citations": "examples/verify/native_non_v1_claims_citations.json", - "golden": "examples/verify/goldens/native_non_v1_claims_report.json", - }, - { - "name": "native-ungrounded", - "input": "schemas/examples/document.example.json", - "citations": "examples/verify/native_ungrounded_citations.json", - "golden": "examples/verify/goldens/native_ungrounded_report.json", - }, - { - "name": "opendataloader-not-found", - "input": "examples/verify/opendataloader.json", - "grounding": "opendataloader-json", - "citations": "examples/verify/opendataloader_not_found_citations.json", - "golden": "examples/verify/goldens/opendataloader_not_found_report.json", - }, - { - "name": "native-stale", - "input": "schemas/examples/document.example.json", - "citations": "examples/verify/native_stale_citations.json", - "golden": "examples/verify/goldens/native_stale_report.json", - }, - { - "name": "opendataloader-capability-limited", - "input": "examples/verify/opendataloader_no_tables.json", - "grounding": "opendataloader-json", - "citations": "examples/verify/opendataloader_table_cell_citations.json", - "golden": "examples/verify/goldens/opendataloader_capability_limited_report.json", - }, - { - "name": "real-opendataloader-grounded", - "input": "fixtures/foreign/opendataloader/real/opendataloader-output.json", - "grounding": "opendataloader-json", - "citations": "fixtures/foreign/opendataloader/real/citations.json", - "golden": 
"fixtures/foreign/opendataloader/real/expected.verification_report.json", - }, - { - "name": "real-opendataloader-ungrounded", - "input": "fixtures/foreign/opendataloader/real/opendataloader-output.json", - "grounding": "opendataloader-json", - "citations": "fixtures/foreign/opendataloader/real/ungrounded_citations.json", - "golden": "fixtures/foreign/opendataloader/real/expected.ungrounded.verification_report.json", - }, -] - -USAGE_ERROR_CASES = [ - { - "name": "invalid-table-cell-citation", - "input": "schemas/examples/document.example.json", - "citations": "examples/verify/invalid_table_cell_citations.json", - "stderr_contains": "table_cell citation must include table_id and cell", - }, - { - "name": "invalid-bbox-citation", - "input": "schemas/examples/document.example.json", - "citations": "examples/verify/invalid_bbox_citations.json", - "stderr_contains": "citation bbox requires page unless another target locator is present", - }, - { - "name": "opendataloader-malformed-bbox-input", - "input": "examples/verify/opendataloader_malformed_bbox.json", - "grounding": "opendataloader-json", - "citations": "examples/verify/opendataloader_grounded_citations.json", - "stderr_contains": "opendataloader-json adapter: bbox is malformed (x0>x1 or y0>y1)", - }, - { - "name": "opendataloader-unknown-page-input", - "input": "examples/verify/opendataloader_unknown_page.json", - "grounding": "opendataloader-json", - "citations": "examples/verify/opendataloader_grounded_citations.json", - "stderr_contains": "opendataloader-json adapter: element.page references unknown page", - }, -] - -SUMMARY_CASES = [ - { - "name": "native-ungrounded-summary", - "input": "schemas/examples/document.example.json", - "citations": "examples/verify/native_ungrounded_citations.json", - "expected_exit": 1, - "fail_on_ungrounded": True, - "stdout_contains": [ - "ethos verify summary\n", - ( - "verification_config_sha256: " - "4bb224166a04a25fed2dd3ecdb9638ddcc5b398658532b73f1c0547e4983d0b0\n" - ), - 
"all_evidence_grounded: false\n", - ( - "grounding_capabilities: " - "spans=true,char_offsets=true,tables=true,fingerprint=true," - "coordinate_origin=top-left,crop_support=false\n" - ), - "checks_not_found: 1\n", - "checks_mismatch: 1\n", - ( - "- v0001 status=mismatch reason=text_mismatch kind=quote " - "locator=page:p0001;element_id:e000002 " - "match_method=normalized_text_contains\n" - ), - ( - " diagnostic: target resolved, but target text did not match " - "claimed text under normalized_text_contains; no semantic inference was attempted\n" - ), - ( - "- v0002 status=not_found reason=element_not_found kind=presence " - "locator=element_id:missing-element match_method=none\n" - ), - ( - " diagnostic: element_id locator did not resolve in the grounding source\n" - ), - ], - }, -] - - def parse_args(): parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--repo-root", type=Path, required=True) @@ -185,10 +47,132 @@ def load_json(path): return json.load(handle) +def load_case_inventory(repo_root): + inventory_path = repo_root / "examples/verify/cases.json" + inventory = load_json(inventory_path) + if inventory.get("schema_version") != 1: + raise SystemExit(f"{relative(inventory_path, repo_root)} has unsupported schema_version") + for key in ["report_cases", "usage_error_cases", "summary_cases"]: + if not isinstance(inventory.get(key), list): + raise SystemExit(f"{relative(inventory_path, repo_root)} missing {key} list") + return inventory + + def pretty_json(value): return json.dumps(value, indent=2, sort_keys=True, ensure_ascii=False) + "\n" +def sha256_file(path): + digest = hashlib.sha256() + with path.open("rb") as handle: + for chunk in iter(lambda: handle.read(1024 * 1024), b""): + digest.update(chunk) + return digest.hexdigest() + + +def validate_unique_case_names(groups): + seen = {} + for group_name, cases in groups.items(): + for case in cases: + name = case.get("name") + if not isinstance(name, str) or not name: + raise 
SystemExit(f"{group_name} contains a case without a non-empty name") + if name in seen: + raise SystemExit(f"{name} appears in both {seen[name]} and {group_name}") + seen[name] = group_name + + +def validate_case_paths(cases, repo_root, fields): + for case in cases: + for field in fields: + if field not in case: + raise SystemExit(f"{case['name']} missing {field}") + path = repo_root / case[field] + if not path.is_file(): + raise SystemExit(f"{case['name']} missing {field}: {relative(path, repo_root)}") + + +def validate_golden_inventory(report_cases, repo_root): + expected = sorted(str(Path(case["golden"])) for case in report_cases) + actual = sorted( + str(path.relative_to(repo_root)) + for path in [ + *(repo_root / "examples/verify/goldens").glob("*_report.json"), + *(repo_root / "fixtures/foreign/opendataloader/real").glob( + "expected*.verification_report.json" + ), + ] + ) + if expected != actual: + diff = "\n".join( + difflib.unified_diff( + [f"{path}\n" for path in expected], + [f"{path}\n" for path in actual], + fromfile="examples/verify/cases.json report_cases[*].golden", + tofile="tracked verify-alpha report goldens", + ) + ) + raise SystemExit(f"verify-alpha golden inventory drift\n{diff}") + print(f"ok verify-alpha report inventory covers {len(report_cases)} goldens") + + +def validate_readme_inventory(inventory, repo_root): + readme_path = repo_root / "examples/verify/README.md" + readme = readme_path.read_text(encoding="utf-8") + missing = [] + for group in ["report_cases", "usage_error_cases", "summary_cases"]: + for case in inventory[group]: + marker = f"`{case['name']}`" + if marker not in readme: + missing.append(case["name"]) + if missing: + names = ", ".join(missing) + raise SystemExit( + f"{relative(readme_path, repo_root)} is missing verify-alpha case names: {names}" + ) + print("ok verify-alpha README names every inventory case") + + +def validate_real_opendataloader_manifest(repo_root): + fixture_dir = repo_root / 
"fixtures/foreign/opendataloader/real" + manifest_path = fixture_dir / "manifest.json" + manifest = load_json(manifest_path) + checks = [ + ("source_pdf", "source_pdf_sha256"), + ("output_json", "output_json_sha256"), + ] + for path_key, hash_key in checks: + rel = manifest.get(path_key) + expected = manifest.get(hash_key) + if not isinstance(rel, str) or not isinstance(expected, str): + raise SystemExit(f"{relative(manifest_path, repo_root)} missing {path_key}/{hash_key}") + path = fixture_dir / rel + if not path.is_file(): + raise SystemExit(f"{relative(manifest_path, repo_root)} references missing {rel}") + actual = sha256_file(path) + if actual != expected: + raise SystemExit( + f"{relative(path, repo_root)} sha256 drift: expected {expected}, got {actual}" + ) + print("ok real OpenDataLoader fixture manifest hashes match pinned files") + + +def validate_case_inventory(inventory, repo_root): + validate_unique_case_names( + { + "report_cases": inventory["report_cases"], + "usage_error_cases": inventory["usage_error_cases"], + "summary_cases": inventory["summary_cases"], + } + ) + validate_case_paths(inventory["report_cases"], repo_root, ["input", "citations", "golden"]) + validate_case_paths(inventory["usage_error_cases"], repo_root, ["input", "citations"]) + validate_case_paths(inventory["summary_cases"], repo_root, ["input", "citations"]) + validate_golden_inventory(inventory["report_cases"], repo_root) + validate_readme_inventory(inventory, repo_root) + validate_real_opendataloader_manifest(repo_root) + + def run_verify(command, repo_root, name): print("$ " + " ".join(str(part) for part in command), flush=True) result = subprocess.run(command, cwd=repo_root, capture_output=True, check=False) @@ -451,11 +435,14 @@ def main(): args.out_dir.mkdir(parents=True, exist_ok=True) - for case in CASES: + inventory = load_case_inventory(args.repo_root) + validate_case_inventory(inventory, args.repo_root) + + for case in inventory["report_cases"]: verify_case(case, args) 
- for case in USAGE_ERROR_CASES: + for case in inventory["usage_error_cases"]: verify_usage_error_case(case, args) - for case in SUMMARY_CASES: + for case in inventory["summary_cases"]: verify_summary_case(case, args) verify_crop_descriptor_case(args)