From 24dc5a6b2bb093587a15a02b4566497a73340d13 Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 17 Jun 2026 11:49:33 +0530 Subject: [PATCH] Assemble regular-grid table candidates Signed-off-by: docushell-admin --- .github/scripts/test_ci_workflow.py | 1 + .github/workflows/ci.yml | 1 + crates/ethos-cli/src/assembly.rs | 24 ++++- crates/ethos-cli/src/main.rs | 18 +++- schemas/table_model_validation.py | 117 ++++++++++++++++++++ schemas/test_table_model_validation.py | 143 +++++++++++++++++++++++++ schemas/validate_examples.py | 11 +- 7 files changed, 310 insertions(+), 5 deletions(-) create mode 100644 schemas/table_model_validation.py create mode 100644 schemas/test_table_model_validation.py diff --git a/.github/scripts/test_ci_workflow.py b/.github/scripts/test_ci_workflow.py index ad37f62..bbc0a53 100644 --- a/.github/scripts/test_ci_workflow.py +++ b/.github/scripts/test_ci_workflow.py @@ -42,6 +42,7 @@ def test_schema_job_installs_jsonschema_and_validates_examples(self) -> None: self.assertIn('pip install "jsonschema>=4.18"', text) self.assertIn("python3 schemas/validate_examples.py", text) + self.assertIn("python3 schemas/test_table_model_validation.py", text) def test_ci_workflow_guard_is_run_by_ci(self) -> None: text = workflow_text() diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 49dc4d4..a220cb1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -105,6 +105,7 @@ jobs: python-version: "3.12" - run: pip install "jsonschema>=4.18" - run: python3 schemas/validate_examples.py + - run: python3 schemas/test_table_model_validation.py - name: Gate Zero result schema validation run: | python3 - <<'PY' diff --git a/crates/ethos-cli/src/assembly.rs b/crates/ethos-cli/src/assembly.rs index 5978507..b86d233 100644 --- a/crates/ethos-cli/src/assembly.rs +++ b/crates/ethos-cli/src/assembly.rs @@ -72,6 +72,7 @@ fn assemble_payload(extraction: Extraction) -> Result { extraction.warnings, layout.warnings, )?; + let tables 
= table_candidates(&extraction.pages, &spans)?; Ok(Payload { coordinate_system: CoordinateSystem { @@ -82,7 +83,7 @@ fn assemble_payload(extraction: Extraction) -> Result { pages: extraction.pages, elements, spans, - tables: Vec::new(), + tables, chunks: Vec::new(), regions, security_warnings, @@ -90,6 +91,27 @@ fn assemble_payload(extraction: Extraction) -> Result { }) } +fn table_candidates( + pages: &[ethos_core::model::Page], + spans: &[Span], +) -> Result, EthosError> { + let mut tables = Vec::new(); + let mut next_table_ordinal = 1u32; + + for page in pages { + let page_tables = ethos_tables::detect_regular_grid_candidates( + &page.id, + spans, + next_table_ordinal, + ðos_tables::TableCandidateConfig::default(), + )?; + next_table_ordinal += page_tables.len() as u32; + tables.extend(page_tables); + } + + Ok(tables) +} + struct DocumentHashes { config_sha256: String, payload_sha256: String, diff --git a/crates/ethos-cli/src/main.rs b/crates/ethos-cli/src/main.rs index 9cfb5d9..e9a1819 100644 --- a/crates/ethos-cli/src/main.rs +++ b/crates/ethos-cli/src/main.rs @@ -564,7 +564,7 @@ mod tests { } #[test] - fn table_candidate_probe_report_detects_regular_grid_without_mutating_document() { + fn assembly_emits_regular_grid_table_candidates() { let extraction = Extraction { pages: vec![Page { id: "p0001".to_string(), @@ -593,7 +593,20 @@ mod tests { ) .unwrap(); - assert!(doc.payload.tables.is_empty()); + doc.verify_integrity().unwrap(); + assert_eq!(doc.payload.tables.len(), 1); + let table = &doc.payload.tables[0]; + assert_eq!(table.id, "t0001"); + assert_eq!(table.page_refs, vec!["p0001"]); + assert_eq!(table.n_rows, 3); + assert_eq!(table.n_cols, 2); + assert_eq!(table.header_rows, 1); + assert_eq!(table.header_cols, 0); + assert_eq!(table.cells.len(), 6); + assert_eq!(table.cells[0].text, "Name"); + assert_eq!(table.cells[0].span_refs, vec!["s000001"]); + assert_eq!(table.cells[5].text, "12"); + assert_eq!(table.cells[5].span_refs, vec!["s000006"]); let bytes = 
table_candidate_probe_report_bytes(&doc).unwrap(); let value: serde_json::Value = serde_json::from_slice(&bytes).unwrap(); @@ -612,7 +625,6 @@ mod tests { value["tables"][0]["markdown"], "| Name | Score |\n| --- | --- |\n| Alpha | 10 |\n| Beta | 12 |\n" ); - assert!(doc.payload.tables.is_empty()); } #[test] diff --git a/schemas/table_model_validation.py b/schemas/table_model_validation.py new file mode 100644 index 0000000..fbe7c60 --- /dev/null +++ b/schemas/table_model_validation.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python3 +# +# Copyright 2026 The Ethos maintainers +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
def diagnose_table_model(payload: dict[str, Any], label: str = "document.example.json") -> list[str]:
    """Collect table/cell invariant violations that JSON Schema cannot express.

    Tables are examined in payload order and the findings of each table are
    produced in a fixed order, so the same payload always yields the same list.

    Args:
        payload: Document payload; its optional ``"tables"`` list is inspected.
        label: Prefix placed in front of every diagnostic message.

    Returns:
        The diagnostic messages; an empty list when no violation was found.
    """
    findings: list[str] = []
    for table in payload.get("tables", []):
        findings.extend(_table_findings(table, label))
    return findings


def _table_findings(table: dict[str, Any], label: str) -> list[str]:
    """Return the diagnostics of one table: header limits, cell placement, slot coverage."""
    name = table.get("id", "")
    total_rows = table.get("n_rows")
    total_cols = table.get("n_cols")
    head_rows = table.get("header_rows", 0)
    head_cols = table.get("header_cols", 0)

    # A table whose dimensions are not integers is skipped here without a diagnostic.
    if any(not isinstance(dim, int) for dim in (total_rows, total_cols, head_rows, head_cols)):
        return []

    findings: list[str] = []
    if head_rows > total_rows:
        findings.append(
            f"{label}: table {name} header_rows {head_rows} exceeds n_rows {total_rows}"
        )
    if head_cols > total_cols:
        findings.append(
            f"{label}: table {name} header_cols {head_cols} exceeds n_cols {total_cols}"
        )

    # Maps every grid slot (row, col) to the index of the first cell that covered it.
    owner_of: dict[tuple[int, int], int] = {}
    for position, entry in enumerate(table.get("cells", [])):
        row, col = entry.get("row"), entry.get("col")
        row_span, col_span = entry.get("row_span"), entry.get("col_span")
        if any(not isinstance(value, int) for value in (row, col, row_span, col_span)):
            continue

        where = f"table {name} cell[{position}]"
        # Ordered placement checks; only the first failing one is reported, and a
        # misplaced cell contributes neither a bbox check nor any slot coverage.
        placement_checks = (
            (row >= total_rows, f"row {row} is outside n_rows {total_rows}"),
            (col >= total_cols, f"col {col} is outside n_cols {total_cols}"),
            (
                row + row_span > total_rows,
                f"row_span {row_span} from row {row} exceeds n_rows {total_rows}",
            ),
            (
                col + col_span > total_cols,
                f"col_span {col_span} from col {col} exceeds n_cols {total_cols}",
            ),
        )
        misplaced = next((text for failed, text in placement_checks if failed), None)
        if misplaced is not None:
            findings.append(f"{label}: {where} {misplaced}")
            continue

        if not bbox_contains(table.get("bbox"), entry.get("bbox")):
            findings.append(f"{label}: {where} bbox is outside table bbox")

        slots = (
            (r, c)
            for r in range(row, row + row_span)
            for c in range(col, col + col_span)
        )
        for slot in slots:
            if slot in owner_of:
                findings.append(
                    f"{label}: {where} overlaps covered slot ({slot[0]},{slot[1]}) "
                    f"already covered by cell[{owner_of[slot]}]"
                )
            else:
                owner_of[slot] = position

    # Every slot of the declared grid must have been claimed by some cell.
    findings.extend(
        f"{label}: table {name} missing cell coverage at ({r},{c})"
        for r in range(total_rows)
        for c in range(total_cols)
        if (r, c) not in owner_of
    )
    return findings


def bbox_contains(outer: Any, inner: Any) -> bool:
    """Tell whether ``inner`` lies within ``outer``, comparing coordinate by coordinate.

    The first two coordinates of ``inner`` must not be smaller than those of
    ``outer`` and the last two must not be larger. When either argument is not
    a bbox according to ``is_bbox`` the result is ``True``, i.e. no violation
    is reported for it.
    """
    if not is_bbox(outer) or not is_bbox(inner):
        return True
    lower_ok = all(i >= o for i, o in zip(inner[:2], outer[:2]))
    upper_ok = all(i <= o for i, o in zip(inner[2:], outer[2:]))
    return lower_ok and upper_ok


def is_bbox(value: Any) -> bool:
    """Return ``True`` only for a list of exactly four ``int`` coordinates."""
    if not isinstance(value, list) or len(value) != 4:
        return False
    return all(isinstance(coord, int) for coord in value)
def valid_payload() -> dict:
    """Load the example document from disk and return a private copy of its payload."""
    document = json.loads(DOCUMENT_EXAMPLE.read_text(encoding="utf-8"))
    return copy.deepcopy(document["payload"])


def cell(row: int, col: int, bbox: list[int], row_span: int = 1, col_span: int = 1) -> dict:
    """Build a minimal table cell whose text is its ``row,col`` position."""
    return dict(
        row=row,
        col=col,
        row_span=row_span,
        col_span=col_span,
        bbox=bbox,
        text=f"{row},{col}",
    )


class TableModelValidationTests(unittest.TestCase):
    """Each test mutates the first table of the example payload and checks the diagnostics."""

    @staticmethod
    def _payload_and_table() -> tuple[dict, dict]:
        """Return a fresh payload together with its first table for in-place mutation."""
        payload = valid_payload()
        return payload, payload["tables"][0]

    def _assert_reported(self, payload: dict, expected: str) -> None:
        """Assert that ``expected`` is among the diagnostics produced for ``payload``."""
        self.assertIn(expected, diagnose_table_model(payload, "fixture"))

    def test_valid_complete_grid_has_no_diagnostics(self) -> None:
        self.assertEqual(diagnose_table_model(valid_payload(), "fixture"), [])

    def test_header_rows_must_fit_table(self) -> None:
        payload, table = self._payload_and_table()
        table["header_rows"] = 3

        first = diagnose_table_model(payload, "fixture")[:1]
        self.assertEqual(first, ["fixture: table t0001 header_rows 3 exceeds n_rows 2"])

    def test_header_cols_must_fit_table(self) -> None:
        payload, table = self._payload_and_table()
        table["header_cols"] = 3

        first = diagnose_table_model(payload, "fixture")[:1]
        self.assertEqual(first, ["fixture: table t0001 header_cols 3 exceeds n_cols 2"])

    def test_cell_row_must_fit_table_dimensions(self) -> None:
        payload, table = self._payload_and_table()
        table["cells"][0]["row"] = 2

        self._assert_reported(
            payload, "fixture: table t0001 cell[0] row 2 is outside n_rows 2"
        )

    def test_cell_col_must_fit_table_dimensions(self) -> None:
        payload, table = self._payload_and_table()
        table["cells"][0]["col"] = 2

        self._assert_reported(
            payload, "fixture: table t0001 cell[0] col 2 is outside n_cols 2"
        )

    def test_cell_row_span_must_fit_table_dimensions(self) -> None:
        payload, table = self._payload_and_table()
        table["cells"][2]["row_span"] = 2

        self._assert_reported(
            payload,
            "fixture: table t0001 cell[2] row_span 2 from row 1 exceeds n_rows 2",
        )

    def test_cell_col_span_must_fit_table_dimensions(self) -> None:
        payload, table = self._payload_and_table()
        table["cells"][3]["col_span"] = 2

        self._assert_reported(
            payload,
            "fixture: table t0001 cell[3] col_span 2 from col 1 exceeds n_cols 2",
        )

    def test_cell_bbox_must_stay_inside_table_bbox(self) -> None:
        payload, table = self._payload_and_table()
        table["cells"][0]["bbox"] = [-1, 0, 100, 50]

        self._assert_reported(
            payload, "fixture: table t0001 cell[0] bbox is outside table bbox"
        )

    def test_overlapping_cell_coverage_fails_closed(self) -> None:
        payload, table = self._payload_and_table()
        # Moving the second cell onto column 0 makes it collide with the first cell.
        table["cells"][1]["col"] = 0

        self._assert_reported(
            payload,
            "fixture: table t0001 cell[1] overlaps covered slot (0,0) already covered by cell[0]",
        )

    def test_missing_cell_coverage_fails_closed(self) -> None:
        payload, table = self._payload_and_table()
        # Dropping the last cell leaves one grid slot without an owner.
        table["cells"] = table["cells"][:-1]

        self._assert_reported(
            payload, "fixture: table t0001 missing cell coverage at (1,1)"
        )

    def test_spanned_cells_can_cover_multiple_grid_slots(self) -> None:
        payload, table = self._payload_and_table()
        # One header cell spanning both columns plus two body cells.
        table["cells"] = [
            cell(0, 0, [7200, 13000, 54000, 16500], col_span=2),
            cell(1, 0, [7200, 16500, 30600, 20000]),
            cell(1, 1, [30600, 16500, 54000, 20000]),
        ]

        self.assertEqual(diagnose_table_model(copy.deepcopy(payload), "fixture"), [])


if __name__ == "__main__":
    unittest.main()
import diagnose_font_policy +from table_model_validation import diagnose_table_model try: from jsonschema import Draft202012Validator as Validator @@ -151,9 +152,14 @@ def check_ref(kind: str, ref: str, ctx: str) -> None: for t in p["tables"]: for r in t["page_refs"]: check_ref("page", r, f"table {t['id']}") + for r in t.get("warning_refs", []): + check_ref("warning", r, f"table {t['id']}") for c in t["cells"]: + cell_ctx = f"table {t['id']} cell ({c['row']},{c['col']})" + for r in c.get("span_refs", []): + check_ref("span", r, cell_ctx) for r in c.get("element_refs", []): - check_ref("element", r, f"table {t['id']} cell ({c['row']},{c['col']})") + check_ref("element", r, cell_ctx) for ch in p["chunks"]: for r in ch["element_refs"]: check_ref("element", r, f"chunk {ch['id']}") @@ -171,6 +177,9 @@ def check_ref(kind: str, ref: str, ctx: str) -> None: if "region_ref" in w: check_ref("region", w["region_ref"], f"warning {w['id']}") +for diagnostic in diagnose_table_model(p): + fail(diagnostic) + # --- derivability: chunks.example.jsonl must be EXACTLY what `ethos rag chunk` derives # from document.example.json (PRD §7: every derived artifact is reproducible from the # canonical JSON + config). Catches drift between examples (P2 reviewer finding).