Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/scripts/test_ci_workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ def test_schema_job_installs_jsonschema_and_validates_examples(self) -> None:

self.assertIn('pip install "jsonschema>=4.18"', text)
self.assertIn("python3 schemas/validate_examples.py", text)
self.assertIn("python3 schemas/test_table_model_validation.py", text)

def test_ci_workflow_guard_is_run_by_ci(self) -> None:
text = workflow_text()
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ jobs:
python-version: "3.12"
- run: pip install "jsonschema>=4.18"
- run: python3 schemas/validate_examples.py
- run: python3 schemas/test_table_model_validation.py
- name: Gate Zero result schema validation
run: |
python3 - <<'PY'
Expand Down
24 changes: 23 additions & 1 deletion crates/ethos-cli/src/assembly.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ fn assemble_payload(extraction: Extraction) -> Result<Payload, EthosError> {
extraction.warnings,
layout.warnings,
)?;
let tables = table_candidates(&extraction.pages, &spans)?;

Ok(Payload {
coordinate_system: CoordinateSystem {
Expand All @@ -82,14 +83,35 @@ fn assemble_payload(extraction: Extraction) -> Result<Payload, EthosError> {
pages: extraction.pages,
elements,
spans,
tables: Vec::new(),
tables,
chunks: Vec::new(),
regions,
security_warnings,
parser_warnings,
})
}

/// Collects regular-grid table candidates for every page, in page order.
///
/// Each page id is passed to `ethos_tables::detect_regular_grid_candidates` together with the
/// full `spans` slice and a default `TableCandidateConfig`. The starting ordinal handed to the
/// detector is 1 for the first page and grows by the number of tables returned for each page,
/// so it keeps increasing across the whole document.
///
/// # Errors
///
/// Propagates the first error returned by the detector; remaining pages are not visited.
fn table_candidates(
    pages: &[ethos_core::model::Page],
    spans: &[Span],
) -> Result<Vec<ethos_core::model::Table>, EthosError> {
    let mut collected = Vec::new();
    let mut ordinal = 1u32;

    for page in pages {
        let config = ethos_tables::TableCandidateConfig::default();
        let found =
            ethos_tables::detect_regular_grid_candidates(&page.id, spans, ordinal, &config)?;
        // NOTE(review): `as u32` truncates silently if `len()` ever exceeds `u32::MAX`;
        // consider `u32::try_from` once a suitable error variant is available.
        ordinal += found.len() as u32;
        collected.extend(found);
    }

    Ok(collected)
}

struct DocumentHashes {
config_sha256: String,
payload_sha256: String,
Expand Down
18 changes: 15 additions & 3 deletions crates/ethos-cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -564,7 +564,7 @@ mod tests {
}

#[test]
fn table_candidate_probe_report_detects_regular_grid_without_mutating_document() {
fn assembly_emits_regular_grid_table_candidates() {
let extraction = Extraction {
pages: vec![Page {
id: "p0001".to_string(),
Expand Down Expand Up @@ -593,7 +593,20 @@ mod tests {
)
.unwrap();

assert!(doc.payload.tables.is_empty());
doc.verify_integrity().unwrap();
assert_eq!(doc.payload.tables.len(), 1);
let table = &doc.payload.tables[0];
assert_eq!(table.id, "t0001");
assert_eq!(table.page_refs, vec!["p0001"]);
assert_eq!(table.n_rows, 3);
assert_eq!(table.n_cols, 2);
assert_eq!(table.header_rows, 1);
assert_eq!(table.header_cols, 0);
assert_eq!(table.cells.len(), 6);
assert_eq!(table.cells[0].text, "Name");
assert_eq!(table.cells[0].span_refs, vec!["s000001"]);
assert_eq!(table.cells[5].text, "12");
assert_eq!(table.cells[5].span_refs, vec!["s000006"]);

let bytes = table_candidate_probe_report_bytes(&doc).unwrap();
let value: serde_json::Value = serde_json::from_slice(&bytes).unwrap();
Expand All @@ -612,7 +625,6 @@ mod tests {
value["tables"][0]["markdown"],
"| Name | Score |\n| --- | --- |\n| Alpha | 10 |\n| Beta | 12 |\n"
);
assert!(doc.payload.tables.is_empty());
}

#[test]
Expand Down
117 changes: 117 additions & 0 deletions schemas/table_model_validation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
#!/usr/bin/env python3
#
# Copyright 2026 The Ethos maintainers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import annotations

from typing import Any


def diagnose_table_model(payload: dict[str, Any], label: str = "document.example.json") -> list[str]:
    """Return deterministic diagnostics for table/cell invariants JSON Schema cannot express.

    Args:
        payload: Document payload; tables are read from its ``"tables"`` list
            (a missing key is treated as no tables).
        label: Prefix placed at the start of every diagnostic message.

    Returns:
        Diagnostic strings in table order. Within a table: header checks first,
        then per-cell checks in cell order, then missing grid slots in row-major
        order. An empty list means no violation was found.

    Tables whose dimension fields are not integers, and cells whose position or
    span fields are not integers, are skipped without a diagnostic (presumably
    schema validation reports those -- confirm against the caller).
    """
    diagnostics: list[str] = []
    for table in payload.get("tables", []):
        table_id = table.get("id", "<unknown>")
        n_rows = table.get("n_rows")
        n_cols = table.get("n_cols")
        header_rows = table.get("header_rows", 0)
        header_cols = table.get("header_cols", 0)

        if not isinstance(n_rows, int) or not isinstance(n_cols, int):
            continue
        if not isinstance(header_rows, int) or not isinstance(header_cols, int):
            continue

        if header_rows > n_rows:
            diagnostics.append(
                f"{label}: table {table_id} header_rows {header_rows} exceeds n_rows {n_rows}"
            )
        if header_cols > n_cols:
            diagnostics.append(
                f"{label}: table {table_id} header_cols {header_cols} exceeds n_cols {n_cols}"
            )

        # Maps each grid slot (row, col) to the index of the first cell that claimed it.
        coverage: dict[tuple[int, int], int] = {}
        for cell_index, cell in enumerate(table.get("cells", [])):
            row = cell.get("row")
            col = cell.get("col")
            row_span = cell.get("row_span")
            col_span = cell.get("col_span")
            if not all(isinstance(v, int) for v in [row, col, row_span, col_span]):
                continue

            context = f"table {table_id} cell[{cell_index}]"

            # Negative origins and non-positive spans satisfy every upper-bound
            # check below, so they must be rejected explicitly. Otherwise a cell
            # such as row=-1,row_span=2 would record coverage outside the grid
            # and still count as covering row 0.
            if row < 0:
                diagnostics.append(f"{label}: {context} row {row} is negative")
                continue
            if col < 0:
                diagnostics.append(f"{label}: {context} col {col} is negative")
                continue
            if row_span < 1:
                diagnostics.append(
                    f"{label}: {context} row_span {row_span} must be at least 1"
                )
                continue
            if col_span < 1:
                diagnostics.append(
                    f"{label}: {context} col_span {col_span} must be at least 1"
                )
                continue

            # A cell that fails a bounds check is reported once and contributes no
            # coverage, so the slots it should have filled are reported as missing.
            if row >= n_rows:
                diagnostics.append(
                    f"{label}: {context} row {row} is outside n_rows {n_rows}"
                )
                continue
            if col >= n_cols:
                diagnostics.append(
                    f"{label}: {context} col {col} is outside n_cols {n_cols}"
                )
                continue
            if row + row_span > n_rows:
                diagnostics.append(
                    f"{label}: {context} row_span {row_span} from row {row} exceeds n_rows {n_rows}"
                )
                continue
            if col + col_span > n_cols:
                diagnostics.append(
                    f"{label}: {context} col_span {col_span} from col {col} exceeds n_cols {n_cols}"
                )
                continue

            if not bbox_contains(table.get("bbox"), cell.get("bbox")):
                diagnostics.append(f"{label}: {context} bbox is outside table bbox")

            for r in range(row, row + row_span):
                for c in range(col, col + col_span):
                    covered_by = coverage.get((r, c))
                    if covered_by is not None:
                        diagnostics.append(
                            f"{label}: {context} overlaps covered slot ({r},{c}) "
                            f"already covered by cell[{covered_by}]"
                        )
                    else:
                        coverage[(r, c)] = cell_index

        # Every slot of the declared n_rows x n_cols grid must be claimed by some cell.
        for r in range(n_rows):
            for c in range(n_cols):
                if (r, c) not in coverage:
                    diagnostics.append(
                        f"{label}: table {table_id} missing cell coverage at ({r},{c})"
                    )
    return diagnostics


def bbox_contains(outer: Any, inner: Any) -> bool:
    """Return whether ``inner`` lies within ``outer`` (edges may coincide).

    Both boxes are compared component-wise: the first two coordinates of
    ``inner`` must be >= those of ``outer`` and the last two must be <=.

    If either argument is not a four-integer list (see ``is_bbox``) the check is
    skipped and ``True`` is returned, so malformed boxes never produce a
    containment diagnostic here.
    """
    if not (is_bbox(outer) and is_bbox(inner)):
        return True
    return (
        inner[0] >= outer[0]
        and inner[1] >= outer[1]
        and inner[2] <= outer[2]
        and inner[3] <= outer[3]
    )


def is_bbox(value: Any) -> bool:
    """Return True only for a ``list`` of exactly four ``int`` coordinates.

    Tuples, lists of another length, and lists containing non-integers (for
    example floats) are rejected.
    """
    return (
        isinstance(value, list)
        and len(value) == 4
        and all(isinstance(coord, int) for coord in value)
    )
143 changes: 143 additions & 0 deletions schemas/test_table_model_validation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
#!/usr/bin/env python3
#
# Copyright 2026 The Ethos maintainers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import annotations

import copy
import json
from pathlib import Path
import unittest

from table_model_validation import diagnose_table_model

# Directory one level above the directory containing this file (parents[1] of the resolved path).
ROOT = Path(__file__).resolve().parents[1]
# Example document whose "payload" object serves as the baseline test fixture.
DOCUMENT_EXAMPLE = ROOT / "schemas" / "examples" / "document.example.json"


def valid_payload() -> dict:
    """Return the ``"payload"`` object of the example document as a fresh dict.

    The file is re-read and re-parsed on every call, so each caller gets an
    independent object tree it can mutate freely; no extra deep copy is needed.
    """
    return json.loads(DOCUMENT_EXAMPLE.read_text(encoding="utf-8"))["payload"]


def cell(row: int, col: int, bbox: list[int], row_span: int = 1, col_span: int = 1) -> dict:
    """Build a minimal table-cell dict for tests.

    Args:
        row: Row index of the cell origin.
        col: Column index of the cell origin.
        bbox: Bounding box stored under ``"bbox"`` as given.
        row_span: Number of rows the cell spans (default 1).
        col_span: Number of columns the cell spans (default 1).

    Returns:
        A dict with ``row``, ``col``, ``row_span``, ``col_span``, ``bbox`` and a
        ``text`` of the form ``"<row>,<col>"``.
    """
    return {
        "row": row,
        "col": col,
        "row_span": row_span,
        "col_span": col_span,
        "bbox": bbox,
        "text": f"{row},{col}",
    }


class TableModelValidationTests(unittest.TestCase):
    """Checks ``diagnose_table_model`` against mutated copies of the example payload.

    Each test starts from ``valid_payload()``, breaks exactly one invariant on the
    first table, and asserts on the resulting diagnostic text.
    """

    def test_valid_complete_grid_has_no_diagnostics(self) -> None:
        self.assertEqual(diagnose_table_model(valid_payload(), "fixture"), [])

    def test_header_rows_must_fit_table(self) -> None:
        payload = valid_payload()
        payload["tables"][0]["header_rows"] = 3

        # Only the first diagnostic is compared; header checks are emitted first.
        self.assertEqual(
            diagnose_table_model(payload, "fixture")[:1],
            ["fixture: table t0001 header_rows 3 exceeds n_rows 2"],
        )

    def test_header_cols_must_fit_table(self) -> None:
        payload = valid_payload()
        payload["tables"][0]["header_cols"] = 3

        self.assertEqual(
            diagnose_table_model(payload, "fixture")[:1],
            ["fixture: table t0001 header_cols 3 exceeds n_cols 2"],
        )

    def test_cell_row_must_fit_table_dimensions(self) -> None:
        payload = valid_payload()
        payload["tables"][0]["cells"][0]["row"] = 2

        self.assertIn(
            "fixture: table t0001 cell[0] row 2 is outside n_rows 2",
            diagnose_table_model(payload, "fixture"),
        )

    def test_cell_col_must_fit_table_dimensions(self) -> None:
        payload = valid_payload()
        payload["tables"][0]["cells"][0]["col"] = 2

        self.assertIn(
            "fixture: table t0001 cell[0] col 2 is outside n_cols 2",
            diagnose_table_model(payload, "fixture"),
        )

    def test_cell_row_span_must_fit_table_dimensions(self) -> None:
        payload = valid_payload()
        payload["tables"][0]["cells"][2]["row_span"] = 2

        self.assertIn(
            "fixture: table t0001 cell[2] row_span 2 from row 1 exceeds n_rows 2",
            diagnose_table_model(payload, "fixture"),
        )

    def test_cell_col_span_must_fit_table_dimensions(self) -> None:
        payload = valid_payload()
        payload["tables"][0]["cells"][3]["col_span"] = 2

        self.assertIn(
            "fixture: table t0001 cell[3] col_span 2 from col 1 exceeds n_cols 2",
            diagnose_table_model(payload, "fixture"),
        )

    def test_cell_bbox_must_stay_inside_table_bbox(self) -> None:
        payload = valid_payload()
        payload["tables"][0]["cells"][0]["bbox"] = [-1, 0, 100, 50]

        self.assertIn(
            "fixture: table t0001 cell[0] bbox is outside table bbox",
            diagnose_table_model(payload, "fixture"),
        )

    def test_overlapping_cell_coverage_fails_closed(self) -> None:
        payload = valid_payload()
        # Move the second cell onto the slot already claimed by the first.
        payload["tables"][0]["cells"][1]["col"] = 0

        self.assertIn(
            "fixture: table t0001 cell[1] overlaps covered slot (0,0) already covered by cell[0]",
            diagnose_table_model(payload, "fixture"),
        )

    def test_missing_cell_coverage_fails_closed(self) -> None:
        payload = valid_payload()
        # Drop the last cell so its grid slot is left unclaimed.
        payload["tables"][0]["cells"] = payload["tables"][0]["cells"][:-1]

        self.assertIn(
            "fixture: table t0001 missing cell coverage at (1,1)",
            diagnose_table_model(payload, "fixture"),
        )

    def test_spanned_cells_can_cover_multiple_grid_slots(self) -> None:
        payload = valid_payload()
        # One cell spanning both columns of row 0, plus two single cells in row 1.
        payload["tables"][0]["cells"] = [
            cell(0, 0, [7200, 13000, 54000, 16500], col_span=2),
            cell(1, 0, [7200, 16500, 30600, 20000]),
            cell(1, 1, [30600, 16500, 54000, 20000]),
        ]

        self.assertEqual(diagnose_table_model(copy.deepcopy(payload), "fixture"), [])


# Run the test suite when this file is executed directly (as CI does).
if __name__ == "__main__":
    unittest.main()
11 changes: 10 additions & 1 deletion schemas/validate_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from pathlib import Path

from font_policy_validation import diagnose_font_policy
from table_model_validation import diagnose_table_model

try:
from jsonschema import Draft202012Validator as Validator
Expand Down Expand Up @@ -151,9 +152,14 @@ def check_ref(kind: str, ref: str, ctx: str) -> None:
for t in p["tables"]:
for r in t["page_refs"]:
check_ref("page", r, f"table {t['id']}")
for r in t.get("warning_refs", []):
check_ref("warning", r, f"table {t['id']}")
for c in t["cells"]:
cell_ctx = f"table {t['id']} cell ({c['row']},{c['col']})"
for r in c.get("span_refs", []):
check_ref("span", r, cell_ctx)
for r in c.get("element_refs", []):
check_ref("element", r, f"table {t['id']} cell ({c['row']},{c['col']})")
check_ref("element", r, cell_ctx)
for ch in p["chunks"]:
for r in ch["element_refs"]:
check_ref("element", r, f"chunk {ch['id']}")
Expand All @@ -171,6 +177,9 @@ def check_ref(kind: str, ref: str, ctx: str) -> None:
if "region_ref" in w:
check_ref("region", w["region_ref"], f"warning {w['id']}")

for diagnostic in diagnose_table_model(p):
fail(diagnostic)

# --- derivability: chunks.example.jsonl must be EXACTLY what `ethos rag chunk` derives
# from document.example.json (PRD §7: every derived artifact is reproducible from the
# canonical JSON + config). Catches drift between examples (P2 reviewer finding).
Expand Down
Loading