class FixtureValidatorRegionRefTests(unittest.TestCase):
    """Exercise the validator's region reference checks.

    Every test hands a minimal document to the validator, captures what it
    prints, and asserts both the number of recorded failures and the message.
    """

    def setUp(self) -> None:
        VALIDATOR.failures = 0

    def tearDown(self) -> None:
        # Leave the shared counter clean for whichever test class runs next.
        VALIDATOR.failures = 0

    @staticmethod
    def make_region(**overrides) -> dict:
        """Return a valid region on page ``p0001``, updated with ``overrides``."""
        region = {
            "id": "r0001",
            "page": "p0001",
            "bbox": [10, 20, 30, 40],
            "kind": "unknown",
        }
        region.update(overrides)
        return region

    def test_region_refs_reject_unknown_pages(self) -> None:
        failures, output = self.validate_regions([self.make_region(page="p9999")])

        self.assertEqual(failures, 1)
        self.assertIn(
            "extraction.json regions[0] references unknown page 'p9999'",
            output,
        )

    def test_region_refs_reject_malformed_bbox(self) -> None:
        failures, output = self.validate_regions(
            [self.make_region(bbox=[30, 20, 10, 40])],
        )

        self.assertEqual(failures, 1)
        self.assertIn(
            "extraction.json regions[0].bbox must satisfy x0<=x1 and y0<=y1",
            output,
        )

    def test_region_refs_reject_non_four_integer_bbox(self) -> None:
        cases = [
            [10, 20, 30],
            [10, 20, 30, 40.5],
            [10, True, 30, 40],
        ]
        for bbox in cases:
            with self.subTest(bbox=bbox):
                # validate_regions resets the failure counter, so each case
                # is counted independently of the previous one.
                failures, output = self.validate_regions(
                    [self.make_region(bbox=bbox)],
                )

                self.assertEqual(failures, 1)
                self.assertIn(
                    "extraction.json regions[0].bbox must be a four-integer array",
                    output,
                )

    def test_region_refs_reject_bbox_outside_page_bounds(self) -> None:
        failures, output = self.validate_regions(
            [self.make_region(bbox=[10, 20, 1001, 40])],
        )

        self.assertEqual(failures, 1)
        self.assertIn(
            "extraction.json regions[0].bbox must stay within page bounds",
            output,
        )

    def test_region_refs_reject_unknown_warning_refs(self) -> None:
        failures, output = self.validate_regions(
            [self.make_region(warning_refs=["w9999"])],
        )

        self.assertEqual(failures, 1)
        self.assertIn(
            "extraction.json regions[0] references unknown warning 'w9999'",
            output,
        )

    def test_warning_refs_reject_unknown_region_refs(self) -> None:
        failures, output = self.validate_regions(
            [self.make_region()],
            [{"id": "w0001", "region_ref": "r9999"}],
        )

        self.assertEqual(failures, 1)
        self.assertIn(
            "extraction.json warnings[0] references unknown region 'r9999'",
            output,
        )

    def test_region_refs_reject_duplicate_region_ids(self) -> None:
        failures, output = self.validate_regions(
            [
                self.make_region(),
                self.make_region(bbox=[50, 60, 70, 80]),
            ],
        )

        self.assertEqual(failures, 1)
        self.assertIn("extraction.json regions[1].id duplicates 'r0001'", output)

    def test_layout_element_region_refs_reject_unknown_regions(self) -> None:
        failures, output = self.validate_layout_region_refs(
            {"elements": [{"id": "e000001", "region_ref": "r9999"}], "warnings": []},
        )

        self.assertEqual(failures, 1)
        self.assertIn("layout.json elements[0] references unknown region 'r9999'", output)

    def test_layout_warning_region_refs_reject_unknown_regions(self) -> None:
        failures, output = self.validate_layout_region_refs(
            {"elements": [], "warnings": [{"id": "w0001", "region_ref": "r9999"}]},
        )

        self.assertEqual(failures, 1)
        self.assertIn("layout.json warnings[0] references unknown region 'r9999'", output)

    def test_valid_region_refs_are_accepted(self) -> None:
        failures, output = self.validate_regions(
            [self.make_region(warning_refs=["w0001"])],
            [{"id": "w0001", "region_ref": "r0001"}],
        )

        self.assertEqual(failures, 0)
        self.assertEqual(output, "")

    def validate_regions(self, regions, warnings=None) -> tuple[int, str]:
        """Run the extraction region check and return ``(failures, stdout)``.

        ``warnings`` defaults to a single warning ``w0001`` only when it is
        omitted; an explicitly passed empty list is used as given.
        """
        if warnings is None:
            warnings = [{"id": "w0001"}]
        extraction = {
            "pages": [{"id": "p0001", "width": 1000, "height": 1000}],
            "regions": regions,
            "warnings": warnings,
        }
        VALIDATOR.failures = 0
        output = io.StringIO()
        with contextlib.redirect_stdout(output):
            VALIDATOR.validate_extraction_region_refs("extraction.json", extraction)
        return VALIDATOR.failures, output.getvalue()

    def validate_layout_region_refs(self, layout) -> tuple[int, str]:
        """Run the layout region check against one known region ``r0001``."""
        extraction = {
            "regions": [
                {"id": "r0001", "page": "p0001", "bbox": [10, 20, 30, 40]}
            ],
        }
        VALIDATOR.failures = 0
        output = io.StringIO()
        with contextlib.redirect_stdout(output):
            VALIDATOR.validate_layout_region_refs("layout.json", layout, extraction)
        return VALIDATOR.failures, output.getvalue()
def _is_plain_int(value) -> bool:
    """Return True for ``int`` values that are not ``bool``."""
    # bool is a subclass of int, so True/False must be excluded explicitly.
    return isinstance(value, int) and not isinstance(value, bool)


def _list_field(mapping, key: str) -> list:
    """Return ``mapping[key]`` when it is a list, otherwise an empty list."""
    value = mapping.get(key)
    return value if isinstance(value, list) else []


def _check_region_refs(ctx: str, field: str, items, region_ids) -> None:
    """Fail for each entry of ``items`` whose ``region_ref`` is invalid or unknown.

    Entries that are not objects, or that carry no ``region_ref`` key, are
    skipped: the reference is optional.
    """
    for index, item in enumerate(items):
        if not isinstance(item, dict) or "region_ref" not in item:
            continue
        item_ctx = f"{ctx} {field}[{index}]"
        region_ref = item["region_ref"]
        if not isinstance(region_ref, str) or not region_ref:
            fail(f"{item_ctx}.region_ref must be a non-empty string")
        elif region_ref not in region_ids:
            fail(f"{item_ctx} references unknown region '{region_ref}'")


def validate_extraction_region_refs(ctx: str, extraction) -> None:
    """Validate region ids, page refs, bboxes and warning links of an extraction.

    Problems are reported through ``fail``. A non-dict ``extraction`` is
    ignored, and ``pages``/``regions``/``warnings`` are treated as empty when
    missing or not lists.

    Checks, in order:
      1. every ``regions`` entry is an object and non-empty region ids are
         unique;
      2. every region names a known page, has a well-formed ``bbox`` (bounded
         by the page size when that size is available) and only cites known
         warning ids in ``warning_refs``;
      3. every warning that carries ``region_ref`` cites a known region id.

    Args:
        ctx: Label prefixed to each failure message.
        extraction: Parsed extraction document.
    """
    if not isinstance(extraction, dict):
        return
    pages = _list_field(extraction, "pages")
    regions = _list_field(extraction, "regions")
    warnings = _list_field(extraction, "warnings")

    # Page ids are tracked apart from page sizes so that a page whose
    # width/height are not plain integers is not misreported as "unknown".
    page_ids = set()
    page_dims = {}
    for page in pages:
        if not isinstance(page, dict) or not isinstance(page.get("id"), str):
            continue
        page_ids.add(page["id"])
        width, height = page.get("width"), page.get("height")
        if _is_plain_int(width) and _is_plain_int(height):
            page_dims[page["id"]] = (width, height)

    # First pass: collect ids so later reference checks see every region,
    # including ones declared after the item that cites them.
    region_ids = set()
    for region_index, region in enumerate(regions):
        region_ctx = f"{ctx} regions[{region_index}]"
        if not isinstance(region, dict):
            fail(f"{region_ctx} must be an object")
            continue
        region_id = region.get("id")
        if isinstance(region_id, str) and region_id:
            if region_id in region_ids:
                fail(f"{region_ctx}.id duplicates '{region_id}'")
            region_ids.add(region_id)

    warning_ids = {
        warning["id"]
        for warning in warnings
        if isinstance(warning, dict) and isinstance(warning.get("id"), str)
    }

    for region_index, region in enumerate(regions):
        if not isinstance(region, dict):
            continue  # already reported in the first pass
        region_ctx = f"{ctx} regions[{region_index}]"

        page = region.get("page")
        region_page_dims = None
        if not isinstance(page, str) or not page:
            fail(f"{region_ctx}.page must be a non-empty string")
        elif page not in page_ids:
            fail(f"{region_ctx} references unknown page '{page}'")
        elif page not in page_dims:
            # The page exists but cannot bound the bbox; say so accurately
            # instead of claiming the page is unknown.
            fail(f"{region_ctx} references page '{page}' without integer width and height")
        else:
            region_page_dims = page_dims[page]

        validate_bbox(region.get("bbox"), region_ctx, region_page_dims)
        for ref in string_ref_array(
            region.get("warning_refs", []), f"{region_ctx}.warning_refs"
        ):
            if ref not in warning_ids:
                fail(f"{region_ctx} references unknown warning '{ref}'")

    _check_region_refs(ctx, "warnings", warnings, region_ids)


def validate_layout_region_refs(ctx: str, layout, extraction) -> None:
    """Check that layout elements and warnings only cite existing regions.

    Region ids are read from ``extraction["regions"]``. Nothing is checked
    when either ``layout`` or ``extraction`` is not a dict.

    Args:
        ctx: Label prefixed to each failure message.
        layout: Parsed layout document; ``elements`` and ``warnings`` are
            treated as empty when missing or not lists.
        extraction: Parsed extraction document that declares the regions.
    """
    if not isinstance(layout, dict) or not isinstance(extraction, dict):
        return
    region_ids = set()
    for region in _list_field(extraction, "regions"):
        if not isinstance(region, dict):
            continue
        region_id = region.get("id")
        # Same id rule as validate_extraction_region_refs: non-empty strings.
        if isinstance(region_id, str) and region_id:
            region_ids.add(region_id)

    _check_region_refs(ctx, "elements", _list_field(layout, "elements"), region_ids)
    _check_region_refs(ctx, "warnings", _list_field(layout, "warnings"), region_ids)


def validate_bbox(value, ctx: str, page_dims=None) -> None:
    """Validate a ``[x0, y0, x1, y1]`` bounding box.

    At most one failure is reported: shape/type first, then coordinate order,
    then page bounds.

    Args:
        value: Candidate bbox; must be a list of exactly four non-bool ints.
        ctx: Label of the owning object; ``.bbox`` is appended in messages.
        page_dims: Optional ``(width, height)``; when ``None`` the bounds
            check is skipped.
    """
    if (
        not isinstance(value, list)
        or len(value) != 4
        or not all(_is_plain_int(item) for item in value)
    ):
        fail(f"{ctx}.bbox must be a four-integer array")
        return
    x0, y0, x1, y1 = value
    if x0 > x1 or y0 > y1:
        fail(f"{ctx}.bbox must satisfy x0<=x1 and y0<=y1")
        return
    if page_dims is None:
        return
    width, height = page_dims
    if x0 < 0 or y0 < 0 or x1 > width or y1 > height:
        fail(f"{ctx}.bbox must stay within page bounds")