From ec416b3adf3a14789e92b64c371496288cc0c207 Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 17 Jun 2026 15:18:54 +0530 Subject: [PATCH 01/51] Ground security report span previews Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 38 +++++++++++ schemas/test_security_report_validation.py | 76 ++++++++++++++++++++++ 2 files changed, 114 insertions(+) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index 5473686..367a576 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -36,6 +36,8 @@ "low_contrast_text_detected", } +TEXT_BACKED_FINDING_CODES = DEFAULT_CHUNK_EXCLUDED_CODES + def diagnose_security_report_example( document, @@ -204,6 +206,7 @@ def diagnose_findings_references(findings, refs, ctx, diagnostics): ) if "bbox" in finding: check_bbox(finding.get("bbox"), page, refs, ctx, item_ctx, diagnostics) + check_text_backed_finding(finding, refs, ctx, item_ctx, diagnostics) def diagnose_inventory_references(inventory_lists, refs, ctx, diagnostics): @@ -269,6 +272,41 @@ def check_bbox(bbox, page, refs, ctx, item_ctx, diagnostics): diagnostics.append(f"{ctx}: {item_ctx} bbox exceeds page {page} bounds") +def check_text_backed_finding(finding, refs, ctx, item_ctx, diagnostics): + if finding.get("code") not in TEXT_BACKED_FINDING_CODES: + return + span_ref = finding.get("span_ref") + if span_ref is None: + return + span = refs["spans"].get(span_ref) + if not isinstance(span, dict): + return + + if "bbox" not in finding: + diagnostics.append(f"{ctx}: {item_ctx} span_ref {span_ref} requires bbox") + elif finding.get("bbox") != span.get("bbox"): + diagnostics.append(f"{ctx}: {item_ctx} bbox must match span_ref {span_ref} bbox") + + span_text = span.get("text") + if not isinstance(span_text, str): + return + expected_preview = deterministic_preview(span_text) + if "text_preview" not in finding: + diagnostics.append( + f"{ctx}: {item_ctx} span_ref {span_ref} requires text_preview" + ) + elif finding.get("text_preview") != expected_preview: + diagnostics.append( + f"{ctx}: {item_ctx} text_preview must match span_ref {span_ref} text" + ) + + +def deterministic_preview(text): + if len(text) <= 120: + return text + return text[:120] + "\u2026" + + def finding_ctx(finding, index): finding_id = finding.get("id") if isinstance(finding_id, str): diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index c5370e7..14d1ac1 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -192,6 +192,82 @@ def test_finding_bbox_must_stay_inside_page_bounds(self) -> None: diagnostics, ) + def test_text_backed_finding_requires_span_bbox(self) -> None: + report = copy.deepcopy(self.report) + report["findings"][0].pop("bbox") + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + "security-report.example.json: finding f0001 span_ref s000003 requires bbox", + diagnostics, + ) + + def test_text_backed_finding_bbox_must_match_span_bbox(self) -> None: + report = copy.deepcopy(self.report) + report["findings"][0]["bbox"][0] = 101 + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + "security-report.example.json: finding f0001 bbox must match span_ref " + "s000003 bbox", + diagnostics, + ) + + def test_text_backed_finding_requires_text_preview(self) -> None: + report = copy.deepcopy(self.report) + report["findings"][0].pop("text_preview") + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + "security-report.example.json: finding f0001 span_ref s000003 requires text_preview", + diagnostics, + ) + + def test_text_backed_finding_preview_must_match_span_text(self) -> None: + report = copy.deepcopy(self.report) + report["findings"][0]["text_preview"] = "internal-draft" + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + "security-report.example.json: finding f0001 text_preview must match " + "span_ref s000003 text", + diagnostics, + ) + + def test_text_backed_finding_preview_uses_deterministic_truncation(self) -> None: + document = copy.deepcopy(self.document) + span_text = "x" * 121 + document["payload"]["spans"][2]["text"] = span_text + report = copy.deepcopy(self.report) + report["findings"][0]["text_preview"] = "x" * 120 + + diagnostics = diagnose_security_report_example(document, report) + + self.assertIn( + "security-report.example.json: finding f0001 text_preview must match " + "span_ref s000003 text", + diagnostics, + ) + + def test_text_backed_finding_accepts_deterministic_truncated_preview(self) -> None: + document = copy.deepcopy(self.document) + span_text = "x" * 121 + document["payload"]["spans"][2]["text"] = span_text + report = copy.deepcopy(self.report) + report["findings"][0]["text_preview"] = ("x" * 120) + "\u2026" + + diagnostics = diagnose_security_report_example(document, report) + + self.assertNotIn( + "security-report.example.json: finding f0001 text_preview must match " + "span_ref s000003 text", + diagnostics, + ) + def test_annotations_inventory_requires_matching_finding(self) -> None: report = copy.deepcopy(self.report) report["findings"] = [ From 103e7220925a1f15572b535cf5b8ba4952a526ee Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 17 Jun 2026 15:21:36 +0530 Subject: [PATCH 02/51] Validate security report artifact identity Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 30 +++++++++++++ schemas/test_security_report_validation.py | 49 ++++++++++++++++++++++ 2 files changed, 79 insertions(+) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index 367a576..b832a11 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -51,6 +51,7 @@ def diagnose_security_report_example( warnings.extend(payload.get("security_warnings", [])) warnings.extend(payload.get("parser_warnings", [])) refs = document_reference_index(payload) + diagnose_report_identity(document, report, ctx, diagnostics) findings = report.get("findings") if isinstance(report, dict) else [] if not isinstance(findings, list): @@ -150,6 +151,35 @@ def projected_warning_finding(warning): return projected +def diagnose_report_identity(document, report, ctx, diagnostics): + if not isinstance(document, dict) or not isinstance(report, dict): + return + expected = { + "schema_version": document.get("schema_version"), + "document_fingerprint": document.get("fingerprint"), + "source_fingerprint": nested_get(document, "source", "fingerprint"), + "profile.id": nested_get(document, "profile", "id"), + "profile.sha256": nested_get(document, "profile", "sha256"), + } + actual = { + "schema_version": report.get("schema_version"), + "document_fingerprint": report.get("document_fingerprint"), + "source_fingerprint": report.get("source_fingerprint"), + "profile.id": nested_get(report, "profile", "id"), + "profile.sha256": nested_get(report, "profile", "sha256"), + } + for key, want in expected.items(): + if want is not None and actual.get(key) != want: + diagnostics.append(f"{ctx}: {key} diverges from document example") + + +def nested_get(value, outer_key, inner_key): + outer = value.get(outer_key) if isinstance(value, dict) else None + if not isinstance(outer, dict): + return None + return outer.get(inner_key) + + def project_report_finding(finding): projected = { "code": finding.get("code"), diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index 14d1ac1..d5cd089 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -37,6 +37,55 @@ def setUp(self) -> None: def test_current_examples_are_coherent(self) -> None: self.assertEqual(diagnose_security_report_example(self.document, self.report), []) + def test_schema_version_must_match_document(self) -> None: + report = copy.deepcopy(self.report) + report["schema_version"] = "1.0.1" + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + "security-report.example.json: schema_version diverges from document example", + diagnostics, + ) + + def test_document_fingerprint_must_match_document(self) -> None: + report = copy.deepcopy(self.report) + report["document_fingerprint"] = "sha256:" + ("0" * 64) + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + "security-report.example.json: document_fingerprint diverges from document example", + diagnostics, + ) + + def test_source_fingerprint_must_match_document_source(self) -> None: + report = copy.deepcopy(self.report) + report["source_fingerprint"] = "sha256:" + ("0" * 64) + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + "security-report.example.json: source_fingerprint diverges from document example", + diagnostics, + ) + + def test_profile_identity_must_match_document_profile(self) -> None: + report = copy.deepcopy(self.report) + report["profile"]["id"] = "ethos-deterministic-v2" + report["profile"]["sha256"] = "0" * 64 + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + "security-report.example.json: profile.id diverges from document example", + diagnostics, + ) + self.assertIn( + "security-report.example.json: profile.sha256 diverges from document example", + diagnostics, + ) + def test_warning_derived_summary_must_match_document_warning_count(self) -> None: report = copy.deepcopy(self.report) report["summary"]["hidden_text_detected"] = 2 From b0104020e8257a32812514de540b276269d70ebc Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 17 Jun 2026 15:30:58 +0530 Subject: [PATCH 03/51] Validate security report finding ids Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 18 +++++++++++++++++ schemas/test_security_report_validation.py | 23 ++++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index b832a11..6704701 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -99,6 +99,7 @@ def diagnose_security_report_example( f"{ctx}: summary.{code} must be {expected_count} for report findings" ) + diagnose_finding_ids(findings, ctx, diagnostics) diagnose_findings_references(findings, refs, ctx, diagnostics) inventories = report.get("inventories") if isinstance(report, dict) else {} @@ -173,6 +174,23 @@ def diagnose_report_identity(document, report, ctx, diagnostics): diagnostics.append(f"{ctx}: {key} diverges from document example") +def diagnose_finding_ids(findings, ctx, diagnostics): + seen = set() + for index, finding in enumerate(findings): + if not isinstance(finding, dict): + continue + finding_id = finding.get("id") + expected_id = f"f{index + 1:04d}" + if finding_id != expected_id: + diagnostics.append( + f"{ctx}: findings[{index}].id must be {expected_id} for deterministic numbering" + ) + if isinstance(finding_id, str): + if finding_id in seen: + diagnostics.append(f"{ctx}: duplicate finding id {finding_id}") + seen.add(finding_id) + + def nested_get(value, outer_key, inner_key): outer = value.get(outer_key) if isinstance(value, dict) else None if not isinstance(outer, dict): diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index d5cd089..08c3f3f 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -86,6 +86,29 @@ def test_profile_identity_must_match_document_profile(self) -> None: diagnostics, ) + def test_finding_ids_must_be_contiguous_in_report_order(self) -> None: + report = copy.deepcopy(self.report) + report["findings"][1]["id"] = "f0004" + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + "security-report.example.json: findings[1].id must be f0002 " + "for deterministic numbering", + diagnostics, + ) + + def test_finding_ids_must_be_unique(self) -> None: + report = copy.deepcopy(self.report) + report["findings"][1]["id"] = "f0001" + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + "security-report.example.json: duplicate finding id f0001", + diagnostics, + ) + def test_warning_derived_summary_must_match_document_warning_count(self) -> None: report = copy.deepcopy(self.report) report["summary"]["hidden_text_detected"] = 2 From 2c634758505e948cf44011480ebde8567a660626 Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 17 Jun 2026 15:34:04 +0530 Subject: [PATCH 04/51] Validate security report finding messages Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 22 +++++++++++++ schemas/test_security_report_validation.py | 36 ++++++++++++++++++++++ 2 files changed, 58 insertions(+) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index 6704701..953c667 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -38,6 +38,12 @@ TEXT_BACKED_FINDING_CODES = DEFAULT_CHUNK_EXCLUDED_CODES +FINDING_MESSAGE_TEMPLATES = { + "hidden_text_detected": "hidden text detected: excluded from default chunks", + "annotations_present": "annotations present on page", + "external_links_present": "external links present on page", +} + def diagnose_security_report_example( document, @@ -100,6 +106,7 @@ def diagnose_security_report_example( ) diagnose_finding_ids(findings, ctx, diagnostics) + diagnose_finding_messages(findings, ctx, diagnostics) diagnose_findings_references(findings, refs, ctx, diagnostics) inventories = report.get("inventories") if isinstance(report, dict) else {} @@ -191,6 +198,21 @@ def diagnose_finding_ids(findings, ctx, diagnostics): seen.add(finding_id) +def diagnose_finding_messages(findings, ctx, diagnostics): + for index, finding in enumerate(findings): + if not isinstance(finding, dict): + continue + code = finding.get("code") + expected_message = FINDING_MESSAGE_TEMPLATES.get(code) + if expected_message is None: + continue + actual_message = finding.get("message") + if actual_message != expected_message: + diagnostics.append( + f"{ctx}: {finding_ctx(finding, index)} message must match fixed template for {code}" + ) + + def nested_get(value, outer_key, inner_key): outer = value.get(outer_key) if isinstance(value, dict) else None if not isinstance(outer, dict): diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index 08c3f3f..6195217 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -109,6 +109,42 @@ def test_finding_ids_must_be_unique(self) -> None: diagnostics, ) + def test_hidden_text_finding_message_must_match_fixed_template(self) -> None: + report = copy.deepcopy(self.report) + report["findings"][0]["message"] = "hidden text changed" + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + "security-report.example.json: finding f0001 message must match " + "fixed template for hidden_text_detected", + diagnostics, + ) + + def test_annotation_finding_message_must_match_fixed_template(self) -> None: + report = copy.deepcopy(self.report) + report["findings"][1]["message"] = "annotations changed" + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + "security-report.example.json: finding f0002 message must match " + "fixed template for annotations_present", + diagnostics, + ) + + def test_external_link_finding_message_must_match_fixed_template(self) -> None: + report = copy.deepcopy(self.report) + report["findings"][2]["message"] = "links changed" + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + "security-report.example.json: finding f0003 message must match " + "fixed template for external_links_present", + diagnostics, + ) + def test_warning_derived_summary_must_match_document_warning_count(self) -> None: report = copy.deepcopy(self.report) report["summary"]["hidden_text_detected"] = 2 From fb741ed14699c9aa8a6ad098c37e3d7015250570 Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 17 Jun 2026 15:36:31 +0530 Subject: [PATCH 05/51] Validate security report exclusion flags Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 16 ++++++++++++++++ schemas/test_security_report_validation.py | 17 +++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index 953c667..97b2522 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -107,6 +107,7 @@ def diagnose_security_report_example( diagnose_finding_ids(findings, ctx, diagnostics) diagnose_finding_messages(findings, ctx, diagnostics) + diagnose_finding_exclusion_flags(findings, ctx, diagnostics) diagnose_findings_references(findings, refs, ctx, diagnostics) inventories = report.get("inventories") if isinstance(report, dict) else {} @@ -213,6 +214,21 @@ def diagnose_finding_messages(findings, ctx, diagnostics): ) +def diagnose_finding_exclusion_flags(findings, ctx, diagnostics): + for index, finding in enumerate(findings): + if not isinstance(finding, dict): + continue + code = finding.get("code") + if not isinstance(code, str): + continue + expected = code in DEFAULT_CHUNK_EXCLUDED_CODES + if finding.get("excluded_from_default_chunks") != expected: + diagnostics.append( + f"{ctx}: {finding_ctx(finding, index)} excluded_from_default_chunks " + f"must be {str(expected).lower()} for {code}" + ) + + def nested_get(value, outer_key, inner_key): outer = value.get(outer_key) if isinstance(value, dict) else None if not isinstance(outer, dict): diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index 6195217..ca2ff88 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -222,6 +222,23 @@ def test_default_excluded_warning_codes_must_be_flagged(self) -> None: "security-report.example.json: missing warning-derived finding for hidden_text_detected", diagnostics, ) + self.assertIn( + "security-report.example.json: finding f0001 excluded_from_default_chunks " + "must be true for hidden_text_detected", + diagnostics, + ) + + def test_non_exclusion_finding_codes_must_not_be_default_excluded(self) -> None: + report = copy.deepcopy(self.report) + report["findings"][1]["excluded_from_default_chunks"] = True + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + "security-report.example.json: finding f0002 excluded_from_default_chunks " + "must be false for annotations_present", + diagnostics, + ) def test_finding_page_refs_must_exist_in_document(self) -> None: report = copy.deepcopy(self.report) From 78cbfeef658c9549d3d9df11ac6abc1bb45626f6 Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 17 Jun 2026 15:40:06 +0530 Subject: [PATCH 06/51] Validate security report span ownership Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 24 ++++++++++++++ schemas/test_security_report_validation.py | 38 ++++++++++++++++++++++ 2 files changed, 62 insertions(+) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index 97b2522..af98de8 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -322,6 +322,8 @@ def check_locator_ref(item, key, ref_kind, refs, ctx, item_ctx, diagnostics): diagnostics.append( f"{ctx}: {item_ctx} {key} {ref} page {target_page} does not match page {page}" ) + if key == "span_ref": + check_element_span_ownership(item, refs, ctx, item_ctx, ref, diagnostics) def check_page_ref(page, refs, ctx, item_ctx, diagnostics): @@ -363,6 +365,9 @@ def check_text_backed_finding(finding, refs, ctx, item_ctx, diagnostics): return span_ref = finding.get("span_ref") if span_ref is None: + diagnostics.append( + f"{ctx}: {item_ctx} requires span_ref for {finding.get('code')}" + ) return span = refs["spans"].get(span_ref) if not isinstance(span, dict): @@ -393,6 +398,25 @@ def deterministic_preview(text): return text[:120] + "\u2026" +def check_element_span_ownership(item, refs, ctx, item_ctx, span_ref, diagnostics): + element_ref = item.get("element_ref") + if element_ref is None: + return + element = refs["elements"].get(element_ref) + if not isinstance(element, dict): + return + span_refs = element.get("span_refs", []) + if not isinstance(span_refs, list): + diagnostics.append( + f"{ctx}: {item_ctx} element_ref {element_ref} span_refs must be an array" + ) + return + if span_ref not in span_refs: + diagnostics.append( + f"{ctx}: {item_ctx} span_ref {span_ref} is not owned by element_ref {element_ref}" + ) + + def finding_ctx(finding, index): finding_id = finding.get("id") if isinstance(finding_id, str): diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index ca2ff88..5de2a5f 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -284,6 +284,44 @@ def test_finding_span_refs_must_match_finding_page(self) -> None: diagnostics, ) + def test_text_backed_finding_requires_span_ref(self) -> None: + report = copy.deepcopy(self.report) + report["findings"][0].pop("span_ref") + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + "security-report.example.json: finding f0001 requires span_ref for " + "hidden_text_detected", + diagnostics, + ) + + def test_finding_span_ref_must_be_owned_by_element_ref_when_both_present(self) -> None: + report = copy.deepcopy(self.report) + report["findings"][0]["element_ref"] = "e000001" + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + "security-report.example.json: finding f0001 span_ref s000003 is not " + "owned by element_ref e000001", + diagnostics, + ) + + def test_finding_element_span_refs_must_be_deterministic_array(self) -> None: + document = copy.deepcopy(self.document) + document["payload"]["elements"][0]["span_refs"] = "s000001" + report = copy.deepcopy(self.report) + report["findings"][0]["element_ref"] = "e000001" + + diagnostics = diagnose_security_report_example(document, report) + + self.assertIn( + "security-report.example.json: finding f0001 element_ref e000001 " + "span_refs must be an array", + diagnostics, + ) + def test_finding_bbox_must_have_page(self) -> None: report = copy.deepcopy(self.report) report["findings"][0].pop("page") From b92622423e6fd2717d581b8d0677b596f693e11f Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 17 Jun 2026 16:03:33 +0530 Subject: [PATCH 07/51] Add source-only security report command Signed-off-by: docushell-admin --- .github/scripts/test_security_report_alpha.py | 4 +- Makefile | 1 + crates/ethos-cli/src/cmd/mod.rs | 1 + crates/ethos-cli/src/cmd/security.rs | 291 +++++++++++++++++ crates/ethos-cli/src/main.rs | 23 ++ crates/ethos-cli/tests/security_report.rs | 296 ++++++++++++++++++ 6 files changed, 615 insertions(+), 1 deletion(-) create mode 100644 crates/ethos-cli/src/cmd/security.rs create mode 100644 crates/ethos-cli/tests/security_report.rs diff --git a/.github/scripts/test_security_report_alpha.py b/.github/scripts/test_security_report_alpha.py index f3e32d8..1cbfaf4 100644 --- a/.github/scripts/test_security_report_alpha.py +++ b/.github/scripts/test_security_report_alpha.py @@ -58,6 +58,7 @@ def test_target_composes_security_report_artifact_gates(self) -> None: block = target_block("security-report-alpha") required = [ + "cargo test --locked -p ethos-cli --test security_report", "$(PYTHON) schemas/validate_examples.py", "$(PYTHON) schemas/test_security_report_validation.py", "$(PYTHON) .github/scripts/test_security_report_alpha.py", @@ -69,7 +70,8 @@ def test_target_composes_security_report_artifact_gates(self) -> None: def test_target_stays_security_report_scoped(self) -> None: block = target_block("security-report-alpha") - self.assertNotIn("cargo test", block) + self.assertNotIn("cargo test --locked -p ethos-cli --test rag", block) + self.assertNotIn("cargo test --locked -p ethos-cli --test verify", block) self.assertNotIn("rag-chunk-alpha", block) self.assertNotIn("layout-evaluator-alpha", block) self.assertNotIn("python-surface-test", block) diff --git a/Makefile b/Makefile index ca2f241..d3536c9 100644 --- a/Makefile +++ b/Makefile @@ -41,6 +41,7 @@ rag-chunk-alpha: git diff --check security-report-alpha: + cargo test --locked -p ethos-cli --test security_report $(PYTHON) schemas/validate_examples.py $(PYTHON) schemas/test_security_report_validation.py $(PYTHON) .github/scripts/test_security_report_alpha.py diff --git a/crates/ethos-cli/src/cmd/mod.rs b/crates/ethos-cli/src/cmd/mod.rs index 3784696..d257fba 100644 --- a/crates/ethos-cli/src/cmd/mod.rs +++ b/crates/ethos-cli/src/cmd/mod.rs @@ -16,4 +16,5 @@ pub(crate) mod doc; pub(crate) mod rag; +pub(crate) mod security; pub(crate) mod verify; diff --git a/crates/ethos-cli/src/cmd/security.rs b/crates/ethos-cli/src/cmd/security.rs new file mode 100644 index 0000000..1baf45e --- /dev/null +++ b/crates/ethos-cli/src/cmd/security.rs @@ -0,0 +1,291 @@ +/* + * Copyright 2026 The Ethos maintainers + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +use std::cmp::Ordering; +use std::collections::BTreeMap; + +use ethos_core::codes::WarningCode; +use ethos_core::error::EthosError; +use ethos_core::model::{Document, Element, Page, Span, Warning}; + +use crate::{read_document, write_output, Failure, SecurityReportArgs}; + +pub(crate) fn security_report(args: SecurityReportArgs) -> Result<(), Failure> { + let doc = read_document(&args.input)?; + let out = security_report_output_bytes(&doc)?; + write_output(args.out, &out) +} + +fn security_report_output_bytes(doc: &Document) -> Result, Failure> { + let refs = SecurityReportRefs::new(doc); + let mut warnings = Vec::with_capacity(doc.payload.security_warnings.len()); + for warning in &doc.payload.security_warnings { + if !warning.code.is_security() { + return Err(Failure::Usage(format!( + "security report warning {} ({}) is not a security warning code", + warning.id, + warning.code.as_str() + ))); + } + warnings.push(warning); + } + warnings.sort_by(|left, right| warning_order(left, right)); + + let mut summary: BTreeMap = BTreeMap::new(); + let mut findings = Vec::with_capacity(warnings.len()); + for (index, warning) in warnings.iter().enumerate() { + block_inventory_backed_warning(warning)?; + *summary + .entry(warning.code.as_str().to_string()) + .or_insert(0) += 1; + findings.push(finding_record(index, warning, &refs)?); + } + + let value = serde_json::json!({ + "schema_version": doc.schema_version.as_str(), + "document_fingerprint": doc.fingerprint.as_str(), + "source_fingerprint": doc.source.fingerprint.as_str(), + "profile": { + "id": doc.profile.id.as_str(), + "sha256": doc.profile.sha256.as_str(), + }, + "summary": summary, + "findings": findings, + "inventories": { + "annotations": [], + "actions": [], + "attachments": [], + "scripts": [], + "links": [], + }, + }); + let mut bytes = + ethos_core::c14n::c14n_bytes(&value).map_err(|e| EthosError::internal(e.message))?; + bytes.push(b'\n'); + Ok(bytes) +} + +struct SecurityReportRefs<'a> { + pages: BTreeMap<&'a str, &'a Page>, + elements: BTreeMap<&'a str, &'a Element>, + spans: BTreeMap<&'a str, &'a Span>, +} + +impl<'a> SecurityReportRefs<'a> { + fn new(doc: &'a Document) -> Self { + Self { + pages: doc + .payload + .pages + .iter() + .map(|page| (page.id.as_str(), page)) + .collect(), + elements: doc + .payload + .elements + .iter() + .map(|element| (element.id.as_str(), element)) + .collect(), + spans: doc + .payload + .spans + .iter() + .map(|span| (span.id.as_str(), span)) + .collect(), + } + } +} + +fn inventory_backed_warning_code(code: WarningCode) -> bool { + matches!( + code, + WarningCode::AnnotationsPresent + | WarningCode::ExternalLinksPresent + | WarningCode::UnsupportedAnnotation + ) +} + +fn text_backed_warning_code(code: WarningCode) -> bool { + matches!( + code, + WarningCode::HiddenTextDetected + | WarningCode::OffPageTextDetected + | WarningCode::LowContrastTextDetected + ) +} + +fn excludes_from_default_chunks(code: WarningCode) -> bool { + text_backed_warning_code(code) +} + +fn warning_order(left: &Warning, right: &Warning) -> Ordering { + ( + left.code.as_str(), + left.page.as_deref().unwrap_or(""), + left.element_ref.as_deref().unwrap_or(""), + left.span_ref.as_deref().unwrap_or(""), + left.region_ref.as_deref().unwrap_or(""), + left.message.as_str(), + ) + .cmp(&( + right.code.as_str(), + right.page.as_deref().unwrap_or(""), + right.element_ref.as_deref().unwrap_or(""), + right.span_ref.as_deref().unwrap_or(""), + right.region_ref.as_deref().unwrap_or(""), + right.message.as_str(), + )) +} + +fn block_inventory_backed_warning(warning: &Warning) -> Result<(), Failure> { + if inventory_backed_warning_code(warning.code) { + return Err(Failure::Usage(format!( + "security report warning {} ({}) requires inventory data not available in canonical document", + warning.id, + warning.code.as_str() + ))); + } + Ok(()) +} + +fn finding_record( + index: usize, + warning: &Warning, + refs: &SecurityReportRefs<'_>, +) -> Result { + validate_warning_refs(warning, refs)?; + let mut finding = serde_json::Map::new(); + finding.insert( + "id".to_string(), + serde_json::Value::String(format!("f{:04}", index + 1)), + ); + finding.insert( + "code".to_string(), + serde_json::Value::String(warning.code.as_str().to_string()), + ); + finding.insert( + "message".to_string(), + serde_json::Value::String(warning.message.clone()), + ); + if let Some(page) = &warning.page { + finding.insert("page".to_string(), serde_json::Value::String(page.clone())); + } + if let Some(element_ref) = &warning.element_ref { + finding.insert( + "element_ref".to_string(), + serde_json::Value::String(element_ref.clone()), + ); + } + if let Some(span_ref) = &warning.span_ref { + finding.insert( + "span_ref".to_string(), + serde_json::Value::String(span_ref.clone()), + ); + } + if text_backed_warning_code(warning.code) { + let span_ref = warning + .span_ref + .as_deref() + .expect("text warning refs validated"); + let span = refs + .spans + .get(span_ref) + .expect("text warning span_ref validated"); + finding.insert("bbox".to_string(), serde_json::json!(span.bbox.to_array())); + finding.insert( + "text_preview".to_string(), + serde_json::Value::String(deterministic_preview(&span.text)), + ); + } + finding.insert( + "excluded_from_default_chunks".to_string(), + serde_json::Value::Bool(excludes_from_default_chunks(warning.code)), + ); + Ok(serde_json::Value::Object(finding)) +} + +fn validate_warning_refs(warning: &Warning, refs: &SecurityReportRefs<'_>) -> Result<(), Failure> { + if let Some(page) = warning.page.as_deref() { + if !refs.pages.contains_key(page) { + return Err(Failure::Usage(format!( + "security report warning {} references unknown page {}", + warning.id, page + ))); + } + } + let element = match warning.element_ref.as_deref() { + Some(element_ref) => { + let Some(element) = refs.elements.get(element_ref) else { + return Err(Failure::Usage(format!( + "security report warning {} references unknown element_ref {}", + warning.id, element_ref + ))); + }; + if let Some(page) = warning.page.as_deref() { + if element.page != page { + return Err(Failure::Usage(format!( + "security report warning {} element_ref {} page {} does not match page {}", + warning.id, element_ref, element.page, page + ))); + } + } + Some(*element) + } + None => None, + }; + if text_backed_warning_code(warning.code) && warning.span_ref.is_none() { + return Err(Failure::Usage(format!( + "security report warning {} ({}) requires span_ref", + warning.id, + warning.code.as_str() + ))); + } + if let Some(span_ref) = warning.span_ref.as_deref() { + let Some(span) = refs.spans.get(span_ref) else { + return Err(Failure::Usage(format!( + "security report warning {} references unknown span_ref {}", + warning.id, span_ref + ))); + }; + if let Some(page) = warning.page.as_deref() { + if span.page != page { + return Err(Failure::Usage(format!( + "security report warning {} span_ref {} page {} does not match page {}", + warning.id, span_ref, span.page, page + ))); + } + } + if let Some(element) = element { + if !element.span_refs.iter().any(|id| id == span_ref) { + return Err(Failure::Usage(format!( + "security report warning {} span_ref {} is not owned by element_ref {}", + warning.id, span_ref, element.id + ))); + } + } + } + Ok(()) +} + +fn deterministic_preview(text: &str) -> String { + let mut chars = text.chars(); + let preview: String = chars.by_ref().take(120).collect(); + if chars.next().is_some() { + format!("{preview}\u{2026}") + } else { + preview + } +} diff --git a/crates/ethos-cli/src/main.rs b/crates/ethos-cli/src/main.rs index e9a1819..4128d82 100644 --- a/crates/ethos-cli/src/main.rs +++ b/crates/ethos-cli/src/main.rs @@ -69,6 +69,11 @@ enum Command { #[command(subcommand)] command: RagCommand, }, + /// Security report artifacts (ethos-security) + Security { + #[command(subcommand)] + command: SecurityCommand, + }, /// Citation evidence verification (ethos-verify) Verify(VerifyArgs), /// Recompute and check a document fingerprint @@ -167,6 +172,12 @@ enum RagCommand { Chunk(RagChunkArgs), } +#[derive(Subcommand)] +enum SecurityCommand { + /// Derive security_report.json from canonical document warnings + Report(SecurityReportArgs), +} + #[derive(Args)] pub(crate) struct RagChunkArgs { /// Canonical document (`*.ethos.json`) @@ -176,6 +187,15 @@ pub(crate) struct RagChunkArgs { pub(crate) out: Option, } +#[derive(Args)] +pub(crate) struct SecurityReportArgs { + /// Canonical document (`*.ethos.json`) + pub(crate) input: PathBuf, + /// Output path for security_report.json (default: stdout) + #[arg(long)] + pub(crate) out: Option, +} + #[derive(Args)] pub(crate) struct VerifyArgs { /// Grounding input: canonical Ethos document, or foreign output with --grounding @@ -304,6 +324,9 @@ fn run(cli: Cli) -> Result<(), Failure> { Command::Rag { command: RagCommand::Chunk(args), } => cmd::rag::rag_chunk(args), + Command::Security { + command: SecurityCommand::Report(args), + } => cmd::security::security_report(args), Command::Verify(args) => cmd::verify::verify(args), Command::Fingerprint(args) => cmd::doc::fingerprint(args), Command::PdfiumWorker(args) => cmd::doc::pdfium_worker(args), diff --git a/crates/ethos-cli/tests/security_report.rs b/crates/ethos-cli/tests/security_report.rs new file mode 100644 index 0000000..022edb7 --- /dev/null +++ b/crates/ethos-cli/tests/security_report.rs @@ -0,0 +1,296 @@ +/* + * Copyright 2026 The Ethos maintainers + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +use std::path::{Path, PathBuf}; +use std::process::{Command, Output}; +use std::time::{SystemTime, UNIX_EPOCH}; + +use ethos_core::model::Document; +use serde_json::Value; + +fn ethos_bin() -> &'static str { + env!("CARGO_BIN_EXE_ethos") +} + +fn repo_root() -> PathBuf { + Path::new(env!("CARGO_MANIFEST_DIR")).join("../..") +} + +fn document_example() -> PathBuf { + repo_root().join("schemas/examples/document.example.json") +} + +fn run_ethos(args: &[&str]) -> Output { + Command::new(ethos_bin()) + .args(args) + .output() + .expect("ethos command runs") +} + +fn temp_path(name: &str, extension: &str) -> PathBuf { + let nanos = SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("clock after unix epoch") + .as_nanos(); + std::env::temp_dir().join(format!("ethos-{name}-{nanos}.{extension}")) +} + +fn json_file(path: impl AsRef) -> Value { + let bytes = std::fs::read(path).expect("JSON fixture is readable"); + serde_json::from_slice(&bytes).expect("JSON fixture parses") +} + +fn temp_json(name: &str, json: &str) -> PathBuf { + let path = temp_path(name, "json"); + std::fs::write(&path, json).expect("temp JSON is writable"); + path +} + +fn document_with_mutated_warning(name: &str, mutate: impl FnOnce(&mut Value)) -> PathBuf { + let mut doc = json_file(document_example()); + mutate(&mut doc); + + let mut doc: Document = serde_json::from_value(doc).expect("document parses"); + doc.payload_sha256 = doc.compute_payload_sha256().expect("payload hash computes"); + doc.fingerprint = doc.compute_fingerprint().expect("fingerprint computes"); + temp_json( + name, + &serde_json::to_string(&doc).expect("document serializes"), + ) +} + +#[test] +fn security_report_derives_text_backed_warning_from_document() { + let output = run_ethos(&["security", "report", document_example().to_str().unwrap()]); + + assert!( + output.status.success(), + "ethos security report failed\nstatus: {:?}\nstderr:\n{}", + output.status.code(), + String::from_utf8_lossy(&output.stderr) + ); + assert_eq!(output.stderr, b""); + + let report: Value = serde_json::from_slice(&output.stdout).expect("report JSON parses"); + assert_eq!(report["schema_version"], "1.0.0"); + assert_eq!( + report["document_fingerprint"], + "sha256:b5d30710d0c25cc38d8dec924ecaf57ae4f81276dd5dc14d75cb3b5b6bde62d3" + ); + assert_eq!(report["summary"]["hidden_text_detected"], 1); + assert_eq!(report["findings"].as_array().unwrap().len(), 1); + assert_eq!(report["findings"][0]["id"], "f0001"); + assert_eq!(report["findings"][0]["code"], "hidden_text_detected"); + assert_eq!( + report["findings"][0]["message"], + "hidden text detected: excluded from default chunks" + ); + assert_eq!(report["findings"][0]["page"], "p0001"); + assert_eq!(report["findings"][0]["span_ref"], "s000003"); + assert_eq!( + report["findings"][0]["bbox"], + serde_json::json!([100, 79100, 6000, 79200]) + ); + assert_eq!( + report["findings"][0]["text_preview"], + "internal-draft-do-not-cite" + ); + assert_eq!(report["findings"][0]["excluded_from_default_chunks"], true); + for key in ["annotations", "actions", "attachments", "scripts", "links"] { + assert_eq!(report["inventories"][key], serde_json::json!([])); + } +} + +#[test] +fn security_report_out_writes_same_bytes_as_stdout() { + let stdout_output = run_ethos(&["security", "report", document_example().to_str().unwrap()]); + assert!( + stdout_output.status.success(), + "ethos security report failed\nstatus: {:?}\nstderr:\n{}", + stdout_output.status.code(), + String::from_utf8_lossy(&stdout_output.stderr) + ); + + let out = temp_path("security-report-out", "json"); + let output = run_ethos(&[ + "security", + "report", + document_example().to_str().unwrap(), + "--out", + out.to_str().unwrap(), + ]); + + assert!( + output.status.success(), + "ethos security report --out failed\nstatus: {:?}\nstderr:\n{}", + output.status.code(), + String::from_utf8_lossy(&output.stderr) + ); + assert_eq!(output.stdout, b""); + assert_eq!(output.stderr, b""); + assert_eq!( + std::fs::read(&out).expect("--out report is readable"), + stdout_output.stdout + ); + let _ = std::fs::remove_file(out); +} + +#[test] +fn security_report_output_is_byte_identical_across_runs() { + let first = run_ethos(&["security", "report", document_example().to_str().unwrap()]); + let second = run_ethos(&["security", "report", document_example().to_str().unwrap()]); + + assert!( + first.status.success(), + "first ethos security report failed\nstatus: {:?}\nstderr:\n{}", + first.status.code(), + String::from_utf8_lossy(&first.stderr) + ); + assert!( + second.status.success(), + "second ethos security report failed\nstatus: {:?}\nstderr:\n{}", + second.status.code(), + String::from_utf8_lossy(&second.stderr) + ); + assert_eq!(first.stderr, b""); + assert_eq!(second.stderr, b""); + assert_eq!(first.stdout, second.stdout); +} + +#[test] +fn security_report_rejects_inventory_backed_warning_without_inventory_source() { + for code in [ + "annotations_present", + "external_links_present", + "unsupported_annotation", + ] { + let document = document_with_mutated_warning("inventory-backed-security-warning", |doc| { + doc["payload"]["security_warnings"][0]["code"] = serde_json::json!(code); + doc["payload"]["security_warnings"][0]["message"] = + serde_json::json!("inventory-backed warning present"); + doc["payload"]["security_warnings"][0] + .as_object_mut() + .unwrap() + .remove("span_ref"); + }); + + let output = run_ethos(&["security", "report", document.to_str().unwrap()]); + + assert_eq!(output.status.code(), Some(2), "{code}"); + assert_eq!(output.stdout, b"", "{code}"); + assert!( + String::from_utf8_lossy(&output.stderr).contains(&format!( + "security report warning w0001 ({code}) requires inventory data not available in canonical document" + )), + "{code}: {}", + String::from_utf8_lossy(&output.stderr) + ); + } +} + +#[test] +fn security_report_ignores_reportable_parser_warnings() { + let document = document_with_mutated_warning("reportable-parser-warning", |doc| { + doc["payload"]["parser_warnings"][0]["code"] = serde_json::json!("hidden_text_detected"); + doc["payload"]["parser_warnings"][0]["message"] = + serde_json::json!("parser warning must stay outside security report"); + doc["payload"]["parser_warnings"][0]["span_ref"] = serde_json::json!("s000002"); + doc["payload"]["parser_warnings"][0] + .as_object_mut() + .unwrap() + .remove("element_ref"); + }); + + let output = run_ethos(&["security", "report", document.to_str().unwrap()]); + + assert!( + output.status.success(), + "ethos security report failed\nstatus: {:?}\nstderr:\n{}", + output.status.code(), + String::from_utf8_lossy(&output.stderr) + ); + let report: Value = serde_json::from_slice(&output.stdout).expect("report JSON parses"); + assert_eq!(report["summary"]["hidden_text_detected"], 1); + assert_eq!(report["findings"].as_array().unwrap().len(), 1); +} + +#[test] +fn security_report_rejects_non_security_code_in_security_warnings() { + let document = document_with_mutated_warning("non-security-code-in-security-warning", |doc| { + doc["payload"]["security_warnings"][0]["code"] = + serde_json::json!("low_confidence_reading_order"); + doc["payload"]["security_warnings"][0]["message"] = + serde_json::json!("parser warning code in security warnings"); + doc["payload"]["security_warnings"][0] + .as_object_mut() + .unwrap() + .remove("span_ref"); + }); + + let output = run_ethos(&["security", "report", document.to_str().unwrap()]); + + assert_eq!(output.status.code(), Some(2)); + assert_eq!(output.stdout, b""); + assert!(String::from_utf8_lossy(&output.stderr).contains( + "security report warning w0001 (low_confidence_reading_order) is not a security warning code" + )); +} + +#[test] +fn security_report_rejects_text_warning_without_span_ref() { + let document = document_with_mutated_warning("text-warning-without-span-ref", |doc| { + doc["payload"]["security_warnings"][0] + .as_object_mut() + .unwrap() + .remove("span_ref"); + }); + + let output = run_ethos(&["security", "report", document.to_str().unwrap()]); + + assert_eq!(output.status.code(), Some(2)); + assert_eq!(output.stdout, b""); + assert!(String::from_utf8_lossy(&output.stderr) + .contains("security report warning w0001 (hidden_text_detected) requires span_ref")); +} + +#[test] +fn security_report_rejects_unknown_span_ref() { + let document = document_with_mutated_warning("security-warning-unknown-span-ref", |doc| { + doc["payload"]["security_warnings"][0]["span_ref"] = serde_json::json!("s999999"); + }); + + let output = run_ethos(&["security", "report", document.to_str().unwrap()]); + + assert_eq!(output.status.code(), Some(2)); + assert_eq!(output.stdout, b""); + assert!(String::from_utf8_lossy(&output.stderr) + .contains("security report warning w0001 references unknown span_ref s999999")); +} + +#[test] +fn security_report_rejects_span_not_owned_by_element_ref() { + let document = document_with_mutated_warning("security-warning-span-element-mismatch", |doc| { + doc["payload"]["security_warnings"][0]["element_ref"] = serde_json::json!("e000001"); + }); + + let output = run_ethos(&["security", "report", document.to_str().unwrap()]); + + assert_eq!(output.status.code(), Some(2)); + assert_eq!(output.stdout, b""); + assert!(String::from_utf8_lossy(&output.stderr).contains( + "security report warning w0001 span_ref s000003 is not owned by element_ref e000001" + )); +} From ea0cfad91c71fd80af13da62755ec0a3acdaf409 Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 17 Jun 2026 16:19:53 +0530 Subject: [PATCH 08/51] Validate security report page and bbox grounding Signed-off-by: docushell-admin --- crates/ethos-cli/src/cmd/security.rs | 42 ++++++++ crates/ethos-cli/tests/security_report.rs | 123 ++++++++++++++++++++++ 2 files changed, 165 insertions(+) diff --git a/crates/ethos-cli/src/cmd/security.rs b/crates/ethos-cli/src/cmd/security.rs index 1baf45e..29301ef 100644 --- a/crates/ethos-cli/src/cmd/security.rs +++ b/crates/ethos-cli/src/cmd/security.rs @@ -127,6 +127,10 @@ fn text_backed_warning_code(code: WarningCode) -> bool { ) } +fn page_backed_warning_code(code: WarningCode) -> bool { + text_backed_warning_code(code) || matches!(code, WarningCode::ImageOnlyPage) +} + fn excludes_from_default_chunks(code: WarningCode) -> bool { text_backed_warning_code(code) } @@ -204,6 +208,15 @@ fn finding_record( .spans .get(span_ref) .expect("text warning span_ref validated"); + let page_ref = warning + .page + .as_deref() + .expect("text warning page validated"); + let page = refs + .pages + .get(page_ref) + .expect("text warning page validated"); + validate_span_bbox(warning, span_ref, span, page)?; finding.insert("bbox".to_string(), serde_json::json!(span.bbox.to_array())); finding.insert( "text_preview".to_string(), @@ -218,6 +231,13 @@ fn finding_record( } fn validate_warning_refs(warning: &Warning, refs: &SecurityReportRefs<'_>) -> Result<(), Failure> { + if page_backed_warning_code(warning.code) && warning.page.is_none() { + return Err(Failure::Usage(format!( + "security report warning {} ({}) requires page", + warning.id, + warning.code.as_str() + ))); + } if let Some(page) = warning.page.as_deref() { if !refs.pages.contains_key(page) { return Err(Failure::Usage(format!( @@ -280,6 +300,28 @@ fn validate_warning_refs(warning: &Warning, refs: &SecurityReportRefs<'_>) -> Re Ok(()) } +fn validate_span_bbox( + warning: &Warning, + span_ref: &str, + span: &Span, + page: &Page, +) -> Result<(), Failure> { + let [x0, y0, x1, y1] = span.bbox.to_array(); + if x0 >= x1 || y0 >= y1 { + return Err(Failure::Usage(format!( + "security report warning {} span_ref {} bbox has zero area", + warning.id, span_ref + ))); + } + if x0 < 0 || y0 < 0 || x1 > page.width || y1 > page.height { + return Err(Failure::Usage(format!( + "security report warning {} span_ref {} bbox exceeds page {} bounds", + warning.id, span_ref, page.id + ))); + } + Ok(()) +} + fn deterministic_preview(text: &str) -> String { let mut chars = text.chars(); let preview: String = chars.by_ref().take(120).collect(); diff --git a/crates/ethos-cli/tests/security_report.rs b/crates/ethos-cli/tests/security_report.rs index 022edb7..38fd515 100644 --- a/crates/ethos-cli/tests/security_report.rs +++ b/crates/ethos-cli/tests/security_report.rs @@ -266,6 +266,76 @@ fn security_report_rejects_text_warning_without_span_ref() { .contains("security report warning w0001 (hidden_text_detected) requires span_ref")); } +#[test] +fn security_report_rejects_text_warning_without_page() { + let document = document_with_mutated_warning("text-warning-without-page", |doc| { + doc["payload"]["security_warnings"][0] + .as_object_mut() + .unwrap() + .remove("page"); + }); + + let output = run_ethos(&["security", "report", document.to_str().unwrap()]); + + assert_eq!(output.status.code(), Some(2)); + assert_eq!(output.stdout, b""); + assert!(String::from_utf8_lossy(&output.stderr) + .contains("security report warning w0001 (hidden_text_detected) requires page")); +} + +#[test] +fn security_report_rejects_image_only_warning_without_page() { + let document = document_with_mutated_warning("image-only-warning-without-page", |doc| { + doc["payload"]["security_warnings"][0]["code"] = serde_json::json!("image_only_page"); + doc["payload"]["security_warnings"][0]["message"] = serde_json::json!("image-only page"); + doc["payload"]["security_warnings"][0] + .as_object_mut() + .unwrap() + .remove("span_ref"); + doc["payload"]["security_warnings"][0] + .as_object_mut() + .unwrap() + .remove("page"); + }); + + let output = run_ethos(&["security", "report", document.to_str().unwrap()]); + + assert_eq!(output.status.code(), Some(2)); + assert_eq!(output.stdout, b""); + assert!(String::from_utf8_lossy(&output.stderr) + .contains("security report warning w0001 (image_only_page) requires page")); +} + +#[test] +fn security_report_derives_image_only_page_warning() { + let document = document_with_mutated_warning("image-only-warning", |doc| { + doc["payload"]["security_warnings"][0]["code"] = serde_json::json!("image_only_page"); + doc["payload"]["security_warnings"][0]["message"] = serde_json::json!("image-only page"); + doc["payload"]["security_warnings"][0] + .as_object_mut() + .unwrap() + .remove("span_ref"); + }); + + let output = run_ethos(&["security", "report", document.to_str().unwrap()]); + + assert!( + output.status.success(), + "ethos security report failed\nstatus: {:?}\nstderr:\n{}", + output.status.code(), + String::from_utf8_lossy(&output.stderr) + ); + assert_eq!(output.stderr, b""); + let report: Value = serde_json::from_slice(&output.stdout).expect("report JSON parses"); + assert_eq!(report["summary"]["image_only_page"], 1); + assert_eq!(report["findings"].as_array().unwrap().len(), 1); + assert_eq!(report["findings"][0]["code"], "image_only_page"); + assert_eq!(report["findings"][0]["page"], "p0001"); + assert!(report["findings"][0].get("bbox").is_none()); + assert!(report["findings"][0].get("span_ref").is_none()); + assert_eq!(report["findings"][0]["excluded_from_default_chunks"], false); +} + #[test] fn security_report_rejects_unknown_span_ref() { let document = document_with_mutated_warning("security-warning-unknown-span-ref", |doc| { @@ -294,3 +364,56 @@ fn security_report_rejects_span_not_owned_by_element_ref() { "security report warning w0001 span_ref s000003 is not owned by element_ref e000001" )); } + +#[test] +fn security_report_rejects_invalid_text_warning_span_bbox() { + for (name, bbox, expected) in [ + ( + "zero-width", + serde_json::json!([100, 79100, 100, 79200]), + "security report warning w0001 span_ref s000003 bbox has zero area", + ), + ( + "zero-height", + serde_json::json!([100, 79100, 6000, 79100]), + "security report warning w0001 span_ref s000003 bbox has zero area", + ), + ( + "negative-x", + serde_json::json!([-1, 79100, 6000, 79200]), + "security report warning w0001 span_ref s000003 bbox exceeds page p0001 bounds", + ), + ( + "negative-y", + serde_json::json!([100, -1, 6000, 79200]), + "security report warning w0001 span_ref s000003 bbox exceeds page p0001 bounds", + ), + ( + "x-exceeds-page", + serde_json::json!([100, 79100, 61201, 79200]), + "security report warning w0001 span_ref s000003 bbox exceeds page p0001 bounds", + ), + ( + "y-exceeds-page", + serde_json::json!([100, 79100, 6000, 79201]), + "security report warning w0001 span_ref s000003 bbox exceeds page p0001 bounds", + ), + ] { + let document = document_with_mutated_warning( + &format!("security-warning-invalid-span-bbox-{name}"), + |doc| { + doc["payload"]["spans"][2]["bbox"] = bbox; + }, + ); + + let output = run_ethos(&["security", "report", document.to_str().unwrap()]); + + assert_eq!(output.status.code(), Some(2), "{name}"); + assert_eq!(output.stdout, b"", "{name}"); + assert!( + String::from_utf8_lossy(&output.stderr).contains(expected), + "{name}: {}", + String::from_utf8_lossy(&output.stderr) + ); + } +} From c2fce0de56652100e169f723d935744c8d28a04e Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 17 Jun 2026 17:38:17 +0530 Subject: [PATCH 09/51] Fail closed on security report warning lane drift Signed-off-by: docushell-admin --- crates/ethos-cli/src/cmd/security.rs | 15 ++++++++++ crates/ethos-cli/tests/security_report.rs | 35 +++++++++++++++-------- 2 files changed, 38 insertions(+), 12 deletions(-) diff --git a/crates/ethos-cli/src/cmd/security.rs b/crates/ethos-cli/src/cmd/security.rs index 29301ef..534d63a 100644 --- a/crates/ethos-cli/src/cmd/security.rs +++ b/crates/ethos-cli/src/cmd/security.rs @@ -31,6 +31,15 @@ pub(crate) fn security_report(args: SecurityReportArgs) -> Result<(), Failure> { fn security_report_output_bytes(doc: &Document) -> Result, Failure> { let refs = SecurityReportRefs::new(doc); + for warning in &doc.payload.parser_warnings { + if warning.code.is_security() { + return Err(Failure::Usage(format!( + "security report parser warning {} ({}) must be in security_warnings", + warning.id, + warning.code.as_str() + ))); + } + } let mut warnings = Vec::with_capacity(doc.payload.security_warnings.len()); for warning in &doc.payload.security_warnings { if !warning.code.is_security() { @@ -231,6 +240,12 @@ fn finding_record( } fn validate_warning_refs(warning: &Warning, refs: &SecurityReportRefs<'_>) -> Result<(), Failure> { + if let Some(region_ref) = warning.region_ref.as_deref() { + return Err(Failure::Usage(format!( + "security report warning {} region_ref {} is unsupported until security report schema supports region_ref", + warning.id, region_ref + ))); + } if page_backed_warning_code(warning.code) && warning.page.is_none() { return Err(Failure::Usage(format!( "security report warning {} ({}) requires page", diff --git a/crates/ethos-cli/tests/security_report.rs b/crates/ethos-cli/tests/security_report.rs index 38fd515..1e3086d 100644 --- a/crates/ethos-cli/tests/security_report.rs +++ b/crates/ethos-cli/tests/security_report.rs @@ -202,11 +202,11 @@ fn security_report_rejects_inventory_backed_warning_without_inventory_source() { } #[test] -fn security_report_ignores_reportable_parser_warnings() { - let document = document_with_mutated_warning("reportable-parser-warning", |doc| { +fn security_report_rejects_security_code_in_parser_warnings() { + let document = document_with_mutated_warning("security-code-in-parser-warning", |doc| { doc["payload"]["parser_warnings"][0]["code"] = serde_json::json!("hidden_text_detected"); doc["payload"]["parser_warnings"][0]["message"] = - serde_json::json!("parser warning must stay outside security report"); + serde_json::json!("security warning code in parser warnings"); doc["payload"]["parser_warnings"][0]["span_ref"] = serde_json::json!("s000002"); doc["payload"]["parser_warnings"][0] .as_object_mut() @@ -216,15 +216,11 @@ fn security_report_ignores_reportable_parser_warnings() { let output = run_ethos(&["security", "report", document.to_str().unwrap()]); - assert!( - output.status.success(), - "ethos security report failed\nstatus: {:?}\nstderr:\n{}", - output.status.code(), - String::from_utf8_lossy(&output.stderr) - ); - let report: Value = serde_json::from_slice(&output.stdout).expect("report JSON parses"); - assert_eq!(report["summary"]["hidden_text_detected"], 1); - assert_eq!(report["findings"].as_array().unwrap().len(), 1); + assert_eq!(output.status.code(), Some(2)); + assert_eq!(output.stdout, b""); + assert!(String::from_utf8_lossy(&output.stderr).contains( + "security report parser warning w0002 (hidden_text_detected) must be in security_warnings" + )); } #[test] @@ -266,6 +262,21 @@ fn security_report_rejects_text_warning_without_span_ref() { .contains("security report warning w0001 (hidden_text_detected) requires span_ref")); } +#[test] +fn security_report_rejects_region_ref_until_report_schema_support() { + let document = document_with_mutated_warning("security-warning-region-ref", |doc| { + doc["payload"]["security_warnings"][0]["region_ref"] = serde_json::json!("r0001"); + }); + + let output = run_ethos(&["security", "report", document.to_str().unwrap()]); + + assert_eq!(output.status.code(), Some(2)); + assert_eq!(output.stdout, b""); + assert!(String::from_utf8_lossy(&output.stderr).contains( + "security report warning w0001 region_ref r0001 is unsupported until security report schema supports region_ref" + )); +} + #[test] fn security_report_rejects_text_warning_without_page() { let document = document_with_mutated_warning("text-warning-without-page", |doc| { From 295df8494f2f87d28b029323acaa18f2d8b59315 Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 17 Jun 2026 19:12:16 +0530 Subject: [PATCH 10/51] Align security report validator warning lanes Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 70 ++++++++++++++++++++-- schemas/test_security_report_validation.py | 47 ++++++++++++++- 2 files changed, 111 insertions(+), 6 deletions(-) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index af98de8..45fa179 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -20,7 +20,7 @@ from __future__ import annotations -REPORTABLE_WARNING_CODES = { +SECURITY_WARNING_CODES = { "hidden_text_detected", "off_page_text_detected", "low_contrast_text_detected", @@ -30,6 +30,16 @@ "image_only_page", } +REPORTABLE_WARNING_CODES = SECURITY_WARNING_CODES + +INVENTORY_BACKED_FINDING_CODES = { + "annotations_present", + "external_links_present", + "unsupported_annotation", +} + +WARNING_DERIVED_FINDING_CODES = SECURITY_WARNING_CODES - INVENTORY_BACKED_FINDING_CODES + DEFAULT_CHUNK_EXCLUDED_CODES = { "hidden_text_detected", "off_page_text_detected", @@ -52,12 +62,14 @@ def diagnose_security_report_example( ): diagnostics = [] payload = document.get("payload") if isinstance(document, dict) else {} - warnings = [] + security_warnings = [] + parser_warnings = [] if isinstance(payload, dict): - warnings.extend(payload.get("security_warnings", [])) - warnings.extend(payload.get("parser_warnings", [])) + security_warnings = warning_items(payload.get("security_warnings", [])) + parser_warnings = warning_items(payload.get("parser_warnings", [])) refs = document_reference_index(payload) diagnose_report_identity(document, report, ctx, diagnostics) + diagnose_warning_lanes(security_warnings, parser_warnings, ctx, diagnostics) findings = report.get("findings") if isinstance(report, dict) else [] if not isinstance(findings, list): @@ -68,7 +80,7 @@ def diagnose_security_report_example( warning_derived_findings = [ projected_warning_finding(warning) - for warning in warnings + for warning in security_warnings if isinstance(warning, dict) and warning.get("code") in REPORTABLE_WARNING_CODES ] actual_projected_findings = [ @@ -88,6 +100,19 @@ def diagnose_security_report_example( f"{ctx}: missing warning-derived finding for {expected['code']}" ) + for index, finding in enumerate(findings): + if not isinstance(finding, dict): + continue + code = finding.get("code") + if code not in WARNING_DERIVED_FINDING_CODES: + continue + projected = project_report_finding(finding) + if projected not in warning_derived_findings: + diagnostics.append( + f"{ctx}: {finding_ctx(finding, index)} has no matching " + f"security_warnings entry for {code}" + ) + for code in sorted({finding["code"] for finding in warning_derived_findings}): expected_count = sum( 1 for finding in warning_derived_findings if finding["code"] == code @@ -148,6 +173,34 @@ def diagnose_security_report_example( return diagnostics +def warning_items(value): + if isinstance(value, list): + return value + return [] + + +def diagnose_warning_lanes(security_warnings, parser_warnings, ctx, diagnostics): + for warning in parser_warnings: + if not isinstance(warning, dict): + continue + code = warning.get("code") + if code in SECURITY_WARNING_CODES: + diagnostics.append( + f"{ctx}: parser warning {warning_id(warning)} ({code}) " + "must be in security_warnings" + ) + + for warning in security_warnings: + if not isinstance(warning, dict): + continue + code = warning.get("code") + if isinstance(code, str) and code not in SECURITY_WARNING_CODES: + diagnostics.append( + f"{ctx}: security warning {warning_id(warning)} ({code}) " + "is not a security warning code" + ) + + def projected_warning_finding(warning): projected = { "code": warning.get("code"), @@ -422,3 +475,10 @@ def finding_ctx(finding, index): if isinstance(finding_id, str): return f"finding {finding_id}" return f"findings[{index}]" + + +def warning_id(warning): + identifier = warning.get("id") + if isinstance(identifier, str): + return identifier + return "" diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index 5de2a5f..19b9235 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -201,6 +201,27 @@ def test_stale_summary_without_matching_finding_fails_closed(self) -> None: diagnostics, ) + def test_unexpected_warning_derived_report_finding_fails_closed(self) -> None: + report = copy.deepcopy(self.report) + report["findings"].append( + { + "id": "f0004", + "code": "image_only_page", + "message": "image-only page", + "page": "p0001", + "excluded_from_default_chunks": False, + } + ) + report["summary"]["image_only_page"] = 1 + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + "security-report.example.json: finding f0004 has no matching " + "security_warnings entry for image_only_page", + diagnostics, + ) + def test_warning_refs_must_match_report_finding_projection(self) -> None: report = copy.deepcopy(self.report) report["findings"][0]["span_ref"] = "s999999" @@ -580,7 +601,7 @@ def test_inventories_must_be_deterministic_object(self) -> None: diagnostics, ) - def test_reportable_parser_warning_codes_are_included_when_present(self) -> None: + def test_security_codes_in_parser_warnings_fail_closed(self) -> None: document = copy.deepcopy(self.document) document["payload"]["parser_warnings"].append( { @@ -594,10 +615,34 @@ def test_reportable_parser_warning_codes_are_included_when_present(self) -> None diagnostics = diagnose_security_report_example(document, self.report) self.assertIn( + "security-report.example.json: parser warning w0099 (image_only_page) " + "must be in security_warnings", + diagnostics, + ) + self.assertNotIn( "security-report.example.json: missing warning-derived finding for image_only_page", diagnostics, ) + def test_parser_codes_in_security_warnings_fail_closed(self) -> None: + document = copy.deepcopy(self.document) + document["payload"]["security_warnings"].append( + { + "id": "w0099", + "code": "low_confidence_reading_order", + "message": "parser warning placed in security lane", + "page": "p0001", + } + ) + + diagnostics = diagnose_security_report_example(document, self.report) + + self.assertIn( + "security-report.example.json: security warning w0099 " + "(low_confidence_reading_order) is not a security warning code", + diagnostics, + ) + if __name__ == "__main__": unittest.main() From 8b0b8cb4718dd04b63610af1d74aa4a3f729e71d Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 17 Jun 2026 19:19:44 +0530 Subject: [PATCH 11/51] Reject stale security report summary drift Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 8 +++++++ schemas/test_security_report_validation.py | 25 ++++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index 45fa179..7a4ed39 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -123,8 +123,16 @@ def diagnose_security_report_example( "for warning-derived findings" ) + for code in sorted(summary.keys()): + if code not in SECURITY_WARNING_CODES: + diagnostics.append(f"{ctx}: summary.{code} is not a security report code") + for code in sorted(set(summary.keys()) | set(finding_counts.keys())): expected_count = finding_counts.get(code, 0) + if code in summary and summary.get(code) == 0 and expected_count == 0: + diagnostics.append( + f"{ctx}: summary.{code} must be omitted when no report findings use that code" + ) if summary.get(code, 0) != expected_count: diagnostics.append( f"{ctx}: summary.{code} must be {expected_count} for report findings" diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index 19b9235..c2cc5b2 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -169,6 +169,31 @@ def test_summary_must_match_all_report_finding_counts(self) -> None: diagnostics, ) + def test_zero_count_summary_keys_must_be_omitted(self) -> None: + report = copy.deepcopy(self.report) + report["summary"]["image_only_page"] = 0 + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + "security-report.example.json: summary.image_only_page must be omitted " + "when no report findings use that code", + diagnostics, + ) + + def test_unknown_summary_keys_fail_closed(self) -> None: + for value in (0, 1): + with self.subTest(value=value): + report = copy.deepcopy(self.report) + report["summary"]["unknown_code"] = value + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + "security-report.example.json: summary.unknown_code is not a security report code", + diagnostics, + ) + def test_document_security_warnings_must_have_matching_findings(self) -> None: report = copy.deepcopy(self.report) report["findings"] = [ From 11ef0ed0837ec0ef33e395430c0bfa6002e176a3 Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 17 Jun 2026 19:23:57 +0530 Subject: [PATCH 12/51] Validate unsupported annotation report parity Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 14 +++++++++ schemas/test_security_report_validation.py | 33 ++++++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index 7a4ed39..a1f7c1c 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -154,6 +154,11 @@ def diagnose_security_report_example( } annotations = inventory_lists["annotations"] links = inventory_lists["links"] + unsupported_annotations = [ + annotation + for annotation in annotations + if isinstance(annotation, dict) and annotation.get("supported") is False + ] external_links = [ link for link in links if isinstance(link, dict) and link.get("external") is True ] @@ -167,6 +172,15 @@ def diagnose_security_report_example( f"{ctx}: annotations_present finding requires inventories.annotations entry" ) + if unsupported_annotations and finding_counts.get("unsupported_annotation", 0) == 0: + diagnostics.append( + f"{ctx}: inventories.annotations supported=false requires unsupported_annotation finding" + ) + if finding_counts.get("unsupported_annotation", 0) > 0 and not unsupported_annotations: + diagnostics.append( + f"{ctx}: unsupported_annotation finding requires inventories.annotations supported=false entry" + ) + if external_links and finding_counts.get("external_links_present", 0) == 0: diagnostics.append( f"{ctx}: inventories.links external=true requires external_links_present finding" diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index c2cc5b2..aca95e2 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -505,6 +505,39 @@ def test_annotations_finding_requires_inventory_entry(self) -> None: diagnostics, ) + def test_unsupported_annotation_inventory_requires_matching_finding(self) -> None: + report = copy.deepcopy(self.report) + report["inventories"]["annotations"][0]["supported"] = False + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + "security-report.example.json: inventories.annotations supported=false " + "requires unsupported_annotation finding", + diagnostics, + ) + + def test_unsupported_annotation_finding_requires_inventory_entry(self) -> None: + report = copy.deepcopy(self.report) + report["findings"].append( + { + "id": "f0004", + "code": "unsupported_annotation", + "message": "unsupported annotation", + "page": "p0001", + "excluded_from_default_chunks": False, + } + ) + report["summary"]["unsupported_annotation"] = 1 + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + "security-report.example.json: unsupported_annotation finding requires " + "inventories.annotations supported=false entry", + diagnostics, + ) + def test_inventory_page_refs_must_exist_in_document(self) -> None: report = copy.deepcopy(self.report) report["inventories"]["annotations"][0]["page"] = "p9999" From 75369192788243b0be4b6c90a605c9535dc9ac26 Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 17 Jun 2026 19:31:29 +0530 Subject: [PATCH 13/51] Require security report inventory lanes Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 5 ++++- schemas/test_security_report_validation.py | 13 +++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index a1f7c1c..cd7d990 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -324,7 +324,10 @@ def project_report_finding(finding): def inventory_items(inventories, name, ctx, diagnostics): - items = inventories.get(name, []) + if name not in inventories: + diagnostics.append(f"{ctx}: inventories.{name} is required") + return [] + items = inventories.get(name) if not isinstance(items, list): diagnostics.append(f"{ctx}: inventories.{name} must be an array") return [] diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index aca95e2..378b04c 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -637,6 +637,19 @@ def test_inventory_shape_must_be_deterministic_arrays(self) -> None: diagnostics, ) + def test_required_inventory_lanes_must_be_present(self) -> None: + for name in ("annotations", "actions", "attachments", "scripts", "links"): + with self.subTest(name=name): + report = copy.deepcopy(self.report) + report["inventories"].pop(name) + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + f"security-report.example.json: inventories.{name} is required", + diagnostics, + ) + def test_action_inventory_shape_is_checked_without_action_semantics(self) -> None: report = copy.deepcopy(self.report) report["inventories"]["actions"] = {"kind": "uri"} From 2216b3cdc03cd60304f7cd7771932157759f14ab Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 17 Jun 2026 19:37:05 +0530 Subject: [PATCH 14/51] Validate unsupported annotation finding messages Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 1 + schemas/test_security_report_validation.py | 26 +++++++++++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index cd7d990..121f7a1 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -52,6 +52,7 @@ "hidden_text_detected": "hidden text detected: excluded from default chunks", "annotations_present": "annotations present on page", "external_links_present": "external links present on page", + "unsupported_annotation": "unsupported annotation ignored", } diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index 378b04c..af815ba 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -145,6 +145,30 @@ def test_external_link_finding_message_must_match_fixed_template(self) -> None: diagnostics, ) + def test_unsupported_annotation_finding_message_must_match_fixed_template( + self, + ) -> None: + report = copy.deepcopy(self.report) + report["findings"].append( + { + "id": "f0004", + "code": "unsupported_annotation", + "message": "unsupported annotation changed", + "page": "p0001", + "excluded_from_default_chunks": False, + } + ) + report["summary"]["unsupported_annotation"] = 1 + report["inventories"]["annotations"][0]["supported"] = False + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + "security-report.example.json: finding f0004 message must match " + "fixed template for unsupported_annotation", + diagnostics, + ) + def test_warning_derived_summary_must_match_document_warning_count(self) -> None: report = copy.deepcopy(self.report) report["summary"]["hidden_text_detected"] = 2 @@ -523,7 +547,7 @@ def test_unsupported_annotation_finding_requires_inventory_entry(self) -> None: { "id": "f0004", "code": "unsupported_annotation", - "message": "unsupported annotation", + "message": "unsupported annotation ignored", "page": "p0001", "excluded_from_default_chunks": False, } From 9ad4c12c4a2d65d77df618596f0baa9b91dde52a Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 17 Jun 2026 19:47:05 +0530 Subject: [PATCH 15/51] Validate image-only page finding messages Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 1 + schemas/test_security_report_validation.py | 30 ++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index 121f7a1..168cfef 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -53,6 +53,7 @@ "annotations_present": "annotations present on page", "external_links_present": "external links present on page", "unsupported_annotation": "unsupported annotation ignored", + "image_only_page": "image-only page", } diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index af815ba..b5dddb5 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -169,6 +169,36 @@ def test_unsupported_annotation_finding_message_must_match_fixed_template( diagnostics, ) + def test_image_only_page_finding_message_must_match_fixed_template(self) -> None: + document = copy.deepcopy(self.document) + document["payload"]["security_warnings"].append( + { + "id": "w0099", + "code": "image_only_page", + "message": "image-only page changed", + "page": "p0001", + } + ) + report = copy.deepcopy(self.report) + report["findings"].append( + { + "id": "f0004", + "code": "image_only_page", + "message": "image-only page changed", + "page": "p0001", + "excluded_from_default_chunks": False, + } + ) + report["summary"]["image_only_page"] = 1 + + diagnostics = diagnose_security_report_example(document, report) + + self.assertIn( + "security-report.example.json: finding f0004 message must match " + "fixed template for image_only_page", + diagnostics, + ) + def test_warning_derived_summary_must_match_document_warning_count(self) -> None: report = copy.deepcopy(self.report) report["summary"]["hidden_text_detected"] = 2 From 7b6e1f6d3ad7b892123cd8b0e567c5763367084f Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 17 Jun 2026 20:07:25 +0530 Subject: [PATCH 16/51] Validate text exclusion finding messages Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 2 ++ schemas/test_security_report_validation.py | 39 ++++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index 168cfef..c393ca7 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -50,6 +50,8 @@ FINDING_MESSAGE_TEMPLATES = { "hidden_text_detected": "hidden text detected: excluded from default chunks", + "off_page_text_detected": "off-page text detected: excluded from default chunks", + "low_contrast_text_detected": "low-contrast text detected: excluded from default chunks", "annotations_present": "annotations present on page", "external_links_present": "external links present on page", "unsupported_annotation": "unsupported annotation ignored", diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index b5dddb5..4f8f84f 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -121,6 +121,45 @@ def test_hidden_text_finding_message_must_match_fixed_template(self) -> None: diagnostics, ) + def test_text_exclusion_finding_messages_must_match_fixed_templates(self) -> None: + for code, changed_message in ( + ("off_page_text_detected", "off-page text changed"), + ("low_contrast_text_detected", "low-contrast text changed"), + ): + with self.subTest(code=code): + document = copy.deepcopy(self.document) + document["payload"]["security_warnings"].append( + { + "id": "w0099", + "code": code, + "message": changed_message, + "page": "p0001", + "span_ref": "s000003", + } + ) + report = copy.deepcopy(self.report) + report["findings"].append( + { + "id": "f0004", + "code": code, + "message": changed_message, + "page": "p0001", + "span_ref": "s000003", + "bbox": [100, 79100, 6000, 79200], + "text_preview": "internal-draft-do-not-cite", + "excluded_from_default_chunks": True, + } + ) + report["summary"][code] = 1 + + diagnostics = diagnose_security_report_example(document, report) + + self.assertIn( + "security-report.example.json: finding f0004 message must match " + f"fixed template for {code}", + diagnostics, + ) + def test_annotation_finding_message_must_match_fixed_template(self) -> None: report = copy.deepcopy(self.report) report["findings"][1]["message"] = "annotations changed" From 6e1d03a53c0abcd89c069a4dba4ebf76c41624a9 Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 17 Jun 2026 20:13:42 +0530 Subject: [PATCH 17/51] Validate security warning source messages Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 17 +++++++++++ schemas/test_security_report_validation.py | 35 +++++++++++++++++++++- 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index c393ca7..a0643b3 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -74,6 +74,7 @@ def diagnose_security_report_example( refs = document_reference_index(payload) diagnose_report_identity(document, report, ctx, diagnostics) diagnose_warning_lanes(security_warnings, parser_warnings, ctx, diagnostics) + diagnose_security_warning_messages(security_warnings, ctx, diagnostics) findings = report.get("findings") if isinstance(report, dict) else [] if not isinstance(findings, list): @@ -227,6 +228,22 @@ def diagnose_warning_lanes(security_warnings, parser_warnings, ctx, diagnostics) ) +def diagnose_security_warning_messages(security_warnings, ctx, diagnostics): + for warning in security_warnings: + if not isinstance(warning, dict): + continue + code = warning.get("code") + expected_message = FINDING_MESSAGE_TEMPLATES.get(code) + if expected_message is None: + continue + actual_message = warning.get("message") + if actual_message != expected_message: + diagnostics.append( + f"{ctx}: security warning {warning_id(warning)} " + f"message must match fixed template for {code}" + ) + + def projected_warning_finding(warning): projected = { "code": warning.get("code"), diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index 4f8f84f..67bd2fd 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -22,7 +22,11 @@ import unittest from pathlib import Path -from security_report_validation import diagnose_security_report_example +from security_report_validation import ( + FINDING_MESSAGE_TEMPLATES, + SECURITY_WARNING_CODES, + diagnose_security_report_example, +) ROOT = Path(__file__).resolve().parent @@ -37,6 +41,9 @@ def setUp(self) -> None: def test_current_examples_are_coherent(self) -> None: self.assertEqual(diagnose_security_report_example(self.document, self.report), []) + def test_all_security_warning_codes_have_fixed_message_templates(self) -> None: + self.assertEqual(set(FINDING_MESSAGE_TEMPLATES), SECURITY_WARNING_CODES) + def test_schema_version_must_match_document(self) -> None: report = copy.deepcopy(self.report) report["schema_version"] = "1.0.1" @@ -121,6 +128,27 @@ def test_hidden_text_finding_message_must_match_fixed_template(self) -> None: diagnostics, ) + def test_security_warning_message_must_match_fixed_template(self) -> None: + for code in sorted(SECURITY_WARNING_CODES): + with self.subTest(code=code): + document = copy.deepcopy(self.document) + document["payload"]["security_warnings"].append( + { + "id": "w0099", + "code": code, + "message": "security warning changed", + "page": "p0001", + } + ) + + diagnostics = diagnose_security_report_example(document, self.report) + + self.assertIn( + "security-report.example.json: security warning w0099 message " + f"must match fixed template for {code}", + diagnostics, + ) + def test_text_exclusion_finding_messages_must_match_fixed_templates(self) -> None: for code, changed_message in ( ("off_page_text_detected", "off-page text changed"), @@ -787,6 +815,11 @@ def test_security_codes_in_parser_warnings_fail_closed(self) -> None: "security-report.example.json: missing warning-derived finding for image_only_page", diagnostics, ) + self.assertNotIn( + "security-report.example.json: security warning w0099 message must " + "match fixed template for image_only_page", + diagnostics, + ) def test_parser_codes_in_security_warnings_fail_closed(self) -> None: document = copy.deepcopy(self.document) From 8a4a9b98157a42f2cf84d6aa387ab20d2b2d1284 Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 17 Jun 2026 20:19:10 +0530 Subject: [PATCH 18/51] Validate security report finding codes Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 16 ++++++++++ schemas/test_security_report_validation.py | 36 ++++++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index a0643b3..b690643 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -144,6 +144,7 @@ def diagnose_security_report_example( ) diagnose_finding_ids(findings, ctx, diagnostics) + diagnose_finding_codes(findings, ctx, diagnostics) diagnose_finding_messages(findings, ctx, diagnostics) diagnose_finding_exclusion_flags(findings, ctx, diagnostics) diagnose_findings_references(findings, refs, ctx, diagnostics) @@ -295,6 +296,21 @@ def diagnose_finding_ids(findings, ctx, diagnostics): seen.add(finding_id) +def diagnose_finding_codes(findings, ctx, diagnostics): + for index, finding in enumerate(findings): + if not isinstance(finding, dict): + continue + code = finding.get("code") + if not isinstance(code, str): + diagnostics.append(f"{ctx}: {finding_ctx(finding, index)} code is required") + continue + if code not in SECURITY_WARNING_CODES: + diagnostics.append( + f"{ctx}: {finding_ctx(finding, index)} code {code} " + "is not a security report code" + ) + + def diagnose_finding_messages(findings, ctx, diagnostics): for index, finding in enumerate(findings): if not isinstance(finding, dict): diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index 67bd2fd..69b96eb 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -116,6 +116,42 @@ def test_finding_ids_must_be_unique(self) -> None: diagnostics, ) + def test_finding_codes_must_be_security_report_codes(self) -> None: + report = copy.deepcopy(self.report) + report["findings"].append( + { + "id": "f0004", + "code": "unknown_code", + "message": "unknown", + "page": "p0001", + "excluded_from_default_chunks": False, + } + ) + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + "security-report.example.json: finding f0004 code unknown_code " + "is not a security report code", + diagnostics, + ) + + def test_finding_codes_are_required(self) -> None: + for value in (None, 7): + with self.subTest(value=value): + report = copy.deepcopy(self.report) + if value is None: + report["findings"][0].pop("code") + else: + report["findings"][0]["code"] = value + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + "security-report.example.json: finding f0001 code is required", + diagnostics, + ) + def test_hidden_text_finding_message_must_match_fixed_template(self) -> None: report = copy.deepcopy(self.report) report["findings"][0]["message"] = "hidden text changed" From d2a31528a474700d31fa43bf1a3d1b1299b6745e Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 17 Jun 2026 20:22:38 +0530 Subject: [PATCH 19/51] Validate security report inventory fields Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 21 +++++++++++++++ schemas/test_security_report_validation.py | 31 ++++++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index b690643..a1d2832 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -58,6 +58,14 @@ "image_only_page": "image-only page", } +INVENTORY_REQUIRED_FIELDS = { + "annotations": ("page", "kind"), + "actions": ("kind",), + "attachments": ("name", "bytes"), + "scripts": ("location",), + "links": ("page", "uri", "external"), +} + def diagnose_security_report_example( document, @@ -158,6 +166,7 @@ def diagnose_security_report_example( name: inventory_items(inventories, name, ctx, diagnostics) for name in ("annotations", "actions", "attachments", "scripts", "links") } + diagnose_inventory_required_fields(inventory_lists, ctx, diagnostics) annotations = inventory_lists["annotations"] links = inventory_lists["links"] unsupported_annotations = [ @@ -371,6 +380,18 @@ def inventory_items(inventories, name, ctx, diagnostics): return items +def diagnose_inventory_required_fields(inventory_lists, ctx, diagnostics): + for name, required_fields in INVENTORY_REQUIRED_FIELDS.items(): + for index, item in enumerate(inventory_lists.get(name, [])): + if not isinstance(item, dict): + continue + for field in required_fields: + if field not in item: + diagnostics.append( + f"{ctx}: inventories.{name}[{index}].{field} is required" + ) + + def document_reference_index(payload): if not isinstance(payload, dict): return {"pages": {}, "elements": {}, "spans": {}} diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index 69b96eb..29c8ed4 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -807,6 +807,37 @@ def test_required_inventory_lanes_must_be_present(self) -> None: diagnostics, ) + def test_required_inventory_item_fields_must_be_present(self) -> None: + inventory_items = { + "annotations": {"page": "p0001", "kind": "link"}, + "actions": {"kind": "uri"}, + "attachments": {"name": "attachment.bin", "bytes": 0}, + "scripts": {"location": "document"}, + "links": {"page": "p0001", "uri": "https://example.com/q3", "external": True}, + } + required_fields = { + "annotations": ("page", "kind"), + "actions": ("kind",), + "attachments": ("name", "bytes"), + "scripts": ("location",), + "links": ("page", "uri", "external"), + } + + for name, fields in required_fields.items(): + for field in fields: + with self.subTest(name=name, field=field): + report = copy.deepcopy(self.report) + report["inventories"][name] = [copy.deepcopy(inventory_items[name])] + report["inventories"][name][0].pop(field) + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + f"security-report.example.json: inventories.{name}[0].{field} " + "is required", + diagnostics, + ) + def test_action_inventory_shape_is_checked_without_action_semantics(self) -> None: report = copy.deepcopy(self.report) report["inventories"]["actions"] = {"kind": "uri"} From 433651fd4b85c6a7db36334df60e32cfb8c80895 Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 17 Jun 2026 20:28:17 +0530 Subject: [PATCH 20/51] Validate security report envelope fields Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 89 +++++++++++++++++----- schemas/test_security_report_validation.py | 73 ++++++++++++++++++ 2 files changed, 144 insertions(+), 18 deletions(-) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index a1d2832..3c4b851 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -66,6 +66,18 @@ "links": ("page", "uri", "external"), } +REPORT_REQUIRED_FIELDS = ( + "schema_version", + "document_fingerprint", + "source_fingerprint", + "profile", + "summary", + "findings", + "inventories", +) + +PROFILE_REQUIRED_FIELDS = ("id", "sha256") + def diagnose_security_report_example( document, @@ -80,16 +92,19 @@ def diagnose_security_report_example( security_warnings = warning_items(payload.get("security_warnings", [])) parser_warnings = warning_items(payload.get("parser_warnings", [])) refs = document_reference_index(payload) + diagnose_report_required_fields(report, ctx, diagnostics) diagnose_report_identity(document, report, ctx, diagnostics) diagnose_warning_lanes(security_warnings, parser_warnings, ctx, diagnostics) diagnose_security_warning_messages(security_warnings, ctx, diagnostics) findings = report.get("findings") if isinstance(report, dict) else [] if not isinstance(findings, list): - return [f"{ctx}: findings must be an array"] + diagnostics.append(f"{ctx}: findings must be an array") + return diagnostics summary = report.get("summary") if isinstance(report, dict) else {} if not isinstance(summary, dict): - return [f"{ctx}: summary must be an object"] + diagnostics.append(f"{ctx}: summary must be an object") + return diagnostics warning_derived_findings = [ projected_warning_finding(warning) @@ -269,25 +284,63 @@ def projected_warning_finding(warning): def diagnose_report_identity(document, report, ctx, diagnostics): if not isinstance(document, dict) or not isinstance(report, dict): return - expected = { - "schema_version": document.get("schema_version"), - "document_fingerprint": document.get("fingerprint"), - "source_fingerprint": nested_get(document, "source", "fingerprint"), - "profile.id": nested_get(document, "profile", "id"), - "profile.sha256": nested_get(document, "profile", "sha256"), - } - actual = { - "schema_version": report.get("schema_version"), - "document_fingerprint": report.get("document_fingerprint"), - "source_fingerprint": report.get("source_fingerprint"), - "profile.id": nested_get(report, "profile", "id"), - "profile.sha256": nested_get(report, "profile", "sha256"), - } - for key, want in expected.items(): - if want is not None and actual.get(key) != want: + profile = report.get("profile") + profile_is_object = isinstance(profile, dict) + comparisons = ( + ( + "schema_version", + document.get("schema_version"), + "schema_version" in report, + report.get("schema_version"), + ), + ( + "document_fingerprint", + document.get("fingerprint"), + "document_fingerprint" in report, + report.get("document_fingerprint"), + ), + ( + "source_fingerprint", + nested_get(document, "source", "fingerprint"), + "source_fingerprint" in report, + report.get("source_fingerprint"), + ), + ( + "profile.id", + nested_get(document, "profile", "id"), + profile_is_object and "id" in profile, + profile.get("id") if profile_is_object else None, + ), + ( + "profile.sha256", + nested_get(document, "profile", "sha256"), + profile_is_object and "sha256" in profile, + profile.get("sha256") if profile_is_object else None, + ), + ) + for key, want, actual_present, actual in comparisons: + if want is not None and actual_present and actual != want: diagnostics.append(f"{ctx}: {key} diverges from document example") +def diagnose_report_required_fields(report, ctx, diagnostics): + if not isinstance(report, dict): + diagnostics.append(f"{ctx}: report must be an object") + return + for field in REPORT_REQUIRED_FIELDS: + if field not in report: + diagnostics.append(f"{ctx}: {field} is required") + if "profile" not in report: + return + profile = report.get("profile") + if not isinstance(profile, dict): + diagnostics.append(f"{ctx}: profile must be an object") + return + for field in PROFILE_REQUIRED_FIELDS: + if field not in profile: + diagnostics.append(f"{ctx}: profile.{field} is required") + + def diagnose_finding_ids(findings, ctx, diagnostics): seen = set() for index, finding in enumerate(findings): diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index 29c8ed4..e294623 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -93,6 +93,79 @@ def test_profile_identity_must_match_document_profile(self) -> None: diagnostics, ) + def test_report_must_be_object(self) -> None: + diagnostics = diagnose_security_report_example(self.document, []) + + self.assertIn( + "security-report.example.json: report must be an object", + diagnostics, + ) + + def test_top_level_report_fields_are_required(self) -> None: + identity_diagnostics = { + "schema_version": "security-report.example.json: schema_version diverges from document example", + "document_fingerprint": "security-report.example.json: document_fingerprint diverges from document example", + "source_fingerprint": "security-report.example.json: source_fingerprint diverges from document example", + } + for field in ( + "schema_version", + "document_fingerprint", + "source_fingerprint", + "profile", + "summary", + "findings", + "inventories", + ): + with self.subTest(field=field): + report = copy.deepcopy(self.report) + report.pop(field) + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + f"security-report.example.json: {field} is required", + diagnostics, + ) + if field in identity_diagnostics: + self.assertNotIn(identity_diagnostics[field], diagnostics) + + def test_profile_must_be_object(self) -> None: + report = copy.deepcopy(self.report) + report["profile"] = [] + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + "security-report.example.json: profile must be an object", + diagnostics, + ) + self.assertNotIn( + "security-report.example.json: profile.id diverges from document example", + diagnostics, + ) + self.assertNotIn( + "security-report.example.json: profile.sha256 diverges from document example", + diagnostics, + ) + + def test_profile_fields_are_required(self) -> None: + identity_diagnostics = { + "id": "security-report.example.json: profile.id diverges from document example", + "sha256": "security-report.example.json: profile.sha256 diverges from document example", + } + for field in ("id", "sha256"): + with self.subTest(field=field): + report = copy.deepcopy(self.report) + report["profile"].pop(field) + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + f"security-report.example.json: profile.{field} is required", + diagnostics, + ) + self.assertNotIn(identity_diagnostics[field], diagnostics) + def test_finding_ids_must_be_contiguous_in_report_order(self) -> None: report = copy.deepcopy(self.report) report["findings"][1]["id"] = "f0004" From 1efb57a44e93843a9972dcf768fa0f32252a0f7c Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 17 Jun 2026 20:33:40 +0530 Subject: [PATCH 21/51] Validate security report finding fields Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 47 ++++++++++++++++++++++ schemas/test_security_report_validation.py | 40 ++++++++++++++++++ 2 files changed, 87 insertions(+) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index 3c4b851..30abef5 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -78,6 +78,8 @@ PROFILE_REQUIRED_FIELDS = ("id", "sha256") +FINDING_REQUIRED_FIELDS = ("id", "message", "excluded_from_default_chunks") + def diagnose_security_report_example( document, @@ -124,6 +126,8 @@ def diagnose_security_report_example( for expected in warning_derived_findings: if expected not in actual_projected_findings: + if has_incomplete_projected_finding(findings, expected): + continue diagnostics.append( f"{ctx}: missing warning-derived finding for {expected['code']}" ) @@ -134,6 +138,8 @@ def diagnose_security_report_example( code = finding.get("code") if code not in WARNING_DERIVED_FINDING_CODES: continue + if projected_finding_fields_missing(finding): + continue projected = project_report_finding(finding) if projected not in warning_derived_findings: diagnostics.append( @@ -166,6 +172,7 @@ def diagnose_security_report_example( f"{ctx}: summary.{code} must be {expected_count} for report findings" ) + diagnose_finding_required_fields(findings, ctx, diagnostics) diagnose_finding_ids(findings, ctx, diagnostics) diagnose_finding_codes(findings, ctx, diagnostics) diagnose_finding_messages(findings, ctx, diagnostics) @@ -346,6 +353,8 @@ def diagnose_finding_ids(findings, ctx, diagnostics): for index, finding in enumerate(findings): if not isinstance(finding, dict): continue + if "id" not in finding: + continue finding_id = finding.get("id") expected_id = f"f{index + 1:04d}" if finding_id != expected_id: @@ -358,6 +367,16 @@ def diagnose_finding_ids(findings, ctx, diagnostics): seen.add(finding_id) +def diagnose_finding_required_fields(findings, ctx, diagnostics): + for index, finding in enumerate(findings): + if not isinstance(finding, dict): + continue + item_ctx = finding_ctx(finding, index) + for field in FINDING_REQUIRED_FIELDS: + if field not in finding: + diagnostics.append(f"{ctx}: {item_ctx}.{field} is required") + + def diagnose_finding_codes(findings, ctx, diagnostics): for index, finding in enumerate(findings): if not isinstance(finding, dict): @@ -381,6 +400,8 @@ def diagnose_finding_messages(findings, ctx, diagnostics): expected_message = FINDING_MESSAGE_TEMPLATES.get(code) if expected_message is None: continue + if "message" not in finding: + continue actual_message = finding.get("message") if actual_message != expected_message: diagnostics.append( @@ -395,6 +416,8 @@ def diagnose_finding_exclusion_flags(findings, ctx, diagnostics): code = finding.get("code") if not isinstance(code, str): continue + if "excluded_from_default_chunks" not in finding: + continue expected = code in DEFAULT_CHUNK_EXCLUDED_CODES if finding.get("excluded_from_default_chunks") != expected: diagnostics.append( @@ -422,6 +445,30 @@ def project_report_finding(finding): return projected +def has_incomplete_projected_finding(findings, expected): + for finding in findings: + if not isinstance(finding, dict): + continue + if not projected_finding_fields_missing(finding): + continue + if warning_locator_matches(finding, expected): + return True + return False + + +def projected_finding_fields_missing(finding): + return "message" not in finding or "excluded_from_default_chunks" not in finding + + +def warning_locator_matches(finding, expected): + if finding.get("code") != expected.get("code"): + return False + for key in ("page", "element_ref", "span_ref"): + if finding.get(key) != expected.get(key): + return False + return True + + def inventory_items(inventories, name, ctx, diagnostics): if name not in inventories: diagnostics.append(f"{ctx}: inventories.{name} is required") diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index e294623..a65953e 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -225,6 +225,46 @@ def test_finding_codes_are_required(self) -> None: diagnostics, ) + def test_finding_required_fields_must_be_present(self) -> None: + expected_diagnostics = { + "id": "security-report.example.json: findings[0].id is required", + "message": "security-report.example.json: finding f0001.message is required", + "excluded_from_default_chunks": ( + "security-report.example.json: " + "finding f0001.excluded_from_default_chunks is required" + ), + } + suppressed_diagnostics = { + "id": "security-report.example.json: findings[0].id must be f0001 " + "for deterministic numbering", + "message": ( + "security-report.example.json: finding f0001 message must match " + "fixed template for hidden_text_detected" + ), + "excluded_from_default_chunks": ( + "security-report.example.json: finding f0001 " + "excluded_from_default_chunks must be true for hidden_text_detected" + ), + } + suppressed_projection_diagnostics = ( + "security-report.example.json: missing warning-derived finding for hidden_text_detected", + "security-report.example.json: finding f0001 has no matching " + "security_warnings entry for hidden_text_detected", + ) + + for field in ("id", "message", "excluded_from_default_chunks"): + with self.subTest(field=field): + report = copy.deepcopy(self.report) + report["findings"][0].pop(field) + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn(expected_diagnostics[field], diagnostics) + self.assertNotIn(suppressed_diagnostics[field], diagnostics) + if field in ("message", "excluded_from_default_chunks"): + for diagnostic in suppressed_projection_diagnostics: + self.assertNotIn(diagnostic, diagnostics) + def test_hidden_text_finding_message_must_match_fixed_template(self) -> None: report = copy.deepcopy(self.report) report["findings"][0]["message"] = "hidden text changed" From 9f0a404a19814a2b51bf4811e74eb093011dad47 Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 17 Jun 2026 20:39:15 +0530 Subject: [PATCH 22/51] Validate security report unexpected fields Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 75 ++++++++++++++++++++++ schemas/test_security_report_validation.py | 55 ++++++++++++++++ 2 files changed, 130 insertions(+) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index 30abef5..d297403 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -80,6 +80,30 @@ FINDING_REQUIRED_FIELDS = ("id", "message", "excluded_from_default_chunks") +REPORT_ALLOWED_FIELDS = REPORT_REQUIRED_FIELDS + +PROFILE_ALLOWED_FIELDS = PROFILE_REQUIRED_FIELDS + +FINDING_ALLOWED_FIELDS = ( + "id", + "code", + "message", + "page", + "element_ref", + "span_ref", + "bbox", + "text_preview", + "excluded_from_default_chunks", +) + +INVENTORY_ALLOWED_FIELDS = { + "annotations": ("page", "kind", "bbox", "supported"), + "actions": ("kind", "page", "target"), + "attachments": ("name", "bytes", "sha256"), + "scripts": ("location", "page", "trigger"), + "links": ("page", "uri", "external", "bbox"), +} + def diagnose_security_report_example( document, @@ -94,6 +118,7 @@ def diagnose_security_report_example( security_warnings = warning_items(payload.get("security_warnings", [])) parser_warnings = warning_items(payload.get("parser_warnings", [])) refs = document_reference_index(payload) + diagnose_report_allowed_fields(report, ctx, diagnostics) diagnose_report_required_fields(report, ctx, diagnostics) diagnose_report_identity(document, report, ctx, diagnostics) diagnose_warning_lanes(security_warnings, parser_warnings, ctx, diagnostics) @@ -173,6 +198,7 @@ def diagnose_security_report_example( ) diagnose_finding_required_fields(findings, ctx, diagnostics) + diagnose_finding_allowed_fields(findings, ctx, diagnostics) diagnose_finding_ids(findings, ctx, diagnostics) diagnose_finding_codes(findings, ctx, diagnostics) diagnose_finding_messages(findings, ctx, diagnostics) @@ -184,6 +210,7 @@ def diagnose_security_report_example( diagnostics.append(f"{ctx}: inventories must be an object") return diagnostics + diagnose_inventory_allowed_fields(inventories, ctx, diagnostics) inventory_lists = { name: inventory_items(inventories, name, ctx, diagnostics) for name in ("annotations", "actions", "attachments", "scripts", "links") @@ -348,6 +375,17 @@ def diagnose_report_required_fields(report, ctx, diagnostics): diagnostics.append(f"{ctx}: profile.{field} is required") +def diagnose_report_allowed_fields(report, ctx, diagnostics): + if not isinstance(report, dict): + return + diagnose_allowed_fields(report, REPORT_ALLOWED_FIELDS, None, ctx, diagnostics) + profile = report.get("profile") + if isinstance(profile, dict): + diagnose_allowed_fields( + profile, PROFILE_ALLOWED_FIELDS, "profile", ctx, diagnostics + ) + + def diagnose_finding_ids(findings, ctx, diagnostics): seen = set() for index, finding in enumerate(findings): @@ -377,6 +415,15 @@ def diagnose_finding_required_fields(findings, ctx, diagnostics): diagnostics.append(f"{ctx}: {item_ctx}.{field} is required") +def diagnose_finding_allowed_fields(findings, ctx, diagnostics): + for index, finding in enumerate(findings): + if not isinstance(finding, dict): + continue + diagnose_allowed_fields( + finding, FINDING_ALLOWED_FIELDS, finding_ctx(finding, index), ctx, diagnostics + ) + + def diagnose_finding_codes(findings, ctx, diagnostics): for index, finding in enumerate(findings): if not isinstance(finding, dict): @@ -469,6 +516,34 @@ def warning_locator_matches(finding, expected): return True +def diagnose_inventory_allowed_fields(inventories, ctx, diagnostics): + diagnose_allowed_fields( + inventories, INVENTORY_ALLOWED_FIELDS.keys(), "inventories", ctx, diagnostics + ) + for name, allowed_fields in INVENTORY_ALLOWED_FIELDS.items(): + items = inventories.get(name) + if not isinstance(items, list): + continue + for index, item in enumerate(items): + if not isinstance(item, dict): + continue + diagnose_allowed_fields( + item, + allowed_fields, + f"inventories.{name}[{index}]", + ctx, + diagnostics, + ) + + +def diagnose_allowed_fields(value, allowed_fields, item_ctx, ctx, diagnostics): + allowed = set(allowed_fields) + for field in sorted(value.keys()): + if field not in allowed: + path = field if item_ctx is None else f"{item_ctx}.{field}" + diagnostics.append(f"{ctx}: {path} is not allowed") + + def inventory_items(inventories, name, ctx, diagnostics): if name not in inventories: diagnostics.append(f"{ctx}: inventories.{name} is required") diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index a65953e..5285737 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -166,6 +166,22 @@ def test_profile_fields_are_required(self) -> None: ) self.assertNotIn(identity_diagnostics[field], diagnostics) + def test_unexpected_report_fields_fail_closed(self) -> None: + report = copy.deepcopy(self.report) + report["unexpected"] = True + report["profile"]["unexpected"] = True + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + "security-report.example.json: unexpected is not allowed", + diagnostics, + ) + self.assertIn( + "security-report.example.json: profile.unexpected is not allowed", + diagnostics, + ) + def test_finding_ids_must_be_contiguous_in_report_order(self) -> None: report = copy.deepcopy(self.report) report["findings"][1]["id"] = "f0004" @@ -265,6 +281,17 @@ def test_finding_required_fields_must_be_present(self) -> None: for diagnostic in suppressed_projection_diagnostics: self.assertNotIn(diagnostic, diagnostics) + def test_unexpected_finding_fields_fail_closed(self) -> None: + report = copy.deepcopy(self.report) + report["findings"][0]["unexpected"] = True + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + "security-report.example.json: finding f0001.unexpected is not allowed", + diagnostics, + ) + def test_hidden_text_finding_message_must_match_fixed_template(self) -> None: report = copy.deepcopy(self.report) report["findings"][0]["message"] = "hidden text changed" @@ -920,6 +947,34 @@ def test_required_inventory_lanes_must_be_present(self) -> None: diagnostics, ) + def test_unexpected_inventory_fields_fail_closed(self) -> None: + inventory_items = { + "annotations": {"page": "p0001", "kind": "link"}, + "actions": {"kind": "uri"}, + "attachments": {"name": "attachment.bin", "bytes": 0}, + "scripts": {"location": "document"}, + "links": {"page": "p0001", "uri": "https://example.com/q3", "external": True}, + } + report = copy.deepcopy(self.report) + report["inventories"]["widgets"] = [] + for name, item in inventory_items.items(): + report["inventories"][name] = [copy.deepcopy(item)] + report["inventories"][name][0]["unexpected"] = True + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + "security-report.example.json: inventories.widgets is not allowed", + diagnostics, + ) + for name in inventory_items: + with self.subTest(name=name): + self.assertIn( + f"security-report.example.json: inventories.{name}[0].unexpected " + "is not allowed", + diagnostics, + ) + def test_required_inventory_item_fields_must_be_present(self) -> None: inventory_items = { "annotations": {"page": "p0001", "kind": "link"}, From cf5b6cb5cebac9f1e8b8d825f8e860aae0818d94 Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 17 Jun 2026 20:52:06 +0530 Subject: [PATCH 23/51] Reject boolean security report bbox coordinates Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 6 +++++- schemas/test_security_report_validation.py | 23 ++++++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index d297403..c242f6d 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -654,7 +654,7 @@ def check_bbox(bbox, page, refs, ctx, item_ctx, diagnostics): if ( not isinstance(bbox, list) or len(bbox) != 4 - or any(not isinstance(coord, int) for coord in bbox) + or any(not is_json_integer(coord) for coord in bbox) ): diagnostics.append(f"{ctx}: {item_ctx} bbox must be four integer coordinates") return @@ -709,6 +709,10 @@ def deterministic_preview(text): return text[:120] + "\u2026" +def is_json_integer(value): + return isinstance(value, int) and not isinstance(value, bool) + + def check_element_span_ownership(item, refs, ctx, item_ctx, span_ref, diagnostics): element_ref = item.get("element_ref") if element_ref is None: diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index 5285737..96eb6af 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -687,6 +687,17 @@ def test_finding_bbox_must_have_positive_area(self) -> None: diagnostics, ) + def test_finding_bbox_rejects_boolean_coordinates(self) -> None: + report = copy.deepcopy(self.report) + report["findings"][0]["bbox"][0] = True + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + "security-report.example.json: finding f0001 bbox must be four integer coordinates", + diagnostics, + ) + def test_finding_bbox_must_stay_inside_page_bounds(self) -> None: report = copy.deepcopy(self.report) report["findings"][0]["bbox"][2] = 61201 @@ -871,6 +882,18 @@ def test_inventory_bbox_must_have_positive_area(self) -> None: diagnostics, ) + def test_inventory_bbox_rejects_boolean_coordinates(self) -> None: + report = copy.deepcopy(self.report) + report["inventories"]["links"][0]["bbox"][0] = True + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + "security-report.example.json: inventories.links[0] bbox must be four " + "integer coordinates", + diagnostics, + ) + def test_inventory_bbox_must_stay_inside_page_bounds(self) -> None: report = copy.deepcopy(self.report) report["inventories"]["annotations"][0]["bbox"][3] = 79201 From 8f9e359e33c9f892e19f8be5d699bb2edcc3c99b Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 17 Jun 2026 20:57:34 +0530 Subject: [PATCH 24/51] Validate security report summary counts Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 9 +++++++++ schemas/test_security_report_validation.py | 14 ++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index c242f6d..b663d3c 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -132,6 +132,7 @@ def diagnose_security_report_example( if not isinstance(summary, dict): diagnostics.append(f"{ctx}: summary must be an object") return diagnostics + diagnose_summary_counts(summary, ctx, diagnostics) warning_derived_findings = [ projected_warning_finding(warning) @@ -375,6 +376,14 @@ def diagnose_report_required_fields(report, ctx, diagnostics): diagnostics.append(f"{ctx}: profile.{field} is required") +def diagnose_summary_counts(summary, ctx, diagnostics): + for code, count in summary.items(): + if not is_json_integer(count) or count < 0: + diagnostics.append( + f"{ctx}: summary.{code} must be a non-negative integer" + ) + + def diagnose_report_allowed_fields(report, ctx, diagnostics): if not isinstance(report, dict): return diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index 96eb6af..df334ff 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -466,6 +466,20 @@ def test_summary_must_match_all_report_finding_counts(self) -> None: diagnostics, ) + def test_summary_counts_must_be_non_negative_json_integers(self) -> None: + for value in (True, "1", -1): + with self.subTest(value=value): + report = copy.deepcopy(self.report) + report["summary"]["hidden_text_detected"] = value + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + "security-report.example.json: summary.hidden_text_detected " + "must be a non-negative integer", + diagnostics, + ) + def test_zero_count_summary_keys_must_be_omitted(self) -> None: report = copy.deepcopy(self.report) report["summary"]["image_only_page"] = 0 From 9c70fafdacefc6a14c28fd0fd2fa40543a3e8643 Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 17 Jun 2026 21:01:27 +0530 Subject: [PATCH 25/51] Validate security report attachment byte counts Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 13 +++++++++++++ schemas/test_security_report_validation.py | 16 ++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index b663d3c..e6b5a08 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -217,6 +217,7 @@ def diagnose_security_report_example( for name in ("annotations", "actions", "attachments", "scripts", "links") } diagnose_inventory_required_fields(inventory_lists, ctx, diagnostics) + diagnose_inventory_scalar_fields(inventory_lists, ctx, diagnostics) annotations = inventory_lists["annotations"] links = inventory_lists["links"] unsupported_annotations = [ @@ -576,6 +577,18 @@ def diagnose_inventory_required_fields(inventory_lists, ctx, diagnostics): ) +def diagnose_inventory_scalar_fields(inventory_lists, ctx, diagnostics): + for index, item in enumerate(inventory_lists.get("attachments", [])): + if not isinstance(item, dict) or "bytes" not in item: + continue + bytes_value = item.get("bytes") + if not is_json_integer(bytes_value) or bytes_value < 0: + diagnostics.append( + f"{ctx}: inventories.attachments[{index}].bytes must be a " + "non-negative integer" + ) + + def document_reference_index(payload): if not isinstance(payload, dict): return {"pages": {}, "elements": {}, "spans": {}} diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index df334ff..e18b69a 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -1043,6 +1043,22 @@ def test_required_inventory_item_fields_must_be_present(self) -> None: diagnostics, ) + def test_attachment_inventory_bytes_must_be_non_negative_json_integer(self) -> None: + for value in (True, False, "1", 1.0, -1): + with self.subTest(value=value): + report = copy.deepcopy(self.report) + report["inventories"]["attachments"] = [ + {"name": "attachment.bin", "bytes": value} + ] + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + "security-report.example.json: inventories.attachments[0].bytes " + "must be a non-negative integer", + diagnostics, + ) + def test_action_inventory_shape_is_checked_without_action_semantics(self) -> None: report = copy.deepcopy(self.report) report["inventories"]["actions"] = {"kind": "uri"} From 154aad6366b6c98931b9f8f3f3ad519b909eb981 Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 17 Jun 2026 21:06:23 +0530 Subject: [PATCH 26/51] Validate security report boolean fields Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 26 ++++++++++++++++ schemas/test_security_report_validation.py | 35 ++++++++++++++++++++++ 2 files changed, 61 insertions(+) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index e6b5a08..de93b4f 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -475,6 +475,12 @@ def diagnose_finding_exclusion_flags(findings, ctx, diagnostics): continue if "excluded_from_default_chunks" not in finding: continue + if not is_json_boolean(finding.get("excluded_from_default_chunks")): + diagnostics.append( + f"{ctx}: {finding_ctx(finding, index)} " + "excluded_from_default_chunks must be a boolean" + ) + continue expected = code in DEFAULT_CHUNK_EXCLUDED_CODES if finding.get("excluded_from_default_chunks") != expected: diagnostics.append( @@ -578,6 +584,14 @@ def diagnose_inventory_required_fields(inventory_lists, ctx, diagnostics): def diagnose_inventory_scalar_fields(inventory_lists, ctx, diagnostics): + for index, item in enumerate(inventory_lists.get("annotations", [])): + if not isinstance(item, dict) or "supported" not in item: + continue + if not is_json_boolean(item.get("supported")): + diagnostics.append( + f"{ctx}: inventories.annotations[{index}].supported must be a boolean" + ) + for index, item in enumerate(inventory_lists.get("attachments", [])): if not isinstance(item, dict) or "bytes" not in item: continue @@ -588,6 +602,14 @@ def diagnose_inventory_scalar_fields(inventory_lists, ctx, diagnostics): "non-negative integer" ) + for index, item in enumerate(inventory_lists.get("links", [])): + if not isinstance(item, dict) or "external" not in item: + continue + if not is_json_boolean(item.get("external")): + diagnostics.append( + f"{ctx}: inventories.links[{index}].external must be a boolean" + ) + def document_reference_index(payload): if not isinstance(payload, dict): @@ -735,6 +757,10 @@ def is_json_integer(value): return isinstance(value, int) and not isinstance(value, bool) +def is_json_boolean(value): + return isinstance(value, bool) + + def check_element_span_ownership(item, refs, ctx, item_ctx, span_ref, diagnostics): element_ref = item.get("element_ref") if element_ref is None: diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index e18b69a..cbd961a 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -597,6 +597,21 @@ def test_non_exclusion_finding_codes_must_not_be_default_excluded(self) -> None: diagnostics, ) + def test_excluded_from_default_chunks_must_be_boolean(self) -> None: + for index, value in ((0, 1), (1, 0), (0, "true")): + with self.subTest(index=index, value=value): + report = copy.deepcopy(self.report) + report["findings"][index]["excluded_from_default_chunks"] = value + + diagnostics = diagnose_security_report_example(self.document, report) + + finding_id = report["findings"][index]["id"] + self.assertIn( + f"security-report.example.json: finding {finding_id} " + "excluded_from_default_chunks must be a boolean", + diagnostics, + ) + def test_finding_page_refs_must_exist_in_document(self) -> None: report = copy.deepcopy(self.report) report["findings"][1]["page"] = "p9999" @@ -1059,6 +1074,26 @@ def test_attachment_inventory_bytes_must_be_non_negative_json_integer(self) -> N diagnostics, ) + def test_inventory_boolean_fields_must_be_boolean(self) -> None: + cases = ( + ("annotations", "supported", 1), + ("annotations", "supported", "false"), + ("links", "external", 1), + ("links", "external", "true"), + ) + for name, field, value in cases: + with self.subTest(name=name, field=field, value=value): + report = copy.deepcopy(self.report) + report["inventories"][name][0][field] = value + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + f"security-report.example.json: inventories.{name}[0].{field} " + "must be a boolean", + diagnostics, + ) + def test_action_inventory_shape_is_checked_without_action_semantics(self) -> None: report = copy.deepcopy(self.report) report["inventories"]["actions"] = {"kind": "uri"} From e6a87f2d716f29f87b8b0f2c78213540c6e79cb5 Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 17 Jun 2026 21:10:55 +0530 Subject: [PATCH 27/51] Validate security report script locations Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 11 +++++++++++ schemas/test_security_report_validation.py | 14 ++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index de93b4f..050eb1d 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -104,6 +104,8 @@ "links": ("page", "uri", "external", "bbox"), } +SCRIPT_LOCATIONS = {"document", "page", "annotation", "field", "other"} + def diagnose_security_report_example( document, @@ -602,6 +604,15 @@ def diagnose_inventory_scalar_fields(inventory_lists, ctx, diagnostics): "non-negative integer" ) + for index, item in enumerate(inventory_lists.get("scripts", [])): + if not isinstance(item, dict) or "location" not in item: + continue + if item.get("location") not in SCRIPT_LOCATIONS: + diagnostics.append( + f"{ctx}: inventories.scripts[{index}].location must be a " + "supported script location" + ) + for index, item in enumerate(inventory_lists.get("links", [])): if not isinstance(item, dict) or "external" not in item: continue diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index cbd961a..d06f8f8 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -1094,6 +1094,20 @@ def test_inventory_boolean_fields_must_be_boolean(self) -> None: diagnostics, ) + def test_script_inventory_location_must_be_supported(self) -> None: + for value in ("widget", "", 7, None): + with self.subTest(value=value): + report = copy.deepcopy(self.report) + report["inventories"]["scripts"] = [{"location": value}] + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + "security-report.example.json: inventories.scripts[0].location " + "must be a supported script location", + diagnostics, + ) + def test_action_inventory_shape_is_checked_without_action_semantics(self) -> None: report = copy.deepcopy(self.report) report["inventories"]["actions"] = {"kind": "uri"} From fa1a4fb518b922ad7a677fda364e5b492da45a8c Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 17 Jun 2026 21:15:29 +0530 Subject: [PATCH 28/51] Validate security report attachment digests Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 14 ++++++++++ schemas/test_security_report_validation.py | 30 ++++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index 050eb1d..5832057 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -105,6 +105,7 @@ } SCRIPT_LOCATIONS = {"document", "page", "annotation", "field", "other"} +LOWER_HEX_DIGITS = set("0123456789abcdef") def diagnose_security_report_example( @@ -603,6 +604,11 @@ def diagnose_inventory_scalar_fields(inventory_lists, ctx, diagnostics): f"{ctx}: inventories.attachments[{index}].bytes must be a " "non-negative integer" ) + if "sha256" in item and not is_lower_hex_sha256(item.get("sha256")): + diagnostics.append( + f"{ctx}: inventories.attachments[{index}].sha256 must be a " + "64-character lowercase hex digest" + ) for index, item in enumerate(inventory_lists.get("scripts", [])): if not isinstance(item, dict) or "location" not in item: @@ -772,6 +778,14 @@ def is_json_boolean(value): return isinstance(value, bool) +def is_lower_hex_sha256(value): + return ( + isinstance(value, str) + and len(value) == 64 + and all(char in LOWER_HEX_DIGITS for char in value) + ) + + def check_element_span_ownership(item, refs, ctx, item_ctx, span_ref, diagnostics): element_ref = item.get("element_ref") if element_ref is None: diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index d06f8f8..6c1c9e6 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -1074,6 +1074,36 @@ def test_attachment_inventory_bytes_must_be_non_negative_json_integer(self) -> N diagnostics, ) + def test_attachment_inventory_sha256_must_be_lowercase_hex_digest(self) -> None: + for value in ("abc", "g" * 64, "A" * 64, 64, None): + with self.subTest(value=value): + report = copy.deepcopy(self.report) + report["inventories"]["attachments"] = [ + {"name": "attachment.bin", "bytes": 0, "sha256": value} + ] + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + "security-report.example.json: inventories.attachments[0].sha256 " + "must be a 64-character lowercase hex digest", + diagnostics, + ) + + def test_attachment_inventory_sha256_accepts_lowercase_hex_digest(self) -> None: + report = copy.deepcopy(self.report) + report["inventories"]["attachments"] = [ + {"name": "attachment.bin", "bytes": 0, "sha256": "a" * 64} + ] + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertNotIn( + "security-report.example.json: inventories.attachments[0].sha256 " + "must be a 64-character lowercase hex digest", + diagnostics, + ) + def test_inventory_boolean_fields_must_be_boolean(self) -> None: cases = ( ("annotations", "supported", 1), From d22fa76d3c938c2d06721c1652f2a1ca56dcb1b6 Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 17 Jun 2026 21:21:25 +0530 Subject: [PATCH 29/51] Validate security report array items Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 8 +++++++ schemas/test_security_report_validation.py | 27 ++++++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index 5832057..9eac2e7 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -131,6 +131,7 @@ def diagnose_security_report_example( if not isinstance(findings, list): diagnostics.append(f"{ctx}: findings must be an array") return diagnostics + diagnose_array_item_objects(findings, "findings", ctx, diagnostics) summary = report.get("summary") if isinstance(report, dict) else {} if not isinstance(summary, dict): diagnostics.append(f"{ctx}: summary must be an object") @@ -571,9 +572,16 @@ def inventory_items(inventories, name, ctx, diagnostics): if not isinstance(items, list): diagnostics.append(f"{ctx}: inventories.{name} must be an array") return [] + diagnose_array_item_objects(items, f"inventories.{name}", ctx, diagnostics) return items +def diagnose_array_item_objects(items, path, ctx, diagnostics): + for index, item in enumerate(items): + if not isinstance(item, dict): + diagnostics.append(f"{ctx}: {path}[{index}] must be an object") + + def diagnose_inventory_required_fields(inventory_lists, ctx, diagnostics): for name, required_fields in INVENTORY_REQUIRED_FIELDS.items(): for index, item in enumerate(inventory_lists.get(name, [])): diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index 6c1c9e6..bb7038c 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -129,6 +129,19 @@ def test_top_level_report_fields_are_required(self) -> None: if field in identity_diagnostics: self.assertNotIn(identity_diagnostics[field], diagnostics) + def test_finding_items_must_be_objects(self) -> None: + for value in ("bad", [], None): + with self.subTest(value=value): + report = copy.deepcopy(self.report) + report["findings"].append(value) + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + "security-report.example.json: findings[3] must be an object", + diagnostics, + ) + def test_profile_must_be_object(self) -> None: report = copy.deepcopy(self.report) report["profile"] = [] @@ -986,6 +999,20 @@ def test_inventory_shape_must_be_deterministic_arrays(self) -> None: diagnostics, ) + def test_inventory_items_must_be_objects(self) -> None: + for name in ("annotations", "actions", "attachments", "scripts", "links"): + with self.subTest(name=name): + report = copy.deepcopy(self.report) + report["inventories"][name] = ["bad"] + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + f"security-report.example.json: inventories.{name}[0] " + "must be an object", + diagnostics, + ) + def test_required_inventory_lanes_must_be_present(self) -> None: for name in ("annotations", "actions", "attachments", "scripts", "links"): with self.subTest(name=name): From 3f6ad27edbd4d93419edafd1d7a834ffa8de0d3b Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 17 Jun 2026 21:27:06 +0530 Subject: [PATCH 30/51] Validate security report inventory strings Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 19 ++++++++++++++ schemas/test_security_report_validation.py | 30 ++++++++++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index 9eac2e7..b309860 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -104,6 +104,14 @@ "links": ("page", "uri", "external", "bbox"), } +INVENTORY_STRING_FIELDS = { + "annotations": ("kind",), + "actions": ("kind", "target"), + "attachments": ("name",), + "scripts": ("trigger",), + "links": ("uri",), +} + SCRIPT_LOCATIONS = {"document", "page", "annotation", "field", "other"} LOWER_HEX_DIGITS = set("0123456789abcdef") @@ -595,6 +603,17 @@ def diagnose_inventory_required_fields(inventory_lists, ctx, diagnostics): def diagnose_inventory_scalar_fields(inventory_lists, ctx, diagnostics): + for name, fields in INVENTORY_STRING_FIELDS.items(): + for index, item in enumerate(inventory_lists.get(name, [])): + if not isinstance(item, dict): + continue + for field in fields: + if field in item and not isinstance(item.get(field), str): + diagnostics.append( + f"{ctx}: inventories.{name}[{index}].{field} " + "must be a string" + ) + for index, item in enumerate(inventory_lists.get("annotations", [])): if not isinstance(item, dict) or "supported" not in item: continue diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index bb7038c..d56dc3c 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -1101,6 +1101,36 @@ def test_attachment_inventory_bytes_must_be_non_negative_json_integer(self) -> N diagnostics, ) + def test_inventory_string_fields_must_be_strings(self) -> None: + inventory_items = { + "annotations": {"page": "p0001", "kind": "link"}, + "actions": {"kind": "uri", "target": "https://example.com/q3"}, + "attachments": {"name": "attachment.bin", "bytes": 0}, + "scripts": {"location": "document", "trigger": "open"}, + "links": {"page": "p0001", "uri": "https://example.com/q3", "external": True}, + } + cases = ( + ("annotations", "kind", 7), + ("actions", "kind", False), + ("actions", "target", ["https://example.com/q3"]), + ("attachments", "name", None), + ("scripts", "trigger", 7), + ("links", "uri", ["https://example.com/q3"]), + ) + for name, field, value in cases: + with self.subTest(name=name, field=field, value=value): + report = copy.deepcopy(self.report) + report["inventories"][name] = [copy.deepcopy(inventory_items[name])] + report["inventories"][name][0][field] = value + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + f"security-report.example.json: inventories.{name}[0].{field} " + "must be a string", + diagnostics, + ) + def test_attachment_inventory_sha256_must_be_lowercase_hex_digest(self) -> None: for value in ("abc", "g" * 64, "A" * 64, 64, None): with self.subTest(value=value): From df77b2733e905879b559b372c951bdd8badc73a3 Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 17 Jun 2026 21:50:11 +0530 Subject: [PATCH 31/51] Validate security report identity patterns Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 106 ++++++++++++++++++++- schemas/test_security_report_validation.py | 80 ++++++++++++++++ 2 files changed, 185 insertions(+), 1 deletion(-) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index b309860..38d469f 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -114,6 +114,7 @@ SCRIPT_LOCATIONS = {"document", "page", "annotation", "field", "other"} LOWER_HEX_DIGITS = set("0123456789abcdef") +ASCII_DIGITS = set("0123456789") def diagnose_security_report_example( @@ -131,6 +132,7 @@ def diagnose_security_report_example( refs = document_reference_index(payload) diagnose_report_allowed_fields(report, ctx, diagnostics) diagnose_report_required_fields(report, ctx, diagnostics) + diagnose_report_identity_scalar_fields(report, ctx, diagnostics) diagnose_report_identity(document, report, ctx, diagnostics) diagnose_warning_lanes(security_warnings, parser_warnings, ctx, diagnostics) diagnose_security_warning_messages(security_warnings, ctx, diagnostics) @@ -367,10 +369,69 @@ def diagnose_report_identity(document, report, ctx, diagnostics): ), ) for key, want, actual_present, actual in comparisons: - if want is not None and actual_present and actual != want: + if ( + want is not None + and actual_present + and report_identity_scalar_valid(key, actual) + and actual != want + ): diagnostics.append(f"{ctx}: {key} diverges from document example") +def diagnose_report_identity_scalar_fields(report, ctx, diagnostics): + if not isinstance(report, dict): + return + scalar_checks = ( + ( + "schema_version", + report.get("schema_version"), + "schema_version" in report, + is_numeric_version, + r"must match pattern ^[0-9]+\.[0-9]+\.[0-9]+$", + ), + ( + "document_fingerprint", + report.get("document_fingerprint"), + "document_fingerprint" in report, + is_sha256_fingerprint, + "must match pattern ^sha256:[0-9a-f]{64}$", + ), + ( + "source_fingerprint", + report.get("source_fingerprint"), + "source_fingerprint" in report, + is_sha256_fingerprint, + "must match pattern ^sha256:[0-9a-f]{64}$", + ), + ) + for key, value, present, predicate, message in scalar_checks: + if present and not predicate(value): + diagnostics.append(f"{ctx}: {key} {message}") + + profile = report.get("profile") + if not isinstance(profile, dict): + return + profile_checks = ( + ( + "profile.id", + profile.get("id"), + "id" in profile, + is_deterministic_profile_id, + "must match pattern ^ethos-deterministic-v[0-9]+$", + ), + ( + "profile.sha256", + profile.get("sha256"), + "sha256" in profile, + is_lower_hex_sha256, + "must match pattern ^[0-9a-f]{64}$", + ), + ) + for key, value, present, predicate, message in profile_checks: + if present and not predicate(value): + diagnostics.append(f"{ctx}: {key} {message}") + + def diagnose_report_required_fields(report, ctx, diagnostics): if not isinstance(report, dict): diagnostics.append(f"{ctx}: report must be an object") @@ -813,6 +874,49 @@ def is_lower_hex_sha256(value): ) +def is_numeric_version(value): + if not isinstance(value, str): + return False + parts = value.split(".") + return len(parts) == 3 and all(is_ascii_digits(part) for part in parts) + + +def is_sha256_fingerprint(value): + return ( + isinstance(value, str) + and value.startswith("sha256:") + and is_lower_hex_sha256(value[len("sha256:") :]) + ) + + +def is_deterministic_profile_id(value): + prefix = "ethos-deterministic-v" + return ( + isinstance(value, str) + and value.startswith(prefix) + and is_ascii_digits(value[len(prefix) :]) + ) + + +def report_identity_scalar_valid(key, value): + checks = { + "schema_version": is_numeric_version, + "document_fingerprint": is_sha256_fingerprint, + "source_fingerprint": is_sha256_fingerprint, + "profile.id": is_deterministic_profile_id, + "profile.sha256": is_lower_hex_sha256, + } + return checks[key](value) + + +def is_ascii_digits(value): + return ( + isinstance(value, str) + and value != "" + and all(char in ASCII_DIGITS for char in value) + ) + + def check_element_span_ownership(item, refs, ctx, item_ctx, span_ref, diagnostics): element_ref = item.get("element_ref") if element_ref is None: diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index d56dc3c..d008a13 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -93,6 +93,86 @@ def test_profile_identity_must_match_document_profile(self) -> None: diagnostics, ) + def test_report_identity_scalar_fields_must_match_schema_patterns(self) -> None: + cases = ( + ( + ("schema_version",), + "1.0", + r"security-report.example.json: schema_version must match " + r"pattern ^[0-9]+\.[0-9]+\.[0-9]+$", + "security-report.example.json: schema_version diverges from " + "document example", + ), + ( + ("schema_version",), + 100, + r"security-report.example.json: schema_version must match " + r"pattern ^[0-9]+\.[0-9]+\.[0-9]+$", + "security-report.example.json: schema_version diverges from " + "document example", + ), + ( + ("document_fingerprint",), + "sha256:" + ("g" * 64), + "security-report.example.json: document_fingerprint must match " + "pattern ^sha256:[0-9a-f]{64}$", + "security-report.example.json: document_fingerprint diverges from " + "document example", + ), + ( + ("document_fingerprint",), + None, + "security-report.example.json: document_fingerprint must match " + "pattern ^sha256:[0-9a-f]{64}$", + "security-report.example.json: document_fingerprint diverges from " + "document example", + ), + ( + ("source_fingerprint",), + "5f70bf18a086007016e948b04aed3b82103a36bea41755b6cddfaf10ace3c6ef", + "security-report.example.json: source_fingerprint must match " + "pattern ^sha256:[0-9a-f]{64}$", + "security-report.example.json: source_fingerprint diverges from " + "document example", + ), + ( + ("profile", "id"), + "ethos-deterministic-v", + "security-report.example.json: profile.id must match " + "pattern ^ethos-deterministic-v[0-9]+$", + "security-report.example.json: profile.id diverges from " + "document example", + ), + ( + ("profile", "id"), + None, + "security-report.example.json: profile.id must match " + "pattern ^ethos-deterministic-v[0-9]+$", + "security-report.example.json: profile.id diverges from " + "document example", + ), + ( + ("profile", "sha256"), + "A" * 64, + "security-report.example.json: profile.sha256 must match " + "pattern ^[0-9a-f]{64}$", + "security-report.example.json: profile.sha256 diverges from " + "document example", + ), + ) + for path, value, expected_diagnostic, divergence_diagnostic in cases: + with self.subTest(path=".".join(path)): + report = copy.deepcopy(self.report) + target = report + for key in path[:-1]: + target = target[key] + target[path[-1]] = value + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn(expected_diagnostic, diagnostics) + self.assertNotIn(divergence_diagnostic, diagnostics) + def test_report_must_be_object(self) -> None: diagnostics = diagnose_security_report_example(self.document, []) From 5ad12627ebf3a77c4b048285e7645fd125958625 Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 17 Jun 2026 22:00:06 +0530 Subject: [PATCH 32/51] Validate security report finding strings Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 28 +++++++++++++++++--- schemas/test_security_report_validation.py | 30 ++++++++++++++++++++++ 2 files changed, 54 insertions(+), 4 deletions(-) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index 38d469f..537f51c 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -96,6 +96,8 @@ "excluded_from_default_chunks", ) +FINDING_STRING_FIELDS = ("message", "text_preview") + INVENTORY_ALLOWED_FIELDS = { "annotations": ("page", "kind", "bbox", "supported"), "actions": ("kind", "page", "target"), @@ -216,6 +218,7 @@ def diagnose_security_report_example( diagnose_finding_allowed_fields(findings, ctx, diagnostics) diagnose_finding_ids(findings, ctx, diagnostics) diagnose_finding_codes(findings, ctx, diagnostics) + diagnose_finding_scalar_fields(findings, ctx, diagnostics) diagnose_finding_messages(findings, ctx, diagnostics) diagnose_finding_exclusion_flags(findings, ctx, diagnostics) diagnose_findings_references(findings, refs, ctx, diagnostics) @@ -522,6 +525,17 @@ def diagnose_finding_codes(findings, ctx, diagnostics): ) +def diagnose_finding_scalar_fields(findings, ctx, diagnostics): + for index, finding in enumerate(findings): + if not isinstance(finding, dict): + continue + for field in FINDING_STRING_FIELDS: + if field in finding and not isinstance(finding.get(field), str): + diagnostics.append( + f"{ctx}: {finding_ctx(finding, index)}.{field} must be a string" + ) + + def diagnose_finding_messages(findings, ctx, diagnostics): for index, finding in enumerate(findings): if not isinstance(finding, dict): @@ -533,6 +547,8 @@ def diagnose_finding_messages(findings, ctx, diagnostics): if "message" not in finding: continue actual_message = finding.get("message") + if not isinstance(actual_message, str): + continue if actual_message != expected_message: diagnostics.append( f"{ctx}: {finding_ctx(finding, index)} message must match fixed template for {code}" @@ -846,10 +862,14 @@ def check_text_backed_finding(finding, refs, ctx, item_ctx, diagnostics): diagnostics.append( f"{ctx}: {item_ctx} span_ref {span_ref} requires text_preview" ) - elif finding.get("text_preview") != expected_preview: - diagnostics.append( - f"{ctx}: {item_ctx} text_preview must match span_ref {span_ref} text" - ) + else: + text_preview = finding.get("text_preview") + if not isinstance(text_preview, str): + return + if text_preview != expected_preview: + diagnostics.append( + f"{ctx}: {item_ctx} text_preview must match span_ref {span_ref} text" + ) def deterministic_preview(text): diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index d008a13..d404c48 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -374,6 +374,36 @@ def test_finding_required_fields_must_be_present(self) -> None: for diagnostic in suppressed_projection_diagnostics: self.assertNotIn(diagnostic, diagnostics) + def test_finding_string_fields_must_be_strings(self) -> None: + cases = ( + ( + 0, + "message", + 7, + "security-report.example.json: finding f0001.message must be a string", + "security-report.example.json: finding f0001 message must match " + "fixed template for hidden_text_detected", + ), + ( + 0, + "text_preview", + ["internal-draft-do-not-cite"], + "security-report.example.json: finding f0001.text_preview " + "must be a string", + "security-report.example.json: finding f0001 text_preview must match " + "span_ref s000003 text", + ), + ) + for index, field, value, expected_diagnostic, suppressed_diagnostic in cases: + with self.subTest(field=field, value=value): + report = copy.deepcopy(self.report) + report["findings"][index][field] = value + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn(expected_diagnostic, diagnostics) + self.assertNotIn(suppressed_diagnostic, diagnostics) + def test_unexpected_finding_fields_fail_closed(self) -> None: report = copy.deepcopy(self.report) report["findings"][0]["unexpected"] = True From 439151a128d893da60578618d5f3f78cde4c9e68 Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 17 Jun 2026 22:11:43 +0530 Subject: [PATCH 33/51] Validate security report page locators Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 39 ++++++++++++++-- schemas/test_security_report_validation.py | 54 ++++++++++++++++++++++ 2 files changed, 88 insertions(+), 5 deletions(-) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index 537f51c..3e1f248 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -758,15 +758,18 @@ def diagnose_findings_references(findings, refs, ctx, diagnostics): continue item_ctx = finding_ctx(finding, index) page = finding.get("page") + page_shape_valid = True if page is not None: - check_page_ref(page, refs, ctx, item_ctx, diagnostics) + page_shape_valid = check_page_shape(page, ctx, item_ctx, diagnostics) + if page_shape_valid: + check_page_ref(page, refs, ctx, item_ctx, diagnostics) check_locator_ref( finding, "element_ref", "elements", refs, ctx, item_ctx, diagnostics ) check_locator_ref( finding, "span_ref", "spans", refs, ctx, item_ctx, diagnostics ) - if "bbox" in finding: + if "bbox" in finding and page_shape_valid: check_bbox(finding.get("bbox"), page, refs, ctx, item_ctx, diagnostics) check_text_backed_finding(finding, refs, ctx, item_ctx, diagnostics) @@ -778,9 +781,12 @@ def diagnose_inventory_references(inventory_lists, refs, ctx, diagnostics): continue item_ctx = f"inventories.{name}[{index}]" page = item.get("page") + page_shape_valid = True if page is not None: - check_page_ref(page, refs, ctx, item_ctx, diagnostics) - if "bbox" in item: + page_shape_valid = check_page_shape(page, ctx, item_ctx, diagnostics) + if page_shape_valid: + check_page_ref(page, refs, ctx, item_ctx, diagnostics) + if "bbox" in item and page_shape_valid: check_bbox(item.get("bbox"), page, refs, ctx, item_ctx, diagnostics) @@ -794,7 +800,12 @@ def check_locator_ref(item, key, ref_kind, refs, ctx, item_ctx, diagnostics): return page = item.get("page") target_page = target.get("page") if isinstance(target, dict) else None - if page is not None and target_page is not None and page != target_page: + if ( + page is not None + and is_page_ref(page) + and target_page is not None + and page != target_page + ): diagnostics.append( f"{ctx}: {item_ctx} {key} {ref} page {target_page} does not match page {page}" ) @@ -809,6 +820,15 @@ def check_page_ref(page, refs, ctx, item_ctx, diagnostics): return refs["pages"][page] +def check_page_shape(page, ctx, item_ctx, diagnostics): + if not is_page_ref(page): + diagnostics.append( + f"{ctx}: {item_ctx}.page must match pattern ^p[0-9]{{4}}$" + ) + return False + return True + + def check_bbox(bbox, page, refs, ctx, item_ctx, diagnostics): if page is None: diagnostics.append(f"{ctx}: {item_ctx} bbox requires page") @@ -929,6 +949,15 @@ def report_identity_scalar_valid(key, value): return checks[key](value) +def is_page_ref(value): + return ( + isinstance(value, str) + and len(value) == 5 + and value.startswith("p") + and is_ascii_digits(value[1:]) + ) + + def is_ascii_digits(value): return ( isinstance(value, str) diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index d404c48..8f137d5 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -746,6 +746,29 @@ def test_finding_page_refs_must_exist_in_document(self) -> None: diagnostics, ) + def test_finding_page_refs_must_match_schema_pattern(self) -> None: + for value in ("page-1", []): + with self.subTest(value=value): + report = copy.deepcopy(self.report) + report["findings"][1]["page"] = value + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + "security-report.example.json: finding f0002.page must match " + "pattern ^p[0-9]{4}$", + diagnostics, + ) + self.assertFalse( + any( + diagnostic.startswith( + "security-report.example.json: finding f0002 " + "references unknown page" + ) + for diagnostic in diagnostics + ) + ) + def test_finding_element_refs_must_exist_in_document(self) -> None: report = copy.deepcopy(self.report) report["findings"][1]["element_ref"] = "e999999" @@ -1010,6 +1033,37 @@ def test_inventory_page_refs_must_exist_in_document(self) -> None: diagnostics, ) + def test_inventory_page_refs_must_match_schema_pattern(self) -> None: + cases = ( + ("annotations", "page-1"), + ("actions", []), + ("scripts", "1"), + ("links", []), + ) + for name, value in cases: + with self.subTest(name=name, value=value): + report = copy.deepcopy(self.report) + if not report["inventories"][name]: + report["inventories"][name] = [{"location": "document"}] + report["inventories"][name][0]["page"] = value + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + f"security-report.example.json: inventories.{name}[0].page " + "must match pattern ^p[0-9]{4}$", + diagnostics, + ) + self.assertFalse( + any( + diagnostic.startswith( + f"security-report.example.json: inventories.{name}[0] " + "references unknown page" + ) + for diagnostic in diagnostics + ) + ) + def test_inventory_bbox_must_have_page(self) -> None: report = copy.deepcopy(self.report) report["inventories"]["links"][0].pop("page") From a06a8422805eeaa4832e3b922804134aa4648eee Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 17 Jun 2026 23:14:18 +0530 Subject: [PATCH 34/51] Validate security report locator refs Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 39 ++++++++++++- schemas/test_security_report_validation.py | 64 ++++++++++++++++++++++ 2 files changed, 102 insertions(+), 1 deletion(-) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index 3e1f248..dd11120 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -794,6 +794,8 @@ def check_locator_ref(item, key, ref_kind, refs, ctx, item_ctx, diagnostics): ref = item.get(key) if ref is None: return + if not check_locator_shape(ref, key, ctx, item_ctx, diagnostics): + return target = refs[ref_kind].get(ref) if target is None: diagnostics.append(f"{ctx}: {item_ctx} references unknown {key} {ref}") @@ -829,6 +831,21 @@ def check_page_shape(page, ctx, item_ctx, diagnostics): return True +def check_locator_shape(ref, key, ctx, item_ctx, diagnostics): + if key == "element_ref": + pattern = "^e[0-9]{6}$" + valid = is_element_ref(ref) + elif key == "span_ref": + pattern = "^s[0-9]{6}$" + valid = is_span_ref(ref) + else: + return True + if not valid: + diagnostics.append(f"{ctx}: {item_ctx}.{key} must match pattern {pattern}") + return False + return True + + def check_bbox(bbox, page, refs, ctx, item_ctx, diagnostics): if page is None: diagnostics.append(f"{ctx}: {item_ctx} bbox requires page") @@ -865,6 +882,8 @@ def check_text_backed_finding(finding, refs, ctx, item_ctx, diagnostics): f"{ctx}: {item_ctx} requires span_ref for {finding.get('code')}" ) return + if not is_span_ref(span_ref): + return span = refs["spans"].get(span_ref) if not isinstance(span, dict): return @@ -958,6 +977,24 @@ def is_page_ref(value): ) +def is_element_ref(value): + return ( + isinstance(value, str) + and len(value) == 7 + and value.startswith("e") + and is_ascii_digits(value[1:]) + ) + + +def is_span_ref(value): + return ( + isinstance(value, str) + and len(value) == 7 + and value.startswith("s") + and is_ascii_digits(value[1:]) + ) + + def is_ascii_digits(value): return ( isinstance(value, str) @@ -968,7 +1005,7 @@ def is_ascii_digits(value): def check_element_span_ownership(item, refs, ctx, item_ctx, span_ref, diagnostics): element_ref = item.get("element_ref") - if element_ref is None: + if element_ref is None or not is_element_ref(element_ref): return element = refs["elements"].get(element_ref) if not isinstance(element, dict): diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index 8f137d5..2432450 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -780,6 +780,70 @@ def test_finding_element_refs_must_exist_in_document(self) -> None: diagnostics, ) + def test_finding_element_refs_must_match_schema_pattern(self) -> None: + for value in ("element-1", []): + with self.subTest(value=value): + report = copy.deepcopy(self.report) + report["findings"][1]["element_ref"] = value + report["findings"][1]["span_ref"] = "s000003" + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + "security-report.example.json: finding f0002.element_ref " + "must match pattern ^e[0-9]{6}$", + diagnostics, + ) + self.assertFalse( + any( + diagnostic.startswith( + "security-report.example.json: finding f0002 " + "references unknown element_ref" + ) + for diagnostic in diagnostics + ) + ) + self.assertFalse( + any( + "owned by element_ref" in diagnostic + for diagnostic in diagnostics + ) + ) + + def test_finding_span_refs_must_exist_in_document(self) -> None: + report = copy.deepcopy(self.report) + report["findings"][0]["span_ref"] = "s999999" + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + "security-report.example.json: finding f0001 references unknown span_ref s999999", + diagnostics, + ) + + def test_finding_span_refs_must_match_schema_pattern(self) -> None: + for value in ("span-1", []): + with self.subTest(value=value): + report = copy.deepcopy(self.report) + report["findings"][0]["span_ref"] = value + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + "security-report.example.json: finding f0001.span_ref " + "must match pattern ^s[0-9]{6}$", + diagnostics, + ) + self.assertFalse( + any( + diagnostic.startswith( + "security-report.example.json: finding f0001 " + "references unknown span_ref" + ) + for diagnostic in diagnostics + ) + ) + def test_finding_span_refs_must_match_finding_page(self) -> None: document = copy.deepcopy(self.document) document["payload"]["pages"].append( From 198fe3f844e155ab9bcd17bbc9d2e70369f3793f Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 17 Jun 2026 23:23:55 +0530 Subject: [PATCH 35/51] Validate security report finding ids Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 21 +++++++++++++++++---- schemas/test_security_report_validation.py | 19 +++++++++++++++++++ 2 files changed, 36 insertions(+), 4 deletions(-) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index dd11120..83fadec 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -480,15 +480,19 @@ def diagnose_finding_ids(findings, ctx, diagnostics): if "id" not in finding: continue finding_id = finding.get("id") + if not is_finding_id(finding_id): + diagnostics.append( + f"{ctx}: findings[{index}].id must match pattern ^f[0-9]{{4}}$" + ) + continue expected_id = f"f{index + 1:04d}" if finding_id != expected_id: diagnostics.append( f"{ctx}: findings[{index}].id must be {expected_id} for deterministic numbering" ) - if isinstance(finding_id, str): - if finding_id in seen: - diagnostics.append(f"{ctx}: duplicate finding id {finding_id}") - seen.add(finding_id) + if finding_id in seen: + diagnostics.append(f"{ctx}: duplicate finding id {finding_id}") + seen.add(finding_id) def diagnose_finding_required_fields(findings, ctx, diagnostics): @@ -995,6 +999,15 @@ def is_span_ref(value): ) +def is_finding_id(value): + return ( + isinstance(value, str) + and len(value) == 5 + and value.startswith("f") + and is_ascii_digits(value[1:]) + ) + + def is_ascii_digits(value): return ( isinstance(value, str) diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index 2432450..7995783 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -287,6 +287,25 @@ def test_finding_ids_must_be_contiguous_in_report_order(self) -> None: diagnostics, ) + def test_finding_ids_must_match_schema_pattern(self) -> None: + for value in ("finding-1", "f001", "F0001", []): + with self.subTest(value=value): + report = copy.deepcopy(self.report) + report["findings"][0]["id"] = value + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + "security-report.example.json: findings[0].id must match " + "pattern ^f[0-9]{4}$", + diagnostics, + ) + self.assertNotIn( + "security-report.example.json: findings[0].id must be f0001 " + "for deterministic numbering", + diagnostics, + ) + def test_finding_ids_must_be_unique(self) -> None: report = copy.deepcopy(self.report) report["findings"][1]["id"] = "f0001" From 523a9b0b2a359abf5765dfc5940834134f9fddc5 Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 17 Jun 2026 23:26:47 +0530 Subject: [PATCH 36/51] Harden security report script locations Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 3 ++- schemas/test_security_report_validation.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index 83fadec..a1b7764 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -721,7 +721,8 @@ def diagnose_inventory_scalar_fields(inventory_lists, ctx, diagnostics): for index, item in enumerate(inventory_lists.get("scripts", [])): if not isinstance(item, dict) or "location" not in item: continue - if item.get("location") not in SCRIPT_LOCATIONS: + location = item.get("location") + if not isinstance(location, str) or location not in SCRIPT_LOCATIONS: diagnostics.append( f"{ctx}: inventories.scripts[{index}].location must be a " "supported script location" diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index 7995783..458bb18 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -1429,7 +1429,7 @@ def test_inventory_boolean_fields_must_be_boolean(self) -> None: ) def test_script_inventory_location_must_be_supported(self) -> None: - for value in ("widget", "", 7, None): + for value in ("widget", "", 7, None, [], {}): with self.subTest(value=value): report = copy.deepcopy(self.report) report["inventories"]["scripts"] = [{"location": value}] From 0a15c97bdc430f430e938806ff13ccf67ef44b85 Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 17 Jun 2026 23:44:54 +0530 Subject: [PATCH 37/51] Harden security report code diagnostics Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 42 ++++++++++++++----- schemas/test_security_report_validation.py | 47 +++++++++++++++++++++- 2 files changed, 79 insertions(+), 10 deletions(-) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index a1b7764..234ac7d 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -153,7 +153,11 @@ def diagnose_security_report_example( warning_derived_findings = [ projected_warning_finding(warning) for warning in security_warnings - if isinstance(warning, dict) and warning.get("code") in REPORTABLE_WARNING_CODES + if ( + isinstance(warning, dict) + and isinstance(warning.get("code"), str) + and warning.get("code") in REPORTABLE_WARNING_CODES + ) ] actual_projected_findings = [ project_report_finding(finding) @@ -178,7 +182,7 @@ def diagnose_security_report_example( if not isinstance(finding, dict): continue code = finding.get("code") - if code not in WARNING_DERIVED_FINDING_CODES: + if not isinstance(code, str) or code not in WARNING_DERIVED_FINDING_CODES: continue if projected_finding_fields_missing(finding): continue @@ -288,7 +292,14 @@ def diagnose_warning_lanes(security_warnings, parser_warnings, ctx, diagnostics) for warning in parser_warnings: if not isinstance(warning, dict): continue + if "code" not in warning: + continue code = warning.get("code") + if not isinstance(code, str): + diagnostics.append( + f"{ctx}: parser warning {warning_id(warning)} code must be a string" + ) + continue if code in SECURITY_WARNING_CODES: diagnostics.append( f"{ctx}: parser warning {warning_id(warning)} ({code}) " @@ -298,8 +309,15 @@ def diagnose_warning_lanes(security_warnings, parser_warnings, ctx, diagnostics) for warning in security_warnings: if not isinstance(warning, dict): continue + if "code" not in warning: + continue code = warning.get("code") - if isinstance(code, str) and code not in SECURITY_WARNING_CODES: + if not isinstance(code, str): + diagnostics.append( + f"{ctx}: security warning {warning_id(warning)} code must be a string" + ) + continue + if code not in SECURITY_WARNING_CODES: diagnostics.append( f"{ctx}: security warning {warning_id(warning)} ({code}) " "is not a security warning code" @@ -311,6 +329,8 @@ def diagnose_security_warning_messages(security_warnings, ctx, diagnostics): if not isinstance(warning, dict): continue code = warning.get("code") + if not isinstance(code, str): + continue expected_message = FINDING_MESSAGE_TEMPLATES.get(code) if expected_message is None: continue @@ -323,10 +343,13 @@ def diagnose_security_warning_messages(security_warnings, ctx, diagnostics): def projected_warning_finding(warning): + code = warning.get("code") projected = { - "code": warning.get("code"), + "code": code, "message": warning.get("message"), - "excluded_from_default_chunks": warning.get("code") in DEFAULT_CHUNK_EXCLUDED_CODES, + "excluded_from_default_chunks": ( + isinstance(code, str) and code in DEFAULT_CHUNK_EXCLUDED_CODES + ), } for key in ("page", "element_ref", "span_ref"): if key in warning: @@ -545,6 +568,8 @@ def diagnose_finding_messages(findings, ctx, diagnostics): if not isinstance(finding, dict): continue code = finding.get("code") + if not isinstance(code, str): + continue expected_message = FINDING_MESSAGE_TEMPLATES.get(code) if expected_message is None: continue @@ -879,13 +904,12 @@ def check_bbox(bbox, page, refs, ctx, item_ctx, diagnostics): def check_text_backed_finding(finding, refs, ctx, item_ctx, diagnostics): - if finding.get("code") not in TEXT_BACKED_FINDING_CODES: + code = finding.get("code") + if not isinstance(code, str) or code not in TEXT_BACKED_FINDING_CODES: return span_ref = finding.get("span_ref") if span_ref is None: - diagnostics.append( - f"{ctx}: {item_ctx} requires span_ref for {finding.get('code')}" - ) + diagnostics.append(f"{ctx}: {item_ctx} requires span_ref for {code}") return if not is_span_ref(span_ref): return diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index 458bb18..921439c 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -338,7 +338,7 @@ def test_finding_codes_must_be_security_report_codes(self) -> None: ) def test_finding_codes_are_required(self) -> None: - for value in (None, 7): + for value in (None, 7, []): with self.subTest(value=value): report = copy.deepcopy(self.report) if value is None: @@ -1492,6 +1492,29 @@ def test_security_codes_in_parser_warnings_fail_closed(self) -> None: diagnostics, ) + def test_parser_warning_codes_must_be_strings(self) -> None: + document = copy.deepcopy(self.document) + document["payload"]["parser_warnings"].append( + { + "id": "w0099", + "code": [], + "message": "parser warning code drifted", + "page": "p0001", + } + ) + + diagnostics = diagnose_security_report_example(document, self.report) + + self.assertIn( + "security-report.example.json: parser warning w0099 code must be a string", + diagnostics, + ) + self.assertNotIn( + "security-report.example.json: parser warning w0099 ([]) " + "must be in security_warnings", + diagnostics, + ) + def test_parser_codes_in_security_warnings_fail_closed(self) -> None: document = copy.deepcopy(self.document) document["payload"]["security_warnings"].append( @@ -1511,6 +1534,28 @@ def test_parser_codes_in_security_warnings_fail_closed(self) -> None: diagnostics, ) + def test_security_warning_codes_must_be_strings(self) -> None: + document = copy.deepcopy(self.document) + document["payload"]["security_warnings"].append( + { + "id": "w0099", + "code": [], + "message": "security warning code drifted", + "page": "p0001", + } + ) + + diagnostics = diagnose_security_report_example(document, self.report) + + self.assertIn( + "security-report.example.json: security warning w0099 code must be a string", + diagnostics, + ) + self.assertNotIn( + "security-report.example.json: missing warning-derived finding for []", + diagnostics, + ) + if __name__ == "__main__": unittest.main() From fbcd5ffc811387e468e1b2a157088049e6109c46 Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 17 Jun 2026 23:56:50 +0530 Subject: [PATCH 38/51] Validate null security report pages Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 4 ++-- schemas/test_security_report_validation.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index 234ac7d..d91f273 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -789,7 +789,7 @@ def diagnose_findings_references(findings, refs, ctx, diagnostics): item_ctx = finding_ctx(finding, index) page = finding.get("page") page_shape_valid = True - if page is not None: + if "page" in finding: page_shape_valid = check_page_shape(page, ctx, item_ctx, diagnostics) if page_shape_valid: check_page_ref(page, refs, ctx, item_ctx, diagnostics) @@ -812,7 +812,7 @@ def diagnose_inventory_references(inventory_lists, refs, ctx, diagnostics): item_ctx = f"inventories.{name}[{index}]" page = item.get("page") page_shape_valid = True - if page is not None: + if "page" in item: page_shape_valid = check_page_shape(page, ctx, item_ctx, diagnostics) if page_shape_valid: check_page_ref(page, refs, ctx, item_ctx, diagnostics) diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index 921439c..e884c06 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -766,7 +766,7 @@ def test_finding_page_refs_must_exist_in_document(self) -> None: ) def test_finding_page_refs_must_match_schema_pattern(self) -> None: - for value in ("page-1", []): + for value in ("page-1", [], None): with self.subTest(value=value): report = copy.deepcopy(self.report) report["findings"][1]["page"] = value @@ -1120,6 +1120,7 @@ def test_inventory_page_refs_must_match_schema_pattern(self) -> None: cases = ( ("annotations", "page-1"), ("actions", []), + ("actions", None), ("scripts", "1"), ("links", []), ) From 4a5fb6d19df3b938467eb7e89aeeaa468242f979 Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Thu, 18 Jun 2026 00:01:54 +0530 Subject: [PATCH 39/51] Validate security report script triggers Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 5 +++++ schemas/test_security_report_validation.py | 19 +++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index d91f273..9d9db0c 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -752,6 +752,11 @@ def diagnose_inventory_scalar_fields(inventory_lists, ctx, diagnostics): f"{ctx}: inventories.scripts[{index}].location must be a " "supported script location" ) + trigger = item.get("trigger") + if isinstance(trigger, str) and trigger != trigger.lower(): + diagnostics.append( + f"{ctx}: inventories.scripts[{index}].trigger must be lowercase" + ) for index, item in enumerate(inventory_lists.get("links", [])): if not isinstance(item, dict) or "external" not in item: diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index e884c06..816585b 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -1443,6 +1443,25 @@ def test_script_inventory_location_must_be_supported(self) -> None: diagnostics, ) + def test_script_inventory_trigger_must_be_lowercase(self) -> None: + report = copy.deepcopy(self.report) + report["inventories"]["scripts"] = [ + {"location": "document", "trigger": "Open"} + ] + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + "security-report.example.json: inventories.scripts[0].trigger " + "must be lowercase", + diagnostics, + ) + self.assertNotIn( + "security-report.example.json: inventories.scripts[0].trigger " + "must be a string", + diagnostics, + ) + def test_action_inventory_shape_is_checked_without_action_semantics(self) -> None: report = copy.deepcopy(self.report) report["inventories"]["actions"] = {"kind": "uri"} From 7c939271fe97137762f2069825d67181c6a006d5 Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Thu, 18 Jun 2026 00:37:15 +0530 Subject: [PATCH 40/51] Validate null security report locators Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 8 +++---- schemas/test_security_report_validation.py | 26 +++++++++++++++++++++- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index 9d9db0c..4d2026d 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -826,9 +826,9 @@ def diagnose_inventory_references(inventory_lists, refs, ctx, diagnostics): def check_locator_ref(item, key, ref_kind, refs, ctx, item_ctx, diagnostics): - ref = item.get(key) - if ref is None: + if key not in item: return + ref = item.get(key) if not check_locator_shape(ref, key, ctx, item_ctx, diagnostics): return target = refs[ref_kind].get(ref) @@ -912,10 +912,10 @@ def check_text_backed_finding(finding, refs, ctx, item_ctx, diagnostics): code = finding.get("code") if not isinstance(code, str) or code not in TEXT_BACKED_FINDING_CODES: return - span_ref = finding.get("span_ref") - if span_ref is None: + if "span_ref" not in finding: diagnostics.append(f"{ctx}: {item_ctx} requires span_ref for {code}") return + span_ref = finding.get("span_ref") if not is_span_ref(span_ref): return span = refs["spans"].get(span_ref) diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index 816585b..fe2014e 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -800,7 +800,7 @@ def test_finding_element_refs_must_exist_in_document(self) -> None: ) def test_finding_element_refs_must_match_schema_pattern(self) -> None: - for value in ("element-1", []): + for value in ("element-1", [], None): with self.subTest(value=value): report = copy.deepcopy(self.report) report["findings"][1]["element_ref"] = value @@ -862,6 +862,30 @@ def test_finding_span_refs_must_match_schema_pattern(self) -> None: for diagnostic in diagnostics ) ) + self.assertFalse( + any("requires span_ref" in diagnostic for diagnostic in diagnostics) + ) + + def test_present_null_finding_span_refs_must_match_schema_pattern(self) -> None: + report = copy.deepcopy(self.report) + report["findings"][1]["span_ref"] = None + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + "security-report.example.json: finding f0002.span_ref " + "must match pattern ^s[0-9]{6}$", + diagnostics, + ) + self.assertFalse( + any( + diagnostic.startswith( + "security-report.example.json: finding f0002 " + "references unknown span_ref" + ) + for diagnostic in diagnostics + ) + ) def test_finding_span_refs_must_match_finding_page(self) -> None: document = copy.deepcopy(self.document) From 2b431caff0faa5f8de0e6a39073854e6762b5a99 Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Thu, 18 Jun 2026 00:47:22 +0530 Subject: [PATCH 41/51] Validate security report inventory kinds Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 10 ++++++++++ schemas/test_security_report_validation.py | 23 ++++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index 4d2026d..79365e1 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -728,6 +728,16 @@ def diagnose_inventory_scalar_fields(inventory_lists, ctx, diagnostics): f"{ctx}: inventories.annotations[{index}].supported must be a boolean" ) + for name in ("annotations", "actions"): + for index, item in enumerate(inventory_lists.get(name, [])): + if not isinstance(item, dict): + continue + kind = item.get("kind") + if isinstance(kind, str) and kind != kind.lower(): + diagnostics.append( + f"{ctx}: inventories.{name}[{index}].kind must be lowercase" + ) + for index, item in enumerate(inventory_lists.get("attachments", [])): if not isinstance(item, dict) or "bytes" not in item: continue diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index fe2014e..9ab4a21 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -1403,6 +1403,29 @@ def test_inventory_string_fields_must_be_strings(self) -> None: diagnostics, ) + def test_inventory_kind_fields_must_be_lowercase(self) -> None: + cases = ( + ("annotations", "Link"), + ("actions", "URI"), + ) + for name, value in cases: + with self.subTest(name=name): + report = copy.deepcopy(self.report) + report["inventories"][name][0]["kind"] = value + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + f"security-report.example.json: inventories.{name}[0].kind " + "must be lowercase", + diagnostics, + ) + self.assertNotIn( + f"security-report.example.json: inventories.{name}[0].kind " + "must be a string", + diagnostics, + ) + def test_attachment_inventory_sha256_must_be_lowercase_hex_digest(self) -> None: for value in ("abc", "g" * 64, "A" * 64, 64, None): with self.subTest(value=value): From da4106edbf0b51419f73e0c3035d74a7e04b46d6 Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Thu, 18 Jun 2026 00:53:08 +0530 Subject: [PATCH 42/51] Split security report code diagnostics Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 7 ++++- schemas/test_security_report_validation.py | 30 ++++++++++++++++++---- 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index 79365e1..49f06f9 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -541,9 +541,14 @@ def diagnose_finding_codes(findings, ctx, diagnostics): for index, finding in enumerate(findings): if not isinstance(finding, dict): continue + if "code" not in finding: + diagnostics.append(f"{ctx}: {finding_ctx(finding, index)} code is required") + continue code = finding.get("code") if not isinstance(code, str): - diagnostics.append(f"{ctx}: {finding_ctx(finding, index)} code is required") + diagnostics.append( + f"{ctx}: {finding_ctx(finding, index)}.code must be a string" + ) continue if code not in SECURITY_WARNING_CODES: diagnostics.append( diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index 9ab4a21..d3f6e49 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -338,18 +338,38 @@ def test_finding_codes_must_be_security_report_codes(self) -> None: ) def test_finding_codes_are_required(self) -> None: + report = copy.deepcopy(self.report) + report["findings"][0].pop("code") + + diagnostics = diagnose_security_report_example(self.document, report) + + self.assertIn( + "security-report.example.json: finding f0001 code is required", + diagnostics, + ) + + def test_finding_codes_must_be_strings(self) -> None: for value in (None, 7, []): with self.subTest(value=value): report = copy.deepcopy(self.report) - if value is None: - report["findings"][0].pop("code") - else: - report["findings"][0]["code"] = value + report["findings"].append( + { + "id": "f0004", + "code": value, + "message": "unknown", + "page": "p0001", + "excluded_from_default_chunks": False, + } + ) diagnostics = diagnose_security_report_example(self.document, report) self.assertIn( - "security-report.example.json: finding f0001 code is required", + "security-report.example.json: finding f0004.code must be a string", + diagnostics, + ) + self.assertNotIn( + "security-report.example.json: finding f0004 code is required", diagnostics, ) From 0b4a3478d3175fa66a1c6251021c8aaf91691e40 Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Thu, 18 Jun 2026 00:59:41 +0530 Subject: [PATCH 43/51] Require security warning codes Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 6 +++ schemas/test_security_report_validation.py | 44 ++++++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index 49f06f9..ba49895 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -293,6 +293,9 @@ def diagnose_warning_lanes(security_warnings, parser_warnings, ctx, diagnostics) if not isinstance(warning, dict): continue if "code" not in warning: + diagnostics.append( + f"{ctx}: parser warning {warning_id(warning)} code is required" + ) continue code = warning.get("code") if not isinstance(code, str): @@ -310,6 +313,9 @@ def diagnose_warning_lanes(security_warnings, parser_warnings, ctx, diagnostics) if not isinstance(warning, dict): continue if "code" not in warning: + diagnostics.append( + f"{ctx}: security warning {warning_id(warning)} code is required" + ) continue code = warning.get("code") if not isinstance(code, str): diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index d3f6e49..5291d99 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -1602,6 +1602,28 @@ def test_parser_warning_codes_must_be_strings(self) -> None: diagnostics, ) + def test_parser_warning_codes_are_required(self) -> None: + document = copy.deepcopy(self.document) + document["payload"]["parser_warnings"].append( + { + "id": "w0099", + "message": "parser warning code missing", + "page": "p0001", + } + ) + + diagnostics = diagnose_security_report_example(document, self.report) + + self.assertIn( + "security-report.example.json: parser warning w0099 code is required", + diagnostics, + ) + self.assertNotIn( + "security-report.example.json: parser warning w0099 " + "must be in security_warnings", + diagnostics, + ) + def test_parser_codes_in_security_warnings_fail_closed(self) -> None: document = copy.deepcopy(self.document) document["payload"]["security_warnings"].append( @@ -1643,6 +1665,28 @@ def test_security_warning_codes_must_be_strings(self) -> None: diagnostics, ) + def test_security_warning_codes_are_required(self) -> None: + document = copy.deepcopy(self.document) + document["payload"]["security_warnings"].append( + { + "id": "w0099", + "message": "security warning code missing", + "page": "p0001", + } + ) + + diagnostics = diagnose_security_report_example(document, self.report) + + self.assertIn( + "security-report.example.json: security warning w0099 code is required", + diagnostics, + ) + self.assertNotIn( + "security-report.example.json: security warning w0099 " + "is not a security warning code", + diagnostics, + ) + if __name__ == "__main__": unittest.main() From a619939177995e44317d107da2d52baf685988e8 Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Thu, 18 Jun 2026 01:04:42 +0530 Subject: [PATCH 44/51] Validate security warning locator shapes Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 13 +++++++ schemas/test_security_report_validation.py | 42 ++++++++++++++++++++++ 2 files changed, 55 insertions(+) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index ba49895..1461a82 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -137,6 +137,7 @@ def diagnose_security_report_example( diagnose_report_identity_scalar_fields(report, ctx, diagnostics) diagnose_report_identity(document, report, ctx, diagnostics) diagnose_warning_lanes(security_warnings, parser_warnings, ctx, diagnostics) + diagnose_security_warning_locator_shapes(security_warnings, ctx, diagnostics) diagnose_security_warning_messages(security_warnings, ctx, diagnostics) findings = report.get("findings") if isinstance(report, dict) else [] @@ -348,6 +349,18 @@ def diagnose_security_warning_messages(security_warnings, ctx, diagnostics): ) +def diagnose_security_warning_locator_shapes(security_warnings, ctx, diagnostics): + for warning in security_warnings: + if not isinstance(warning, dict): + continue + item_ctx = f"security warning {warning_id(warning)}" + if "page" in warning: + check_page_shape(warning.get("page"), ctx, item_ctx, diagnostics) + for key in ("element_ref", "span_ref"): + if key in warning: + check_locator_shape(warning.get(key), key, ctx, item_ctx, diagnostics) + + def projected_warning_finding(warning): code = warning.get("code") projected = { diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index 5291d99..b170fca 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -1687,6 +1687,48 @@ def test_security_warning_codes_are_required(self) -> None: diagnostics, ) + def test_security_warning_locator_fields_must_match_schema_patterns(self) -> None: + cases = ( + ( + "page", + "page-1", + "security-report.example.json: security warning w0099.page " + "must match pattern ^p[0-9]{4}$", + ), + ( + "element_ref", + [], + "security-report.example.json: security warning w0099.element_ref " + "must match pattern ^e[0-9]{6}$", + ), + ( + "span_ref", + None, + "security-report.example.json: security warning w0099.span_ref " + "must match pattern ^s[0-9]{6}$", + ), + ) + for field, value, expected_diagnostic in cases: + with self.subTest(field=field): + document = copy.deepcopy(self.document) + warning = { + "id": "w0099", + "code": "image_only_page", + "message": "image-only page", + "page": "p0001", + } + warning[field] = value + document["payload"]["security_warnings"].append(warning) + + diagnostics = diagnose_security_report_example(document, self.report) + + self.assertIn(expected_diagnostic, diagnostics) + self.assertNotIn( + f"security-report.example.json: security warning w0099 " + f"references unknown {field}", + diagnostics, + ) + if __name__ == "__main__": unittest.main() From 78f3dda40bed15cd0df0e1f44683023e88c788c6 Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Thu, 18 Jun 2026 01:08:40 +0530 Subject: [PATCH 45/51] Split security warning message diagnostics Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 10 +++++ schemas/test_security_report_validation.py | 46 ++++++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index 1461a82..7c8f77d 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -341,7 +341,17 @@ def diagnose_security_warning_messages(security_warnings, ctx, diagnostics): expected_message = FINDING_MESSAGE_TEMPLATES.get(code) if expected_message is None: continue + if "message" not in warning: + diagnostics.append( + f"{ctx}: security warning {warning_id(warning)}.message is required" + ) + continue actual_message = warning.get("message") + if not isinstance(actual_message, str): + diagnostics.append( + f"{ctx}: security warning {warning_id(warning)}.message must be a string" + ) + continue if actual_message != expected_message: diagnostics.append( f"{ctx}: security warning {warning_id(warning)} " diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index b170fca..1a6fe98 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -487,6 +487,52 @@ def test_security_warning_message_must_match_fixed_template(self) -> None: diagnostics, ) + def test_security_warning_messages_are_required(self) -> None: + document = copy.deepcopy(self.document) + document["payload"]["security_warnings"].append( + { + "id": "w0099", + "code": "image_only_page", + "page": "p0001", + } + ) + + diagnostics = diagnose_security_report_example(document, self.report) + + self.assertIn( + "security-report.example.json: security warning w0099.message is required", + diagnostics, + ) + self.assertNotIn( + "security-report.example.json: security warning w0099 message " + "must match fixed template for image_only_page", + diagnostics, + ) + + def test_security_warning_messages_must_be_strings(self) -> None: + document = copy.deepcopy(self.document) + document["payload"]["security_warnings"].append( + { + "id": "w0099", + "code": "image_only_page", + "message": [], + "page": "p0001", + } + ) + + diagnostics = diagnose_security_report_example(document, self.report) + + self.assertIn( + "security-report.example.json: security warning w0099.message " + "must be a string", + diagnostics, + ) + self.assertNotIn( + "security-report.example.json: security warning w0099 message " + "must match fixed template for image_only_page", + diagnostics, + ) + def test_text_exclusion_finding_messages_must_match_fixed_templates(self) -> None: for code, changed_message in ( ("off_page_text_detected", "off-page text changed"), From 75376348ca77a12875ad2842be4021c84b04826a Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Thu, 18 Jun 2026 01:12:18 +0530 Subject: [PATCH 46/51] Validate document warning ids Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 28 +++++++++++++++++ schemas/test_security_report_validation.py | 35 ++++++++++++++++++++++ 2 files changed, 63 insertions(+) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index 7c8f77d..046bc07 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -136,6 +136,7 @@ def diagnose_security_report_example( diagnose_report_required_fields(report, ctx, diagnostics) diagnose_report_identity_scalar_fields(report, ctx, diagnostics) diagnose_report_identity(document, report, ctx, diagnostics) + diagnose_warning_ids(security_warnings, parser_warnings, ctx, diagnostics) diagnose_warning_lanes(security_warnings, parser_warnings, ctx, diagnostics) diagnose_security_warning_locator_shapes(security_warnings, ctx, diagnostics) diagnose_security_warning_messages(security_warnings, ctx, diagnostics) @@ -289,6 +290,24 @@ def warning_items(value): return [] +def diagnose_warning_ids(security_warnings, parser_warnings, ctx, diagnostics): + lanes = ( + ("security_warnings", security_warnings), + ("parser_warnings", parser_warnings), + ) + for lane, warnings in lanes: + for index, warning in enumerate(warnings): + if not isinstance(warning, dict): + continue + if "id" not in warning: + diagnostics.append(f"{ctx}: {lane}[{index}].id is required") + continue + if not is_warning_id(warning.get("id")): + diagnostics.append( + f"{ctx}: {lane}[{index}].id must match pattern ^w[0-9]{{4}}$" + ) + + def diagnose_warning_lanes(security_warnings, parser_warnings, ctx, diagnostics): for warning in parser_warnings: if not isinstance(warning, dict): @@ -1082,6 +1101,15 @@ def is_finding_id(value): ) +def is_warning_id(value): + return ( + isinstance(value, str) + and len(value) == 5 + and value.startswith("w") + and is_ascii_digits(value[1:]) + ) + + def is_ascii_digits(value): return ( isinstance(value, str) diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index 1a6fe98..1f6346f 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -1625,6 +1625,41 @@ def test_security_codes_in_parser_warnings_fail_closed(self) -> None: diagnostics, ) + def test_warning_ids_are_required(self) -> None: + cases = ( + ("security_warnings", "security_warnings[0].id is required"), + ("parser_warnings", "parser_warnings[0].id is required"), + ) + for lane, expected_suffix in cases: + with self.subTest(lane=lane): + document = copy.deepcopy(self.document) + document["payload"][lane][0].pop("id") + + diagnostics = diagnose_security_report_example(document, self.report) + + self.assertIn( + f"security-report.example.json: {expected_suffix}", + diagnostics, + ) + + def test_warning_ids_must_match_schema_pattern(self) -> None: + cases = ( + ("security_warnings", "warning-1"), + ("parser_warnings", []), + ) + for lane, value in cases: + with self.subTest(lane=lane, value=value): + document = copy.deepcopy(self.document) + document["payload"][lane][0]["id"] = value + + diagnostics = diagnose_security_report_example(document, self.report) + + self.assertIn( + f"security-report.example.json: {lane}[0].id must match " + "pattern ^w[0-9]{4}$", + diagnostics, + ) + def test_parser_warning_codes_must_be_strings(self) -> None: document = copy.deepcopy(self.document) document["payload"]["parser_warnings"].append( From 79964a44e12a69a48a77900a3a7f83b2a0bda041 Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Thu, 18 Jun 2026 01:17:48 +0530 Subject: [PATCH 47/51] Validate parser warning messages Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 17 ++++++++++++++ schemas/test_security_report_validation.py | 26 ++++++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index 046bc07..6e4b501 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -138,6 +138,7 @@ def diagnose_security_report_example( diagnose_report_identity(document, report, ctx, diagnostics) diagnose_warning_ids(security_warnings, parser_warnings, ctx, diagnostics) diagnose_warning_lanes(security_warnings, parser_warnings, ctx, diagnostics) + diagnose_parser_warning_messages(parser_warnings, ctx, diagnostics) diagnose_security_warning_locator_shapes(security_warnings, ctx, diagnostics) diagnose_security_warning_messages(security_warnings, ctx, diagnostics) @@ -350,6 +351,22 @@ def diagnose_warning_lanes(security_warnings, parser_warnings, ctx, diagnostics) ) +def diagnose_parser_warning_messages(parser_warnings, ctx, diagnostics): + for warning in parser_warnings: + if not isinstance(warning, dict): + continue + if "message" not in warning: + diagnostics.append( + f"{ctx}: parser warning {warning_id(warning)}.message is required" + ) + continue + actual_message = warning.get("message") + if not isinstance(actual_message, str): + diagnostics.append( + f"{ctx}: parser warning {warning_id(warning)}.message must be a string" + ) + + def diagnose_security_warning_messages(security_warnings, ctx, diagnostics): for warning in security_warnings: if not isinstance(warning, dict): diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index 1f6346f..ab9ead3 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -1705,6 +1705,32 @@ def test_parser_warning_codes_are_required(self) -> None: diagnostics, ) + def test_parser_warning_messages_are_required(self) -> None: + document = copy.deepcopy(self.document) + parser_warning_id = document["payload"]["parser_warnings"][0]["id"] + document["payload"]["parser_warnings"][0].pop("message") + + diagnostics = diagnose_security_report_example(document, self.report) + + self.assertIn( + "security-report.example.json: " + f"parser warning {parser_warning_id}.message is required", + diagnostics, + ) + + def test_parser_warning_messages_must_be_strings(self) -> None: + document = copy.deepcopy(self.document) + parser_warning_id = document["payload"]["parser_warnings"][0]["id"] + document["payload"]["parser_warnings"][0]["message"] = [] + + diagnostics = diagnose_security_report_example(document, self.report) + + self.assertIn( + "security-report.example.json: " + f"parser warning {parser_warning_id}.message must be a string", + diagnostics, + ) + def test_parser_codes_in_security_warnings_fail_closed(self) -> None: document = copy.deepcopy(self.document) document["payload"]["security_warnings"].append( From 52613f367c09c432e8a0b4f092c2bc6dd3a5c0ae Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Thu, 18 Jun 2026 08:43:05 +0530 Subject: [PATCH 48/51] Reject duplicate warning ids Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 8 +++++++- schemas/test_security_report_validation.py | 22 ++++++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index 6e4b501..b792b6b 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -296,6 +296,7 @@ def diagnose_warning_ids(security_warnings, parser_warnings, ctx, diagnostics): ("security_warnings", security_warnings), ("parser_warnings", parser_warnings), ) + seen = set() for lane, warnings in lanes: for index, warning in enumerate(warnings): if not isinstance(warning, dict): @@ -303,10 +304,15 @@ def diagnose_warning_ids(security_warnings, parser_warnings, ctx, diagnostics): if "id" not in warning: diagnostics.append(f"{ctx}: {lane}[{index}].id is required") continue - if not is_warning_id(warning.get("id")): + warning_id_value = warning.get("id") + if not is_warning_id(warning_id_value): diagnostics.append( f"{ctx}: {lane}[{index}].id must match pattern ^w[0-9]{{4}}$" ) + continue + if warning_id_value in seen: + diagnostics.append(f"{ctx}: duplicate warning id {warning_id_value}") + seen.add(warning_id_value) def diagnose_warning_lanes(security_warnings, parser_warnings, ctx, diagnostics): diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index ab9ead3..2e524fa 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -1660,6 +1660,28 @@ def test_warning_ids_must_match_schema_pattern(self) -> None: diagnostics, ) + def test_warning_ids_must_be_unique_across_warning_lanes(self) -> None: + cases = ( + ("security_warnings", "security_warnings", "w0001"), + ("parser_warnings", "parser_warnings", "w0002"), + ("security_warnings", "parser_warnings", "w0001"), + ) + for source_lane, target_lane, duplicate_id in cases: + with self.subTest(source_lane=source_lane, target_lane=target_lane): + document = copy.deepcopy(self.document) + document["payload"][target_lane][0]["id"] = duplicate_id + if source_lane == target_lane: + document["payload"][target_lane].append( + copy.deepcopy(document["payload"][target_lane][0]) + ) + + diagnostics = diagnose_security_report_example(document, self.report) + + self.assertIn( + f"security-report.example.json: duplicate warning id {duplicate_id}", + diagnostics, + ) + def test_parser_warning_codes_must_be_strings(self) -> None: document = copy.deepcopy(self.document) document["payload"]["parser_warnings"].append( From 91922a60595913694920d35b0034d7bc616da605 Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Thu, 18 Jun 2026 08:46:40 +0530 Subject: [PATCH 49/51] Enforce deterministic warning numbering Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 30 ++++++++++++++++++++++ schemas/test_security_report_validation.py | 16 ++++++++++++ 2 files changed, 46 insertions(+) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index b792b6b..dc7376a 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -297,6 +297,7 @@ def diagnose_warning_ids(security_warnings, parser_warnings, ctx, diagnostics): ("parser_warnings", parser_warnings), ) seen = set() + numbered = [] for lane, warnings in lanes: for index, warning in enumerate(warnings): if not isinstance(warning, dict): @@ -313,6 +314,35 @@ def diagnose_warning_ids(security_warnings, parser_warnings, ctx, diagnostics): if warning_id_value in seen: diagnostics.append(f"{ctx}: duplicate warning id {warning_id_value}") seen.add(warning_id_value) + sort_key = warning_sort_key(warning) + if sort_key is not None: + numbered.append((sort_key, lane, index, warning_id_value)) + + for ordinal, (_, lane, index, actual_id) in enumerate(sorted(numbered), start=1): + expected_id = f"w{ordinal:04d}" + if actual_id != expected_id: + diagnostics.append( + f"{ctx}: {lane}[{index}].id must be {expected_id} " + "for deterministic numbering" + ) + + +def warning_sort_key(warning): + code = warning.get("code") + message = warning.get("message") + if not isinstance(code, str) or not isinstance(message, str): + return None + key = [code] + for field in ("page", "element_ref", "span_ref", "region_ref"): + if field not in warning: + key.append((0, "")) + continue + value = warning.get(field) + if not isinstance(value, str): + return None + key.append((1, value)) + key.append(message) + return tuple(key) def diagnose_warning_lanes(security_warnings, parser_warnings, ctx, diagnostics): diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index 2e524fa..7a90efd 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -1682,6 +1682,22 @@ def test_warning_ids_must_be_unique_across_warning_lanes(self) -> None: diagnostics, ) + def test_warning_ids_follow_deterministic_numbering(self) -> None: + document = copy.deepcopy(self.document) + document["payload"]["parser_warnings"][0]["id"] = "w0004" + + diagnostics = diagnose_security_report_example(document, self.report) + + self.assertIn( + "security-report.example.json: parser_warnings[0].id must be w0002 " + "for deterministic numbering", + diagnostics, + ) + self.assertNotIn( + "security-report.example.json: duplicate warning id w0004", + diagnostics, + ) + def test_parser_warning_codes_must_be_strings(self) -> None: document = copy.deepcopy(self.document) document["payload"]["parser_warnings"].append( From 4f2bf229711bcd2d5ced90de479aee9e42ce38cb Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Thu, 18 Jun 2026 09:04:53 +0530 Subject: [PATCH 50/51] Add Milestone C internal checks Signed-off-by: docushell-admin --- .../test_milestone_c_internal_checks.py | 83 +++++++++++++++++++ Makefile | 8 +- 2 files changed, 90 insertions(+), 1 deletion(-) create mode 100644 .github/scripts/test_milestone_c_internal_checks.py diff --git a/.github/scripts/test_milestone_c_internal_checks.py b/.github/scripts/test_milestone_c_internal_checks.py new file mode 100644 index 0000000..b1d9ebd --- /dev/null +++ b/.github/scripts/test_milestone_c_internal_checks.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 +# +# Copyright 2026 The Ethos maintainers +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import annotations + +import unittest +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[2] +MAKEFILE = ROOT / "Makefile" + + +def makefile_text() -> str: + return MAKEFILE.read_text(encoding="utf-8") + + +def target_block(target: str) -> str: + lines = makefile_text().splitlines() + start = None + for index, line in enumerate(lines): + if line == f"{target}:": + start = index + 1 + break + if start is None: + raise AssertionError(f"{target} target is missing") + + block: list[str] = [] + for line in lines[start:]: + if line and not line.startswith(("\t", " ")): + break + block.append(line) + return "\n".join(block) + + +class MilestoneCInternalCheckTests(unittest.TestCase): + def test_target_is_declared_phony(self) -> None: + text = makefile_text() + + self.assertIn(".PHONY:", text) + self.assertIn("milestone-c-internal-checks", text) + + def test_target_composes_current_artifact_gates(self) -> None: + block = target_block("milestone-c-internal-checks") + + required = [ + "$(MAKE) rag-chunk-alpha PYTHON=$(PYTHON)", + "$(MAKE) security-report-alpha PYTHON=$(PYTHON)", + "$(PYTHON) .github/scripts/test_milestone_c_internal_checks.py", + "git diff --check", + ] + for command in required: + self.assertIn(command, block) + + def test_target_stays_current_artifact_scoped(self) -> None: + block = target_block("milestone-c-internal-checks") + + self.assertNotIn("verify-alpha", block) + self.assertNotIn("layout-evaluator-alpha", block) + self.assertNotIn("python-surface-test", block) + self.assertNotIn("verify-rendered-crops", block) + self.assertNotIn("compare-rendered-crops", block) + self.assertNotIn("release-", block) + self.assertNotIn("third-party-license-manifest", block) + self.assertNotIn("release-notice-draft", block) + + +if __name__ == "__main__": + unittest.main() diff --git a/Makefile b/Makefile index d3536c9..b5766ed 100644 --- a/Makefile +++ b/Makefile @@ -13,7 +13,7 @@ COMPARE_RENDERED_CROPS_LEFT ?= $(VERIFY_RENDERED_CROPS_OUT)/run1 COMPARE_RENDERED_CROPS_RIGHT ?= $(VERIFY_RENDERED_CROPS_OUT)/run2 LAYOUT_EVALUATOR_OUT ?= $(ROOT)/target/layout-evaluator-alpha -.PHONY: verify-alpha verify-alpha-tree rag-chunk-alpha security-report-alpha verify-rendered-crops compare-rendered-crops layout-evaluator-alpha python-surface-test milestone-b-internal-checks release-hygiene release-advisory third-party-license-manifest release-notice-draft +.PHONY: verify-alpha verify-alpha-tree rag-chunk-alpha security-report-alpha verify-rendered-crops compare-rendered-crops layout-evaluator-alpha python-surface-test milestone-b-internal-checks milestone-c-internal-checks release-hygiene release-advisory third-party-license-manifest release-notice-draft $(ETHOS_BIN): cargo build --locked -p ethos-cli @@ -77,6 +77,12 @@ milestone-b-internal-checks: $(PYTHON) .github/scripts/readiness_gate.py public git diff --check +milestone-c-internal-checks: + $(MAKE) rag-chunk-alpha PYTHON=$(PYTHON) + $(MAKE) security-report-alpha PYTHON=$(PYTHON) + $(PYTHON) .github/scripts/test_milestone_c_internal_checks.py + git diff --check + release-hygiene: cargo metadata --locked --offline --format-version 1 --no-deps >/dev/null $(CARGO_DENY) --version From a0fe7a2f9bf7b983d93be76654734dd9d48eefee Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Thu, 18 Jun 2026 09:07:48 +0530 Subject: [PATCH 51/51] Ground security warning locators Signed-off-by: docushell-admin --- schemas/security_report_validation.py | 35 ++++++++++++++---- schemas/test_security_report_validation.py | 42 ++++++++++++++++++++++ 2 files changed, 70 insertions(+), 7 deletions(-) diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index dc7376a..fcfd837 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -139,7 +139,7 @@ def diagnose_security_report_example( diagnose_warning_ids(security_warnings, parser_warnings, ctx, diagnostics) diagnose_warning_lanes(security_warnings, parser_warnings, ctx, diagnostics) diagnose_parser_warning_messages(parser_warnings, ctx, diagnostics) - diagnose_security_warning_locator_shapes(security_warnings, ctx, diagnostics) + diagnose_security_warning_locator_shapes(security_warnings, refs, ctx, diagnostics) diagnose_security_warning_messages(security_warnings, ctx, diagnostics) findings = report.get("findings") if isinstance(report, dict) else [] @@ -431,16 +431,24 @@ def diagnose_security_warning_messages(security_warnings, ctx, diagnostics): ) -def diagnose_security_warning_locator_shapes(security_warnings, ctx, diagnostics): +def diagnose_security_warning_locator_shapes(security_warnings, refs, ctx, diagnostics): for warning in security_warnings: if not isinstance(warning, dict): continue item_ctx = f"security warning {warning_id(warning)}" + page_shape_valid = True if "page" in warning: - check_page_shape(warning.get("page"), ctx, item_ctx, diagnostics) - for key in ("element_ref", "span_ref"): - if key in warning: - check_locator_shape(warning.get(key), key, ctx, item_ctx, diagnostics) + page_shape_valid = check_page_shape( + warning.get("page"), ctx, item_ctx, diagnostics + ) + if page_shape_valid: + check_page_ref(warning.get("page"), refs, ctx, item_ctx, diagnostics) + for key, ref_kind in ( + ("element_ref", "elements"), + ("span_ref", "spans"), + ("region_ref", "regions"), + ): + check_locator_ref(warning, key, ref_kind, refs, ctx, item_ctx, diagnostics) def projected_warning_finding(warning): @@ -885,11 +893,12 @@ def diagnose_inventory_scalar_fields(inventory_lists, ctx, diagnostics): def document_reference_index(payload): if not isinstance(payload, dict): - return {"pages": {}, "elements": {}, "spans": {}} + return {"pages": {}, "elements": {}, "spans": {}, "regions": {}} return { "pages": keyed_objects(payload.get("pages", [])), "elements": keyed_objects(payload.get("elements", [])), "spans": keyed_objects(payload.get("spans", [])), + "regions": keyed_objects(payload.get("regions", [])), } @@ -989,6 +998,9 @@ def check_locator_shape(ref, key, ctx, item_ctx, diagnostics): elif key == "span_ref": pattern = "^s[0-9]{6}$" valid = is_span_ref(ref) + elif key == "region_ref": + pattern = "^r[0-9]{4}$" + valid = is_region_ref(ref) else: return True if not valid: @@ -1145,6 +1157,15 @@ def is_span_ref(value): ) +def is_region_ref(value): + return ( + isinstance(value, str) + and len(value) == 5 + and value.startswith("r") + and is_ascii_digits(value[1:]) + ) + + def is_finding_id(value): return ( isinstance(value, str) diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index 7a90efd..bd3ac10 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -1852,6 +1852,12 @@ def test_security_warning_locator_fields_must_match_schema_patterns(self) -> Non "security-report.example.json: security warning w0099.span_ref " "must match pattern ^s[0-9]{6}$", ), + ( + "region_ref", + [], + "security-report.example.json: security warning w0099.region_ref " + "must match pattern ^r[0-9]{4}$", + ), ) for field, value, expected_diagnostic in cases: with self.subTest(field=field): @@ -1874,6 +1880,42 @@ def test_security_warning_locator_fields_must_match_schema_patterns(self) -> Non diagnostics, ) + def test_security_warning_locator_refs_must_exist_in_document(self) -> None: + cases = ( + ( + "page", + "p9999", + "security-report.example.json: security warning w0001 " + "references unknown page p9999", + ), + ( + "element_ref", + "e999999", + "security-report.example.json: security warning w0001 " + "references unknown element_ref e999999", + ), + ( + "span_ref", + "s999999", + "security-report.example.json: security warning w0001 " + "references unknown span_ref s999999", + ), + ( + "region_ref", + "r9999", + "security-report.example.json: security warning w0001 " + "references unknown region_ref r9999", + ), + ) + for field, value, expected_diagnostic in cases: + with self.subTest(field=field): + document = copy.deepcopy(self.document) + document["payload"]["security_warnings"][0][field] = value + + diagnostics = diagnose_security_report_example(document, self.report) + + self.assertIn(expected_diagnostic, diagnostics) + if __name__ == "__main__": unittest.main()