From 9ccae8c32a47400ef1eda250bdc198d60facf609 Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 17 Jun 2026 10:30:57 +0530 Subject: [PATCH] Harden rag chunk fixture golden path Signed-off-by: docushell-admin --- .github/scripts/test_ci_workflow.py | 1 + .github/scripts/test_rag_chunk_alpha.py | 78 +++++++++++++++++ .github/workflows/ci.yml | 2 + Makefile | 8 +- crates/ethos-cli/src/cmd/rag.rs | 109 +++++++++++++++++++----- crates/ethos-cli/tests/rag.rs | 71 ++++++++++++++- 6 files changed, 244 insertions(+), 25 deletions(-) create mode 100644 .github/scripts/test_rag_chunk_alpha.py diff --git a/.github/scripts/test_ci_workflow.py b/.github/scripts/test_ci_workflow.py index 46de6f0..ad37f62 100644 --- a/.github/scripts/test_ci_workflow.py +++ b/.github/scripts/test_ci_workflow.py @@ -48,6 +48,7 @@ def test_ci_workflow_guard_is_run_by_ci(self) -> None: self.assertIn("python3 .github/scripts/test_ci_workflow.py", text) self.assertIn("python3 .github/scripts/test_milestone_b_internal_checks.py", text) + self.assertIn("python3 .github/scripts/test_rag_chunk_alpha.py", text) self.assertIn("python3 .github/scripts/test_execution_status.py", text) self.assertIn("python3 .github/scripts/test_roadmap_status.py", text) self.assertIn("python3 .github/scripts/test_milestone_b_closeout_record.py", text) diff --git a/.github/scripts/test_rag_chunk_alpha.py b/.github/scripts/test_rag_chunk_alpha.py new file mode 100644 index 0000000..68d47af --- /dev/null +++ b/.github/scripts/test_rag_chunk_alpha.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python3 +# +# Copyright 2026 The Ethos maintainers +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import annotations + +import unittest +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[2] +MAKEFILE = ROOT / "Makefile" + + +def makefile_text() -> str: + return MAKEFILE.read_text(encoding="utf-8") + + +def target_block(target: str) -> str: + lines = makefile_text().splitlines() + start = None + for index, line in enumerate(lines): + if line == f"{target}:": + start = index + 1 + break + if start is None: + raise AssertionError(f"{target} target is missing") + + block: list[str] = [] + for line in lines[start:]: + if line and not line.startswith(("\t", " ")): + break + block.append(line) + return "\n".join(block) + + +class RagChunkAlphaTests(unittest.TestCase): + def test_target_is_declared_phony(self) -> None: + text = makefile_text() + + self.assertIn(".PHONY:", text) + self.assertIn("rag-chunk-alpha", text) + + def test_target_composes_rag_artifact_gates(self) -> None: + block = target_block("rag-chunk-alpha") + + required = [ + "cargo test --locked -p ethos-cli --test rag", + "$(PYTHON) schemas/validate_examples.py", + "$(PYTHON) .github/scripts/test_rag_chunk_alpha.py", + "git diff --check", + ] + for command in required: + self.assertIn(command, block) + + def test_target_stays_rag_scoped(self) -> None: + block = target_block("rag-chunk-alpha") + + self.assertNotIn("verify-alpha", block) + self.assertNotIn("layout-evaluator-alpha", block) + self.assertNotIn("python-surface-test", block) + + +if __name__ == "__main__": + unittest.main() diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 54a883c..49dc4d4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -58,6 +58,8 @@ jobs: run: python3 .github/scripts/test_ci_workflow.py - name: Milestone B internal check target tests run: python3 .github/scripts/test_milestone_b_internal_checks.py + - name: RAG chunk alpha target tests + run: python3 .github/scripts/test_rag_chunk_alpha.py - name: execution status tests run: python3 .github/scripts/test_execution_status.py - name: roadmap status tests diff --git a/Makefile b/Makefile index 3788e4d..735cafa 100644 --- a/Makefile +++ b/Makefile @@ -13,7 +13,7 @@ COMPARE_RENDERED_CROPS_LEFT ?= $(VERIFY_RENDERED_CROPS_OUT)/run1 COMPARE_RENDERED_CROPS_RIGHT ?= $(VERIFY_RENDERED_CROPS_OUT)/run2 LAYOUT_EVALUATOR_OUT ?= $(ROOT)/target/layout-evaluator-alpha -.PHONY: verify-alpha verify-alpha-tree verify-rendered-crops compare-rendered-crops layout-evaluator-alpha python-surface-test milestone-b-internal-checks release-hygiene release-advisory third-party-license-manifest release-notice-draft +.PHONY: verify-alpha verify-alpha-tree rag-chunk-alpha verify-rendered-crops compare-rendered-crops layout-evaluator-alpha python-surface-test milestone-b-internal-checks release-hygiene release-advisory third-party-license-manifest release-notice-draft $(ETHOS_BIN): cargo build --locked -p ethos-cli @@ -34,6 +34,12 @@ verify-alpha: $(ETHOS_BIN) $(PYTHON) examples/verify/check_verify_alpha.py --repo-root $(ROOT) --ethos-bin $(ETHOS_BIN) --out-dir $(VERIFY_ALPHA_OUT) git diff --check +rag-chunk-alpha: + cargo test --locked -p ethos-cli --test rag + $(PYTHON) schemas/validate_examples.py + $(PYTHON) .github/scripts/test_rag_chunk_alpha.py + git diff --check + verify-rendered-crops: $(ETHOS_BIN) $(PYTHON) examples/verify/check_rendered_crops.py --repo-root $(ROOT) --ethos-bin $(ETHOS_BIN) --out-dir $(VERIFY_RENDERED_CROPS_OUT) git diff --check diff --git a/crates/ethos-cli/src/cmd/rag.rs b/crates/ethos-cli/src/cmd/rag.rs index 1249a85..dbecb09 100644 --- a/crates/ethos-cli/src/cmd/rag.rs +++ b/crates/ethos-cli/src/cmd/rag.rs @@ -14,6 +14,8 @@ * limitations under the License. */ +use std::collections::{BTreeMap, BTreeSet}; + use ethos_core::error::EthosError; use ethos_core::model::{Chunk, Document}; @@ -26,9 +28,11 @@ pub(crate) fn rag_chunk(args: RagChunkArgs) -> Result<(), Failure> { } fn rag_chunk_output_bytes(doc: &Document) -> Result, Failure> { + let refs = RagChunkRefs::new(doc); let mut out = Vec::with_capacity(4096); for chunk in &doc.payload.chunks { - let line = ethos_core::c14n::c14n_bytes(&rag_chunk_record(doc, chunk)?) + validate_chunk_refs(chunk, &refs)?; + let line = ethos_core::c14n::c14n_bytes(&rag_chunk_record(chunk, &refs)?) .map_err(|e| EthosError::internal(e.message))?; out.extend_from_slice(&line); out.push(b'\n'); @@ -36,32 +40,99 @@ fn rag_chunk_output_bytes(doc: &Document) -> Result, Failure> { Ok(out) } -fn rag_chunk_record(doc: &Document, chunk: &Chunk) -> Result { - let warning_code = |id: &str| -> Option<&'static str> { - doc.payload - .security_warnings - .iter() - .chain(doc.payload.parser_warnings.iter()) - .find(|w| w.id == id) - .map(|w| w.code.as_str()) - }; +struct RagChunkRefs<'a> { + page_ids: BTreeSet<&'a str>, + element_ids: BTreeSet<&'a str>, + warning_codes: BTreeMap<&'a str, &'a str>, + schema_version: &'a str, + document_fingerprint: &'a str, + source_fingerprint: &'a str, + config_sha256: &'a str, +} + +impl<'a> RagChunkRefs<'a> { + fn new(doc: &'a Document) -> Self { + Self { + page_ids: doc + .payload + .pages + .iter() + .map(|page| page.id.as_str()) + .collect(), + element_ids: doc + .payload + .elements + .iter() + .map(|element| element.id.as_str()) + .collect(), + warning_codes: doc + .payload + .security_warnings + .iter() + .chain(doc.payload.parser_warnings.iter()) + .map(|warning| (warning.id.as_str(), warning.code.as_str())) + .collect(), + schema_version: doc.schema_version.as_str(), + document_fingerprint: doc.fingerprint.as_str(), + source_fingerprint: doc.source.fingerprint.as_str(), + config_sha256: doc.config_sha256.as_str(), + } + } +} + +fn validate_chunk_refs(chunk: &Chunk, refs: &RagChunkRefs<'_>) -> Result<(), Failure> { + for id in &chunk.element_refs { + if !refs.element_ids.contains(id.as_str()) { + return Err(Failure::Usage(format!( + "chunk {} references unknown element_ref {}", + chunk.id, id + ))); + } + } + for id in &chunk.page_refs { + if !refs.page_ids.contains(id.as_str()) { + return Err(Failure::Usage(format!( + "chunk {} references unknown page_ref {}", + chunk.id, id + ))); + } + } + for (idx, bbox) in chunk.bboxes.iter().enumerate() { + if !refs.page_ids.contains(bbox.page.as_str()) { + return Err(Failure::Usage(format!( + "chunk {} bboxes[{}] references unknown page_ref {}", + chunk.id, idx, bbox.page + ))); + } + } + for id in &chunk.warning_refs { + if !refs.warning_codes.contains_key(id.as_str()) { + return Err(Failure::Usage(format!( + "chunk {} references unknown warning_ref {}", + chunk.id, id + ))); + } + } + Ok(()) +} +fn rag_chunk_record(chunk: &Chunk, refs: &RagChunkRefs<'_>) -> Result { let mut record = serde_json::Map::new(); record.insert( "schema_version".into(), - serde_json::Value::String(doc.schema_version.clone()), + serde_json::Value::String(refs.schema_version.to_string()), ); record.insert( "document_fingerprint".into(), - serde_json::Value::String(doc.fingerprint.clone()), + serde_json::Value::String(refs.document_fingerprint.to_string()), ); record.insert( "source_fingerprint".into(), - serde_json::Value::String(doc.source.fingerprint.clone()), + serde_json::Value::String(refs.source_fingerprint.to_string()), ); record.insert( "config_sha256".into(), - serde_json::Value::String(doc.config_sha256.clone()), + serde_json::Value::String(refs.config_sha256.to_string()), ); record.insert("id".into(), serde_json::Value::String(chunk.id.clone())); record.insert("text".into(), serde_json::Value::String(chunk.text.clone())); @@ -98,12 +169,10 @@ fn rag_chunk_record(doc: &Document, chunk: &Chunk) -> Result PathBuf { path } -fn document_with_stale_chunk_warning_ref() -> PathBuf { +fn document_with_mutated_chunk(name: &str, mutate: impl FnOnce(&mut Value)) -> PathBuf { let mut doc = json_file(document_example()); - doc["payload"]["chunks"][0]["warning_refs"] = serde_json::json!(["w999999"]); + mutate(&mut doc); let mut doc: Document = serde_json::from_value(doc).expect("document parses"); doc.payload_sha256 = doc.compute_payload_sha256().expect("payload hash computes"); doc.fingerprint = doc.compute_fingerprint().expect("fingerprint computes"); temp_json( - "stale-chunk-warning-ref-document", + name, &serde_json::to_string(&doc).expect("document serializes"), ) } @@ -88,9 +88,72 @@ fn rag_chunk_matches_schema_example_jsonl() { assert_eq!(output.stdout, expected); } +#[test] +fn rag_chunk_output_is_byte_identical_across_runs() { + let first = run_ethos(&["rag", "chunk", document_example().to_str().unwrap()]); + let second = run_ethos(&["rag", "chunk", document_example().to_str().unwrap()]); + + assert!( + first.status.success(), + "first ethos rag chunk failed\nstatus: {:?}\nstderr:\n{}", + first.status.code(), + String::from_utf8_lossy(&first.stderr) + ); + assert!( + second.status.success(), + "second ethos rag chunk failed\nstatus: {:?}\nstderr:\n{}", + second.status.code(), + String::from_utf8_lossy(&second.stderr) + ); + assert_eq!(first.stderr, b""); + assert_eq!(second.stderr, b""); + assert_eq!(first.stdout, second.stdout); +} + +#[test] +fn rag_chunk_rejects_unknown_chunk_element_ref() { + let document = document_with_mutated_chunk("stale-chunk-element-ref-document", |doc| { + doc["payload"]["chunks"][0]["element_refs"][0] = serde_json::json!("e999999"); + }); + let output = run_ethos(&["rag", "chunk", document.to_str().unwrap()]); + + assert_eq!(output.status.code(), Some(2)); + assert_eq!(output.stdout, b""); + assert!(String::from_utf8_lossy(&output.stderr) + .contains("chunk c000001 references unknown element_ref e999999")); +} + +#[test] +fn rag_chunk_rejects_unknown_chunk_page_ref() { + let document = document_with_mutated_chunk("stale-chunk-page-ref-document", |doc| { + doc["payload"]["chunks"][0]["page_refs"][0] = serde_json::json!("p9999"); + }); + let output = run_ethos(&["rag", "chunk", document.to_str().unwrap()]); + + assert_eq!(output.status.code(), Some(2)); + assert_eq!(output.stdout, b""); + assert!(String::from_utf8_lossy(&output.stderr) + .contains("chunk c000001 references unknown page_ref p9999")); +} + +#[test] +fn rag_chunk_rejects_unknown_chunk_bbox_page_ref() { + let document = document_with_mutated_chunk("stale-chunk-bbox-page-ref-document", |doc| { + doc["payload"]["chunks"][0]["bboxes"][0]["page"] = serde_json::json!("p9999"); + }); + let output = run_ethos(&["rag", "chunk", document.to_str().unwrap()]); + + assert_eq!(output.status.code(), Some(2)); + assert_eq!(output.stdout, b""); + assert!(String::from_utf8_lossy(&output.stderr) + .contains("chunk c000001 bboxes[0] references unknown page_ref p9999")); +} + #[test] fn rag_chunk_rejects_unknown_chunk_warning_ref() { - let document = document_with_stale_chunk_warning_ref(); + let document = document_with_mutated_chunk("stale-chunk-warning-ref-document", |doc| { + doc["payload"]["chunks"][0]["warning_refs"] = serde_json::json!(["w999999"]); + }); let output = run_ethos(&["rag", "chunk", document.to_str().unwrap()]); assert_eq!(output.status.code(), Some(2));