From 0000d3862760613b82affcb2dfde96b8fc6ec17c Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 17 Jun 2026 10:49:20 +0530 Subject: [PATCH] Validate rag chunk provenance consistency Signed-off-by: docushell-admin --- crates/ethos-cli/src/cmd/rag.rs | 98 +++++++++++++++++--- crates/ethos-cli/tests/rag.rs | 157 ++++++++++++++++++++++++++++++++ 2 files changed, 242 insertions(+), 13 deletions(-) diff --git a/crates/ethos-cli/src/cmd/rag.rs b/crates/ethos-cli/src/cmd/rag.rs index dbecb09..14c9435 100644 --- a/crates/ethos-cli/src/cmd/rag.rs +++ b/crates/ethos-cli/src/cmd/rag.rs @@ -40,9 +40,15 @@ fn rag_chunk_output_bytes(doc: &Document) -> Result, Failure> { Ok(out) } +#[derive(Clone, Copy)] +struct PageBounds { + width: i64, + height: i64, +} + struct RagChunkRefs<'a> { - page_ids: BTreeSet<&'a str>, - element_ids: BTreeSet<&'a str>, + page_bounds: BTreeMap<&'a str, PageBounds>, + element_pages: BTreeMap<&'a str, &'a str>, warning_codes: BTreeMap<&'a str, &'a str>, schema_version: &'a str, document_fingerprint: &'a str, @@ -53,17 +59,25 @@ struct RagChunkRefs<'a> { impl<'a> RagChunkRefs<'a> { fn new(doc: &'a Document) -> Self { Self { - page_ids: doc + page_bounds: doc .payload .pages .iter() - .map(|page| page.id.as_str()) + .map(|page| { + ( + page.id.as_str(), + PageBounds { + width: page.width, + height: page.height, + }, + ) + }) .collect(), - element_ids: doc + element_pages: doc .payload .elements .iter() - .map(|element| element.id.as_str()) + .map(|element| (element.id.as_str(), element.page.as_str())) .collect(), warning_codes: doc .payload @@ -81,28 +95,86 @@ impl<'a> RagChunkRefs<'a> { } fn validate_chunk_refs(chunk: &Chunk, refs: &RagChunkRefs<'_>) -> Result<(), Failure> { - for id in &chunk.element_refs { - if !refs.element_ids.contains(id.as_str()) { + if chunk.element_refs.is_empty() { + return Err(Failure::Usage(format!( + "chunk {} must include at least one element_ref", + chunk.id + ))); + } + if chunk.page_refs.is_empty() { + return Err(Failure::Usage(format!( + "chunk {} must include at least one page_ref", + chunk.id + ))); + } + if chunk.bboxes.is_empty() { + return Err(Failure::Usage(format!( + "chunk {} must include at least one bbox", + chunk.id + ))); + } + + let mut page_refs = BTreeSet::new(); + for id in &chunk.page_refs { + if !refs.page_bounds.contains_key(id.as_str()) { return Err(Failure::Usage(format!( - "chunk {} references unknown element_ref {}", + "chunk {} references unknown page_ref {}", chunk.id, id ))); } + page_refs.insert(id.as_str()); } - for id in &chunk.page_refs { - if !refs.page_ids.contains(id.as_str()) { + + let mut backed_pages = BTreeSet::new(); + for id in &chunk.element_refs { + let Some(page) = refs.element_pages.get(id.as_str()) else { return Err(Failure::Usage(format!( - "chunk {} references unknown page_ref {}", + "chunk {} references unknown element_ref {}", chunk.id, id ))); + }; + if !page_refs.contains(page) { + return Err(Failure::Usage(format!( + "chunk {} element_ref {} page {} is not listed in page_refs", + chunk.id, id, page + ))); } + backed_pages.insert(*page); } for (idx, bbox) in chunk.bboxes.iter().enumerate() { - if !refs.page_ids.contains(bbox.page.as_str()) { + let Some(page_bounds) = refs.page_bounds.get(bbox.page.as_str()) else { return Err(Failure::Usage(format!( "chunk {} bboxes[{}] references unknown page_ref {}", chunk.id, idx, bbox.page ))); + }; + if !page_refs.contains(bbox.page.as_str()) { + return Err(Failure::Usage(format!( + "chunk {} bboxes[{}] page {} is not listed in page_refs", + chunk.id, idx, bbox.page + ))); + } + let [x0, y0, x1, y1] = bbox.bbox.to_array(); + if x0 >= x1 || y0 >= y1 { + return Err(Failure::Usage(format!( + "chunk {} bboxes[{}] has zero area", + chunk.id, idx + ))); + } + if x0 < 0 || y0 < 0 || x1 > page_bounds.width || y1 > page_bounds.height { + return Err(Failure::Usage(format!( + "chunk {} bboxes[{}] exceeds page {} bounds", + chunk.id, idx, bbox.page + ))); + } + backed_pages.insert(bbox.page.as_str()); + } + for id in page_refs { + if !backed_pages.contains(id) { + return Err(Failure::Usage(format!( + "chunk {} page_ref {} is not backed by element_refs or bboxes", + chunk.id, id + ))); } } for id in &chunk.warning_refs { diff --git a/crates/ethos-cli/tests/rag.rs b/crates/ethos-cli/tests/rag.rs index 1aff0f5..4ffd767 100644 --- a/crates/ethos-cli/tests/rag.rs +++ b/crates/ethos-cli/tests/rag.rs @@ -72,6 +72,28 @@ fn document_with_mutated_chunk(name: &str, mutate: impl FnOnce(&mut Value)) -> P ) } +fn append_next_page(doc: &mut Value) -> String { + let pages = doc["payload"]["pages"] + .as_array_mut() + .expect("fixture pages are an array"); + let first_page = pages.first().expect("fixture has at least one page"); + let first_id = first_page["id"] + .as_str() + .expect("fixture page id is a string"); + let next_id = format!( + "p{:04}", + first_id[1..] + .parse::() + .expect("fixture page id has numeric suffix") + + 1 + ); + let mut page = first_page.clone(); + page["id"] = serde_json::json!(next_id); + page["index"] = serde_json::json!(pages.len() + 1); + pages.push(page); + next_id +} + #[test] fn rag_chunk_matches_schema_example_jsonl() { let output = run_ethos(&["rag", "chunk", document_example().to_str().unwrap()]); @@ -110,6 +132,141 @@ fn rag_chunk_output_is_byte_identical_across_runs() { assert_eq!(first.stdout, second.stdout); } +#[test] +fn rag_chunk_rejects_empty_chunk_element_refs() { + let document = document_with_mutated_chunk("empty-chunk-element-refs-document", |doc| { + doc["payload"]["chunks"][0]["element_refs"] = serde_json::json!([]); + }); + let output = run_ethos(&["rag", "chunk", document.to_str().unwrap()]); + + assert_eq!(output.status.code(), Some(2)); + assert_eq!(output.stdout, b""); + assert!(String::from_utf8_lossy(&output.stderr) + .contains("chunk c000001 must include at least one element_ref")); +} + +#[test] +fn rag_chunk_rejects_empty_chunk_page_refs() { + let document = document_with_mutated_chunk("empty-chunk-page-refs-document", |doc| { + doc["payload"]["chunks"][0]["page_refs"] = serde_json::json!([]); + }); + let output = run_ethos(&["rag", "chunk", document.to_str().unwrap()]); + + assert_eq!(output.status.code(), Some(2)); + assert_eq!(output.stdout, b""); + assert!(String::from_utf8_lossy(&output.stderr) + .contains("chunk c000001 must include at least one page_ref")); +} + +#[test] +fn rag_chunk_rejects_empty_chunk_bboxes() { + let document = document_with_mutated_chunk("empty-chunk-bboxes-document", |doc| { + doc["payload"]["chunks"][0]["bboxes"] = serde_json::json!([]); + }); + let output = run_ethos(&["rag", "chunk", document.to_str().unwrap()]); + + assert_eq!(output.status.code(), Some(2)); + assert_eq!(output.stdout, b""); + assert!(String::from_utf8_lossy(&output.stderr) + .contains("chunk c000001 must include at least one bbox")); +} + +#[test] +fn rag_chunk_rejects_chunk_bbox_page_not_listed_in_page_refs() { + let mut expected_page_id = String::new(); + let document = document_with_mutated_chunk("chunk-bbox-page-not-listed-document", |doc| { + expected_page_id = append_next_page(doc); + doc["payload"]["chunks"][0]["bboxes"][0]["page"] = + serde_json::json!(expected_page_id.clone()); + }); + let output = run_ethos(&["rag", "chunk", document.to_str().unwrap()]); + + assert_eq!(output.status.code(), Some(2)); + assert_eq!(output.stdout, b""); + assert!(String::from_utf8_lossy(&output.stderr).contains(&format!( + "chunk c000001 bboxes[0] page {expected_page_id} is not listed in page_refs" + ))); +} + +#[test] +fn rag_chunk_rejects_chunk_element_page_not_listed_in_page_refs() { + let mut expected_element_id = String::new(); + let mut expected_page_id = String::new(); + let document = document_with_mutated_chunk("chunk-element-page-not-listed-document", |doc| { + expected_element_id = doc["payload"]["chunks"][0]["element_refs"][0] + .as_str() + .expect("fixture chunk element_ref is a string") + .to_string(); + expected_page_id = append_next_page(doc); + doc["payload"]["elements"][0]["page"] = serde_json::json!(expected_page_id.clone()); + }); + let output = run_ethos(&["rag", "chunk", document.to_str().unwrap()]); + + assert_eq!(output.status.code(), Some(2)); + assert_eq!(output.stdout, b""); + assert!(String::from_utf8_lossy(&output.stderr).contains(&format!( + "chunk c000001 element_ref {expected_element_id} page {expected_page_id} is not listed in page_refs" + ))); +} + +#[test] +fn rag_chunk_rejects_unbacked_chunk_page_ref() { + let mut expected_page_id = String::new(); + let document = document_with_mutated_chunk("unbacked-chunk-page-ref-document", |doc| { + expected_page_id = append_next_page(doc); + doc["payload"]["chunks"][0]["page_refs"] + .as_array_mut() + .expect("fixture chunk page_refs are an array") + .push(serde_json::json!(expected_page_id.clone())); + }); + let output = run_ethos(&["rag", "chunk", document.to_str().unwrap()]); + + assert_eq!(output.status.code(), Some(2)); + assert_eq!(output.stdout, b""); + assert!(String::from_utf8_lossy(&output.stderr).contains(&format!( + "chunk c000001 page_ref {expected_page_id} is not backed by element_refs or bboxes" + ))); +} + +#[test] +fn rag_chunk_rejects_zero_area_chunk_bbox() { + let document = document_with_mutated_chunk("zero-area-chunk-bbox-document", |doc| { + let x0 = doc["payload"]["chunks"][0]["bboxes"][0]["bbox"][0] + .as_i64() + .expect("fixture bbox x0 is an integer"); + doc["payload"]["chunks"][0]["bboxes"][0]["bbox"][2] = serde_json::json!(x0); + }); + let output = run_ethos(&["rag", "chunk", document.to_str().unwrap()]); + + assert_eq!(output.status.code(), Some(2)); + assert_eq!(output.stdout, b""); + assert!( + String::from_utf8_lossy(&output.stderr).contains("chunk c000001 bboxes[0] has zero area") + ); +} + +#[test] +fn rag_chunk_rejects_out_of_bounds_chunk_bbox() { + let mut expected_page_id = String::new(); + let document = document_with_mutated_chunk("out-of-bounds-chunk-bbox-document", |doc| { + expected_page_id = doc["payload"]["chunks"][0]["bboxes"][0]["page"] + .as_str() + .expect("fixture bbox page is a string") + .to_string(); + let page_width = doc["payload"]["pages"][0]["width"] + .as_i64() + .expect("fixture page width is an integer"); + doc["payload"]["chunks"][0]["bboxes"][0]["bbox"][2] = serde_json::json!(page_width + 1); + }); + let output = run_ethos(&["rag", "chunk", document.to_str().unwrap()]); + + assert_eq!(output.status.code(), Some(2)); + assert_eq!(output.stdout, b""); + assert!(String::from_utf8_lossy(&output.stderr).contains(&format!( + "chunk c000001 bboxes[0] exceeds page {expected_page_id} bounds" + ))); +} + #[test] fn rag_chunk_rejects_unknown_chunk_element_ref() { let document = document_with_mutated_chunk("stale-chunk-element-ref-document", |doc| {