Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 85 additions & 13 deletions crates/ethos-cli/src/cmd/rag.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,15 @@ fn rag_chunk_output_bytes(doc: &Document) -> Result<Vec<u8>, Failure> {
Ok(out)
}

#[derive(Clone, Copy)]
struct PageBounds {
width: i64,
height: i64,
}

struct RagChunkRefs<'a> {
page_ids: BTreeSet<&'a str>,
element_ids: BTreeSet<&'a str>,
page_bounds: BTreeMap<&'a str, PageBounds>,
element_pages: BTreeMap<&'a str, &'a str>,
warning_codes: BTreeMap<&'a str, &'a str>,
schema_version: &'a str,
document_fingerprint: &'a str,
Expand All @@ -53,17 +59,25 @@ struct RagChunkRefs<'a> {
impl<'a> RagChunkRefs<'a> {
fn new(doc: &'a Document) -> Self {
Self {
page_ids: doc
page_bounds: doc
.payload
.pages
.iter()
.map(|page| page.id.as_str())
.map(|page| {
(
page.id.as_str(),
PageBounds {
width: page.width,
height: page.height,
},
)
})
.collect(),
element_ids: doc
element_pages: doc
.payload
.elements
.iter()
.map(|element| element.id.as_str())
.map(|element| (element.id.as_str(), element.page.as_str()))
.collect(),
warning_codes: doc
.payload
Expand All @@ -81,28 +95,86 @@ impl<'a> RagChunkRefs<'a> {
}

fn validate_chunk_refs(chunk: &Chunk, refs: &RagChunkRefs<'_>) -> Result<(), Failure> {
for id in &chunk.element_refs {
if !refs.element_ids.contains(id.as_str()) {
if chunk.element_refs.is_empty() {
return Err(Failure::Usage(format!(
"chunk {} must include at least one element_ref",
chunk.id
)));
}
if chunk.page_refs.is_empty() {
return Err(Failure::Usage(format!(
"chunk {} must include at least one page_ref",
chunk.id
)));
}
if chunk.bboxes.is_empty() {
return Err(Failure::Usage(format!(
"chunk {} must include at least one bbox",
chunk.id
)));
}

let mut page_refs = BTreeSet::new();
for id in &chunk.page_refs {
if !refs.page_bounds.contains_key(id.as_str()) {
return Err(Failure::Usage(format!(
"chunk {} references unknown element_ref {}",
"chunk {} references unknown page_ref {}",
chunk.id, id
)));
}
page_refs.insert(id.as_str());
}
for id in &chunk.page_refs {
if !refs.page_ids.contains(id.as_str()) {

let mut backed_pages = BTreeSet::new();
for id in &chunk.element_refs {
let Some(page) = refs.element_pages.get(id.as_str()) else {
return Err(Failure::Usage(format!(
"chunk {} references unknown page_ref {}",
"chunk {} references unknown element_ref {}",
chunk.id, id
)));
};
if !page_refs.contains(page) {
return Err(Failure::Usage(format!(
"chunk {} element_ref {} page {} is not listed in page_refs",
chunk.id, id, page
)));
}
backed_pages.insert(*page);
}
for (idx, bbox) in chunk.bboxes.iter().enumerate() {
if !refs.page_ids.contains(bbox.page.as_str()) {
let Some(page_bounds) = refs.page_bounds.get(bbox.page.as_str()) else {
return Err(Failure::Usage(format!(
"chunk {} bboxes[{}] references unknown page_ref {}",
chunk.id, idx, bbox.page
)));
};
if !page_refs.contains(bbox.page.as_str()) {
return Err(Failure::Usage(format!(
"chunk {} bboxes[{}] page {} is not listed in page_refs",
chunk.id, idx, bbox.page
)));
}
let [x0, y0, x1, y1] = bbox.bbox.to_array();
if x0 >= x1 || y0 >= y1 {
return Err(Failure::Usage(format!(
"chunk {} bboxes[{}] has zero area",
chunk.id, idx
)));
}
if x0 < 0 || y0 < 0 || x1 > page_bounds.width || y1 > page_bounds.height {
return Err(Failure::Usage(format!(
"chunk {} bboxes[{}] exceeds page {} bounds",
chunk.id, idx, bbox.page
)));
}
backed_pages.insert(bbox.page.as_str());
}
for id in page_refs {
if !backed_pages.contains(id) {
return Err(Failure::Usage(format!(
"chunk {} page_ref {} is not backed by element_refs or bboxes",
chunk.id, id
)));
}
}
for id in &chunk.warning_refs {
Expand Down
157 changes: 157 additions & 0 deletions crates/ethos-cli/tests/rag.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,28 @@ fn document_with_mutated_chunk(name: &str, mutate: impl FnOnce(&mut Value)) -> P
)
}

fn append_next_page(doc: &mut Value) -> String {
let pages = doc["payload"]["pages"]
.as_array_mut()
.expect("fixture pages are an array");
let first_page = pages.first().expect("fixture has at least one page");
let first_id = first_page["id"]
.as_str()
.expect("fixture page id is a string");
let next_id = format!(
"p{:04}",
first_id[1..]
.parse::<u32>()
.expect("fixture page id has numeric suffix")
+ 1
);
let mut page = first_page.clone();
page["id"] = serde_json::json!(next_id);
page["index"] = serde_json::json!(pages.len() + 1);
pages.push(page);
next_id
}

#[test]
fn rag_chunk_matches_schema_example_jsonl() {
let output = run_ethos(&["rag", "chunk", document_example().to_str().unwrap()]);
Expand Down Expand Up @@ -110,6 +132,141 @@ fn rag_chunk_output_is_byte_identical_across_runs() {
assert_eq!(first.stdout, second.stdout);
}

#[test]
fn rag_chunk_rejects_empty_chunk_element_refs() {
let document = document_with_mutated_chunk("empty-chunk-element-refs-document", |doc| {
doc["payload"]["chunks"][0]["element_refs"] = serde_json::json!([]);
});
let output = run_ethos(&["rag", "chunk", document.to_str().unwrap()]);

assert_eq!(output.status.code(), Some(2));
assert_eq!(output.stdout, b"");
assert!(String::from_utf8_lossy(&output.stderr)
.contains("chunk c000001 must include at least one element_ref"));
}

#[test]
fn rag_chunk_rejects_empty_chunk_page_refs() {
let document = document_with_mutated_chunk("empty-chunk-page-refs-document", |doc| {
doc["payload"]["chunks"][0]["page_refs"] = serde_json::json!([]);
});
let output = run_ethos(&["rag", "chunk", document.to_str().unwrap()]);

assert_eq!(output.status.code(), Some(2));
assert_eq!(output.stdout, b"");
assert!(String::from_utf8_lossy(&output.stderr)
.contains("chunk c000001 must include at least one page_ref"));
}

#[test]
fn rag_chunk_rejects_empty_chunk_bboxes() {
let document = document_with_mutated_chunk("empty-chunk-bboxes-document", |doc| {
doc["payload"]["chunks"][0]["bboxes"] = serde_json::json!([]);
});
let output = run_ethos(&["rag", "chunk", document.to_str().unwrap()]);

assert_eq!(output.status.code(), Some(2));
assert_eq!(output.stdout, b"");
assert!(String::from_utf8_lossy(&output.stderr)
.contains("chunk c000001 must include at least one bbox"));
}

#[test]
fn rag_chunk_rejects_chunk_bbox_page_not_listed_in_page_refs() {
let mut expected_page_id = String::new();
let document = document_with_mutated_chunk("chunk-bbox-page-not-listed-document", |doc| {
expected_page_id = append_next_page(doc);
doc["payload"]["chunks"][0]["bboxes"][0]["page"] =
serde_json::json!(expected_page_id.clone());
});
let output = run_ethos(&["rag", "chunk", document.to_str().unwrap()]);

assert_eq!(output.status.code(), Some(2));
assert_eq!(output.stdout, b"");
assert!(String::from_utf8_lossy(&output.stderr).contains(&format!(
"chunk c000001 bboxes[0] page {expected_page_id} is not listed in page_refs"
)));
}

#[test]
fn rag_chunk_rejects_chunk_element_page_not_listed_in_page_refs() {
let mut expected_element_id = String::new();
let mut expected_page_id = String::new();
let document = document_with_mutated_chunk("chunk-element-page-not-listed-document", |doc| {
expected_element_id = doc["payload"]["chunks"][0]["element_refs"][0]
.as_str()
.expect("fixture chunk element_ref is a string")
.to_string();
expected_page_id = append_next_page(doc);
doc["payload"]["elements"][0]["page"] = serde_json::json!(expected_page_id.clone());
});
let output = run_ethos(&["rag", "chunk", document.to_str().unwrap()]);

assert_eq!(output.status.code(), Some(2));
assert_eq!(output.stdout, b"");
assert!(String::from_utf8_lossy(&output.stderr).contains(&format!(
"chunk c000001 element_ref {expected_element_id} page {expected_page_id} is not listed in page_refs"
)));
}

#[test]
fn rag_chunk_rejects_unbacked_chunk_page_ref() {
let mut expected_page_id = String::new();
let document = document_with_mutated_chunk("unbacked-chunk-page-ref-document", |doc| {
expected_page_id = append_next_page(doc);
doc["payload"]["chunks"][0]["page_refs"]
.as_array_mut()
.expect("fixture chunk page_refs are an array")
.push(serde_json::json!(expected_page_id.clone()));
});
let output = run_ethos(&["rag", "chunk", document.to_str().unwrap()]);

assert_eq!(output.status.code(), Some(2));
assert_eq!(output.stdout, b"");
assert!(String::from_utf8_lossy(&output.stderr).contains(&format!(
"chunk c000001 page_ref {expected_page_id} is not backed by element_refs or bboxes"
)));
}

#[test]
fn rag_chunk_rejects_zero_area_chunk_bbox() {
let document = document_with_mutated_chunk("zero-area-chunk-bbox-document", |doc| {
let x0 = doc["payload"]["chunks"][0]["bboxes"][0]["bbox"][0]
.as_i64()
.expect("fixture bbox x0 is an integer");
doc["payload"]["chunks"][0]["bboxes"][0]["bbox"][2] = serde_json::json!(x0);
});
let output = run_ethos(&["rag", "chunk", document.to_str().unwrap()]);

assert_eq!(output.status.code(), Some(2));
assert_eq!(output.stdout, b"");
assert!(
String::from_utf8_lossy(&output.stderr).contains("chunk c000001 bboxes[0] has zero area")
);
}

#[test]
fn rag_chunk_rejects_out_of_bounds_chunk_bbox() {
let mut expected_page_id = String::new();
let document = document_with_mutated_chunk("out-of-bounds-chunk-bbox-document", |doc| {
expected_page_id = doc["payload"]["chunks"][0]["bboxes"][0]["page"]
.as_str()
.expect("fixture bbox page is a string")
.to_string();
let page_width = doc["payload"]["pages"][0]["width"]
.as_i64()
.expect("fixture page width is an integer");
doc["payload"]["chunks"][0]["bboxes"][0]["bbox"][2] = serde_json::json!(page_width + 1);
});
let output = run_ethos(&["rag", "chunk", document.to_str().unwrap()]);

assert_eq!(output.status.code(), Some(2));
assert_eq!(output.stdout, b"");
assert!(String::from_utf8_lossy(&output.stderr).contains(&format!(
"chunk c000001 bboxes[0] exceeds page {expected_page_id} bounds"
)));
}

#[test]
fn rag_chunk_rejects_unknown_chunk_element_ref() {
let document = document_with_mutated_chunk("stale-chunk-element-ref-document", |doc| {
Expand Down
Loading