Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
122 changes: 118 additions & 4 deletions crates/ethos-cli/src/cmd/rag.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

use std::collections::{BTreeMap, BTreeSet};

use ethos_core::codes::WarningCode;
use ethos_core::error::EthosError;
use ethos_core::model::{Chunk, Document};

Expand Down Expand Up @@ -49,7 +50,11 @@ struct PageBounds {
struct RagChunkRefs<'a> {
page_bounds: BTreeMap<&'a str, PageBounds>,
element_pages: BTreeMap<&'a str, &'a str>,
warning_codes: BTreeMap<&'a str, &'a str>,
element_span_refs: BTreeMap<&'a str, &'a [String]>,
element_warning_refs: BTreeMap<&'a str, &'a [String]>,
excluded_element_warnings: BTreeMap<&'a str, (&'a str, WarningCode)>,
excluded_span_warnings: BTreeMap<&'a str, (&'a str, WarningCode)>,
warning_codes: BTreeMap<&'a str, WarningCode>,
schema_version: &'a str,
document_fingerprint: &'a str,
source_fingerprint: &'a str,
Expand Down Expand Up @@ -79,12 +84,50 @@ impl<'a> RagChunkRefs<'a> {
.iter()
.map(|element| (element.id.as_str(), element.page.as_str()))
.collect(),
element_span_refs: doc
.payload
.elements
.iter()
.map(|element| (element.id.as_str(), element.span_refs.as_slice()))
.collect(),
element_warning_refs: doc
.payload
.elements
.iter()
.map(|element| (element.id.as_str(), element.warning_refs.as_slice()))
.collect(),
excluded_element_warnings: doc
.payload
.security_warnings
.iter()
.chain(doc.payload.parser_warnings.iter())
.filter(|warning| excludes_from_default_chunks(warning.code))
.filter_map(|warning| {
warning
.element_ref
.as_deref()
.map(|element_ref| (element_ref, (warning.id.as_str(), warning.code)))
})
.collect(),
excluded_span_warnings: doc
.payload
.security_warnings
.iter()
.chain(doc.payload.parser_warnings.iter())
.filter(|warning| excludes_from_default_chunks(warning.code))
.filter_map(|warning| {
warning
.span_ref
.as_deref()
.map(|span_ref| (span_ref, (warning.id.as_str(), warning.code)))
})
.collect(),
warning_codes: doc
.payload
.security_warnings
.iter()
.chain(doc.payload.parser_warnings.iter())
.map(|warning| (warning.id.as_str(), warning.code.as_str()))
.map(|warning| (warning.id.as_str(), warning.code))
.collect(),
schema_version: doc.schema_version.as_str(),
document_fingerprint: doc.fingerprint.as_str(),
Expand All @@ -94,6 +137,15 @@ impl<'a> RagChunkRefs<'a> {
}
}

fn excludes_from_default_chunks(code: WarningCode) -> bool {
matches!(
code,
WarningCode::HiddenTextDetected
| WarningCode::OffPageTextDetected
| WarningCode::LowContrastTextDetected
)
}

fn validate_chunk_refs(chunk: &Chunk, refs: &RagChunkRefs<'_>) -> Result<(), Failure> {
if chunk.element_refs.is_empty() {
return Err(Failure::Usage(format!(
Expand Down Expand Up @@ -139,6 +191,7 @@ fn validate_chunk_refs(chunk: &Chunk, refs: &RagChunkRefs<'_>) -> Result<(), Fai
chunk.id, id, page
)));
}
validate_element_default_chunk_warnings(chunk, id, refs)?;
backed_pages.insert(*page);
}
for (idx, bbox) in chunk.bboxes.iter().enumerate() {
Expand Down Expand Up @@ -178,11 +231,72 @@ fn validate_chunk_refs(chunk: &Chunk, refs: &RagChunkRefs<'_>) -> Result<(), Fai
}
}
for id in &chunk.warning_refs {
if !refs.warning_codes.contains_key(id.as_str()) {
let Some(code) = refs.warning_codes.get(id.as_str()) else {
return Err(Failure::Usage(format!(
"chunk {} references unknown warning_ref {}",
chunk.id, id
)));
};
if excludes_from_default_chunks(*code) {
return Err(Failure::Usage(format!(
"chunk {} references default-excluded warning_ref {} ({})",
chunk.id,
id,
code.as_str()
)));
}
}
Ok(())
}

fn validate_element_default_chunk_warnings(
chunk: &Chunk,
element_ref: &str,
refs: &RagChunkRefs<'_>,
) -> Result<(), Failure> {
for warning_ref in refs
.element_warning_refs
.get(element_ref)
.into_iter()
.flat_map(|warning_refs| warning_refs.iter())
{
let Some(code) = refs.warning_codes.get(warning_ref.as_str()) else {
continue;
};
if excludes_from_default_chunks(*code) {
return Err(Failure::Usage(format!(
"chunk {} element_ref {} carries default-excluded warning_ref {} ({})",
chunk.id,
element_ref,
warning_ref,
code.as_str()
)));
}
}
if let Some((warning_ref, code)) = refs.excluded_element_warnings.get(element_ref) {
return Err(Failure::Usage(format!(
"chunk {} element_ref {} carries default-excluded warning_ref {} ({})",
chunk.id,
element_ref,
warning_ref,
code.as_str()
)));
}
for span_ref in refs
.element_span_refs
.get(element_ref)
.into_iter()
.flat_map(|span_refs| span_refs.iter())
{
if let Some((warning_ref, code)) = refs.excluded_span_warnings.get(span_ref.as_str()) {
return Err(Failure::Usage(format!(
"chunk {} element_ref {} includes span_ref {} with default-excluded warning_ref {} ({})",
chunk.id,
element_ref,
span_ref,
warning_ref,
code.as_str()
)));
}
}
Ok(())
Expand Down Expand Up @@ -245,7 +359,7 @@ fn rag_chunk_record(chunk: &Chunk, refs: &RagChunkRefs<'_>) -> Result<serde_json
.warning_codes
.get(id.as_str())
.expect("chunk warning_refs validated before record serialization");
warnings.push(serde_json::Value::String(code.to_string()));
warnings.push(serde_json::Value::String(code.as_str().to_string()));
}
record.insert("warnings".into(), serde_json::Value::Array(warnings));
Ok(serde_json::Value::Object(record))
Expand Down
114 changes: 114 additions & 0 deletions crates/ethos-cli/tests/rag.rs
Original file line number Diff line number Diff line change
Expand Up @@ -318,3 +318,117 @@ fn rag_chunk_rejects_unknown_chunk_warning_ref() {
assert!(String::from_utf8_lossy(&output.stderr)
.contains("chunk c000001 references unknown warning_ref w999999"));
}

/// For each default-excluded warning code, a chunk that cites the warning directly must make
/// `ethos rag chunk` exit with status 2, print nothing on stdout and name the offending
/// warning on stderr.
#[test]
fn rag_chunk_rejects_default_excluded_chunk_warning_refs() {
    const EXCLUDED_CODES: [&str; 3] = [
        "hidden_text_detected",
        "off_page_text_detected",
        "low_contrast_text_detected",
    ];

    for code in EXCLUDED_CODES {
        // Rewrite the first security warning to carry `code`, then point the first chunk at it.
        let fixture_name = format!("default-excluded-{code}-chunk-document");
        let document = document_with_mutated_chunk(&fixture_name, |doc| {
            doc["payload"]["security_warnings"][0]["code"] = serde_json::json!(code);
            doc["payload"]["chunks"][0]["warning_refs"] = serde_json::json!(["w0001"]);
        });

        let output = run_ethos(&["rag", "chunk", document.to_str().unwrap()]);
        let stderr = String::from_utf8_lossy(&output.stderr);
        let expected_message =
            format!("chunk c000001 references default-excluded warning_ref w0001 ({code})");

        assert_eq!(output.status.code(), Some(2));
        assert_eq!(output.stdout, b"");
        assert!(stderr.contains(&expected_message));
    }
}

/// A chunk may cite a security warning whose code is not default-excluded: the command must
/// succeed, keep stderr empty and inline the warning code into the record's `warnings` array.
#[test]
fn rag_chunk_allows_non_exclusion_security_warning_ref() {
    let document = document_with_mutated_chunk("non-exclusion-security-warning-document", |doc| {
        doc["payload"]["security_warnings"][0]["code"] = serde_json::json!("annotations_present");
        doc["payload"]["chunks"][0]["warning_refs"] = serde_json::json!(["w0001"]);
    });

    let output = run_ethos(&["rag", "chunk", document.to_str().unwrap()]);

    // The failure message carries the exit status and stderr to make a regression diagnosable.
    assert!(
        output.status.success(),
        "ethos rag chunk failed\nstatus: {:?}\nstderr:\n{}",
        output.status.code(),
        String::from_utf8_lossy(&output.stderr)
    );
    assert_eq!(output.stderr, b"");

    let stdout = String::from_utf8_lossy(&output.stdout);
    assert!(stdout.contains(r#""warnings":["annotations_present"]"#));
}

/// When the first element lists warning `w0001` in its own `warning_refs`, a chunk citing that
/// element must be rejected with exit status 2 and an error naming the element and the warning.
///
/// Assumes the fixture's first chunk cites the fixture's first element and that `w0001` is a
/// `hidden_text_detected` warning, as the expected message requires.
#[test]
fn rag_chunk_rejects_default_excluded_element_warning_ref() {
    let mut expected_element_id = String::new();
    let document = document_with_mutated_chunk(
        "default-excluded-element-warning-document",
        |doc| {
            // Remember which element the first chunk cites so the error text can be matched.
            let cited_element = doc["payload"]["chunks"][0]["element_refs"][0]
                .as_str()
                .expect("fixture chunk element_ref is a string");
            expected_element_id = cited_element.to_string();
            doc["payload"]["elements"][0]["warning_refs"] = serde_json::json!(["w0001"]);
        },
    );

    let output = run_ethos(&["rag", "chunk", document.to_str().unwrap()]);
    let stderr = String::from_utf8_lossy(&output.stderr);
    let expected_message = format!(
        "chunk c000001 element_ref {expected_element_id} carries default-excluded warning_ref w0001 (hidden_text_detected)"
    );

    assert_eq!(output.status.code(), Some(2));
    assert_eq!(output.stdout, b"");
    assert!(stderr.contains(&expected_message));
}

/// When the first security warning's `element_ref` points at the element cited by the first
/// chunk, the chunk must be rejected even though neither the chunk nor the element lists the
/// warning in `warning_refs`.
///
/// Assumes the fixture's first security warning is `w0001` with code `hidden_text_detected`,
/// as the expected message requires.
#[test]
fn rag_chunk_rejects_default_excluded_warning_attached_to_cited_element() {
    let mut expected_element_id = String::new();
    let document = document_with_mutated_chunk(
        "default-excluded-attached-element-warning-document",
        |doc| {
            let cited_element = doc["payload"]["chunks"][0]["element_refs"][0]
                .as_str()
                .expect("fixture chunk element_ref is a string");
            expected_element_id = cited_element.to_string();
            // Attach the warning to the cited element from the warning's side.
            doc["payload"]["security_warnings"][0]["element_ref"] =
                serde_json::json!(expected_element_id.clone());
        },
    );

    let output = run_ethos(&["rag", "chunk", document.to_str().unwrap()]);
    let stderr = String::from_utf8_lossy(&output.stderr);
    let expected_message = format!(
        "chunk c000001 element_ref {expected_element_id} carries default-excluded warning_ref w0001 (hidden_text_detected)"
    );

    assert_eq!(output.status.code(), Some(2));
    assert_eq!(output.stdout, b"");
    assert!(stderr.contains(&expected_message));
}

/// When the first element is made to reference the span that the first security warning
/// targets, a chunk citing that element must be rejected with an error naming the element,
/// the span and the warning.
///
/// Assumes the fixture's first chunk cites the fixture's first element and that the first
/// security warning is `w0001` with code `hidden_text_detected`.
#[test]
fn rag_chunk_rejects_default_excluded_span_warning_reached_by_cited_element() {
    let mut expected_element_id = String::new();
    let mut expected_span_id = String::new();
    let document = document_with_mutated_chunk("default-excluded-span-warning-document", |doc| {
        let cited_element = doc["payload"]["chunks"][0]["element_refs"][0]
            .as_str()
            .expect("fixture chunk element_ref is a string");
        expected_element_id = cited_element.to_string();

        let flagged_span = doc["payload"]["security_warnings"][0]["span_ref"]
            .as_str()
            .expect("fixture security warning span_ref is a string");
        expected_span_id = flagged_span.to_string();

        // Make the first element reach the flagged span through its own span_refs.
        let element_spans = doc["payload"]["elements"][0]["span_refs"]
            .as_array_mut()
            .expect("fixture element span_refs are an array");
        element_spans.push(serde_json::json!(expected_span_id.clone()));
    });

    let output = run_ethos(&["rag", "chunk", document.to_str().unwrap()]);
    let stderr = String::from_utf8_lossy(&output.stderr);
    let expected_message = format!(
        "chunk c000001 element_ref {expected_element_id} includes span_ref {expected_span_id} with default-excluded warning_ref w0001 (hidden_text_detected)"
    );

    assert_eq!(output.status.code(), Some(2));
    assert_eq!(output.stdout, b"");
    assert!(stderr.contains(&expected_message));
}
7 changes: 2 additions & 5 deletions schemas/ethos-chunks.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"$schema": "https://json-schema.org/draft/2020-12/schema",
"$id": "urn:ethos:schema:chunks:1",
"title": "Ethos chunk record (one line of chunks.jsonl)",
"description": "Self-describing RAG chunk: each JSONL line carries provenance so a chunk can be cited and later verified without the full document at hand. Derived deterministically from the canonical document (ethos.json) — same document + same config => byte-identical chunks.jsonl. Warning codes are inlined (not refs) so RAG consumers need no join; hidden/off-page/low-contrast content is never present in default chunks.",
"description": "Self-describing default RAG chunk: each JSONL line carries provenance so a chunk can be cited and later verified without the full document at hand. Derived deterministically from the canonical document (ethos.json) — same document + same config => byte-identical chunks.jsonl. Warning codes are inlined (not refs) so RAG consumers need no join; hidden/off-page/low-contrast content is never present in default chunks.",
"type": "object",
"required": [
"schema_version",
Expand Down Expand Up @@ -52,14 +52,11 @@
},
"warnings": {
"type": "array",
"description": "Inherited warning codes from source regions (e.g. low_confidence_table_structure).",
"description": "Inherited warning codes from source regions that are still valid in default chunks. Text-exclusion security codes hidden_text_detected, off_page_text_detected, and low_contrast_text_detected are omitted from default chunks.",
"items": {
"enum": [
"low_confidence_reading_order",
"low_confidence_table_structure",
"hidden_text_detected",
"off_page_text_detected",
"low_contrast_text_detected",
"annotations_present",
"external_links_present",
"image_only_page",
Expand Down
12 changes: 12 additions & 0 deletions schemas/validate_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,8 +179,20 @@ def c14n_line(v) -> str:


wcodes = {w["id"]: w["code"] for w in p["security_warnings"] + p["parser_warnings"]}
default_chunk_excluded_warning_codes = {
"hidden_text_detected",
"off_page_text_detected",
"low_contrast_text_detected",
}
expected_lines = []
for ch in p["chunks"]:
for warning_ref in ch.get("warning_refs", []):
code = wcodes[warning_ref]
if code in default_chunk_excluded_warning_codes:
fail(
"document.example.json: "
f"chunk {ch['id']} references default-excluded warning_ref {warning_ref} ({code})"
)
expected_lines.append(c14n_line({
"schema_version": doc["schema_version"],
"document_fingerprint": doc["fingerprint"],
Expand Down
Loading