Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions adapters/grounding/opendataloader-json/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ use serde_json::Value;
/// Adapter version, reported in `ParserIdentity::adapter_version`.
pub const ADAPTER_VERSION: &str = "0.1.0";

/// Largest `number of pages` value accepted when parsing real ODL output;
/// a larger declared page count is rejected with an adapter error.
const REAL_ODL_MAX_PAGES: u32 = 10_000;

/// Mapping failure: input is valid JSON but not recognizable ODL output.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct AdapterError {
Expand Down Expand Up @@ -340,6 +342,9 @@ fn parse_real_odl(root: &Value) -> Result<OdlJsonSource, AdapterError> {
if page_count == 0 {
return Err(err("number of pages must be positive"));
}
if page_count > REAL_ODL_MAX_PAGES {
return Err(err("number of pages exceeds adapter limit"));
}
let kids = root
.get("kids")
.and_then(Value::as_array)
Expand Down Expand Up @@ -816,6 +821,10 @@ mod tests {
r#"{"tool":{"name":"x","version":"1"},"pages":[{"number":1,"width":612,"height":792}],"elements":[{"page":4294967296,"bbox":[1,1,2,2]}]}"#,
"element.page must fit u32",
);
assert_error_contains(
r#"{"file name":"huge.pdf","number of pages":4294967295,"kids":[]}"#,
"number of pages exceeds adapter limit",
);
}

#[test]
Expand Down
1 change: 1 addition & 0 deletions crates/ethos-cli/src/cmd/verify.rs
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,7 @@ fn check_reason_label(reason: CheckReason) -> &'static str {
CheckReason::UnsupportedClaimKind => "unsupported_claim_kind",
CheckReason::StaleFingerprint => "stale_fingerprint",
CheckReason::MissingSourceFingerprint => "missing_source_fingerprint",
CheckReason::MissingCitationFingerprint => "missing_citation_fingerprint",
CheckReason::MissingSpanCapability => "missing_span_capability",
CheckReason::MissingTableCapability => "missing_table_capability",
CheckReason::UnknownCoordinateOrigin => "unknown_coordinate_origin",
Expand Down
123 changes: 121 additions & 2 deletions crates/ethos-cli/tests/verify.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ use std::process::{Command, Output};
use std::time::{SystemTime, UNIX_EPOCH};

use ethos_core::fingerprint::source_fingerprint;
use ethos_core::model::Document;
use serde_json::Value;

fn ethos_bin() -> &'static str {
Expand Down Expand Up @@ -72,6 +73,46 @@ fn json_file(path: impl AsRef<Path>) -> Value {
serde_json::from_slice(&bytes).expect("JSON fixture parses")
}

/// Writes a temporary native document whose text is split across two
/// adjacent `text_block` elements on page `p0001`.
///
/// The example document fixture is loaded, its `payload.elements` are
/// replaced by the two halves, and every other payload collection is set to
/// an empty array. The payload hash and then the document fingerprint are
/// recomputed and stored on the document before it is serialized.
///
/// Returns the path of the written JSON file together with the recomputed
/// document fingerprint.
fn temp_split_quote_document() -> (PathBuf, String) {
    let mut raw = json_file(document_example());

    {
        let payload = &mut raw["payload"];
        payload["elements"] = serde_json::json!([
            {
                "id": "split-a",
                "type": "text_block",
                "page": "p0001",
                "bbox": [100, 100, 400, 200],
                "text": "The alpha trust loop verifies "
            },
            {
                "id": "split-b",
                "type": "text_block",
                "page": "p0001",
                "bbox": [400, 100, 700, 200],
                "text": "grounded evidence"
            }
        ]);
        // Everything except the two elements above is emptied out.
        for emptied in [
            "spans",
            "tables",
            "chunks",
            "regions",
            "security_warnings",
            "parser_warnings",
        ] {
            payload[emptied] = serde_json::json!([]);
        }
    }

    let mut document: Document =
        serde_json::from_value(raw).expect("split quote document parses");

    // The payload hash is stored first, then the fingerprint is computed.
    let payload_hash = document
        .compute_payload_sha256()
        .expect("split quote payload hash computes");
    document.payload_sha256 = payload_hash;
    let document_fingerprint = document
        .compute_fingerprint()
        .expect("split quote document fingerprint computes");
    document.fingerprint = document_fingerprint;

    let serialized =
        serde_json::to_string(&document).expect("split quote document serializes");
    let written = temp_json("split-quote-native-document", &serialized);
    (written, document.fingerprint.clone())
}

fn pdfium_configured() -> bool {
std::env::var_os("ETHOS_PDFIUM_LIBRARY_PATH")
.map(PathBuf::from)
Expand Down Expand Up @@ -695,6 +736,48 @@ fn native_ethos_verify_produces_non_empty_checks() {
assert_eq!(report["all_evidence_grounded"], false);
}

/// A quote claim citing only `split-b` must be reported as grounded when the
/// quoted text spans both adjacent elements; the reported evidence carries
/// the joined text and a bounding box covering both elements.
#[test]
fn native_verify_grounds_split_quote_across_adjacent_elements() {
    let (document_path, document_fingerprint) = temp_split_quote_document();

    let claims_envelope = serde_json::json!({
        "document_fingerprint": document_fingerprint,
        "claims": [
            {
                "kind": "quote",
                "text": "The alpha trust loop verifies grounded evidence",
                "citation": {
                    "element_id": "split-b"
                }
            }
        ]
    });
    let envelope_text = serde_json::to_string(&claims_envelope).unwrap();
    let citations_path = temp_json("split-quote-citations", &envelope_text);

    let cli_args = [
        "verify",
        document_path.to_str().unwrap(),
        "--citations",
        citations_path.to_str().unwrap(),
    ];
    let report = parse_success(&cli_args);

    assert_eq!(report["all_evidence_grounded"], true);

    let first_check = &report["checks"][0];
    assert_eq!(first_check["status"], "grounded");
    assert_eq!(first_check["match_method"], "normalized_text_contains");

    let evidence = &first_check["evidence"];
    assert_eq!(
        evidence["text"],
        "The alpha trust loop verifies grounded evidence"
    );
    assert_eq!(evidence["bbox"], serde_json::json!([100, 100, 700, 200]));
}

#[test]
fn opendataloader_verify_adapter_produces_capability_aware_report() {
let grounding = odl_example();
Expand Down Expand Up @@ -1137,8 +1220,44 @@ fn bare_array_citation_input_works() {
]);

assert_eq!(report["checks"].as_array().unwrap().len(), 1);
assert_eq!(report["checks"][0]["status"], "grounded");
assert_eq!(report["all_evidence_grounded"], true);
assert_eq!(report["checks"][0]["status"], "stale");
assert_eq!(
report["checks"][0]["reason"],
"missing_citation_fingerprint"
);
assert_eq!(report["all_evidence_grounded"], false);
}

/// A citation envelope that omits its fingerprint, verified against the
/// example document, must yield a single `stale` check with reason
/// `missing_citation_fingerprint` and an ungrounded overall report.
#[test]
fn envelope_without_fingerprint_blocks_when_source_has_fingerprint() {
    let document_path = document_example();
    let citations_path = temp_json(
        "no-fingerprint-envelope-citations",
        r#"{
"claims": [
{
"kind": "presence",
"citation": {
"element_id": "e000002"
}
}
]
}"#,
    );

    let cli_args = [
        "verify",
        document_path.to_str().unwrap(),
        "--citations",
        citations_path.to_str().unwrap(),
    ];
    let report = parse_success(&cli_args);

    let checks = report["checks"].as_array().unwrap();
    assert_eq!(checks.len(), 1);

    let only_check = &checks[0];
    assert_eq!(only_check["status"], "stale");
    assert_eq!(only_check["reason"], "missing_citation_fingerprint");
    assert_eq!(report["all_evidence_grounded"], false);
}

#[test]
Expand Down
2 changes: 2 additions & 0 deletions crates/ethos-core/src/verify_types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,8 @@ pub enum CheckReason {
StaleFingerprint,
/// Citation was fingerprint-pinned but the source did not declare one.
MissingSourceFingerprint,
/// Staleness policy requires a citation fingerprint, but input omitted it.
MissingCitationFingerprint,
/// Span locator was used with a source that does not expose spans.
MissingSpanCapability,
/// Table-cell locator was used with a source that does not expose tables.
Expand Down
Loading
Loading