Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,7 @@ Run the evaluation harness against a test set of questions:

```bash
yore eval --questions questions.jsonl --index docs/.index
yore eval --questions questions.jsonl --index docs/.index --json --k 3,5,10
```

### 6.6 Link and structure analysis
Expand Down Expand Up @@ -652,7 +653,7 @@ yore mcp fetch-context ctx_d76396f763601873 \
Evaluates the retrieval pipeline against a set of test questions.

```bash
yore eval --questions <jsonl-file> --index <index-dir>
yore eval --questions <jsonl-file> --index <index-dir> [--k <k1,k2,...>]
```

Each line in the JSONL file represents a test question:
Expand All @@ -661,18 +662,26 @@ Each line in the JSONL file represents a test question:
{"id": 1, "q": "How does auth work?", "expect": ["session", "token"], "min_hits": 2}
```

Yore assembles context for each question, checks for expected substrings, and reports per‑question hits and an overall pass rate.
To measure ranked retrieval quality, add a `relevant_docs` array listing the document paths that should be retrieved for a question:

```json
{"id": 2, "q": "deployment steps", "expect": ["docker"], "relevant_docs": ["docs/guides/deployment.md"]}
```

Yore assembles context for each question, checks for expected substrings, and reports per‑question hits and an overall pass rate. When `relevant_docs` is present, Yore also computes precision@k, recall@k, MRR, and nDCG@k over the initial BM25 retrieval ranking. Questions without `relevant_docs` produce only the existing output, so existing question files remain backward compatible.

**Key options**

* `--questions` – Path to questions JSONL file (default: `questions.jsonl`)
* `--index` – Index directory (default: `.yore`)
* `--json` – Emit JSON output
* `--k` – Comma‑separated k values for precision@k, recall@k, and nDCG@k (default: `5,10`)

**Example**

```bash
yore eval --questions questions.jsonl --index docs/.index
yore eval --questions questions.jsonl --index docs/.index --json --k 3,5,10
```

---
Expand Down
4 changes: 4 additions & 0 deletions src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -603,6 +603,10 @@ pub enum Commands {
/// Output as JSON
#[arg(long)]
json: bool,

/// Values of k for precision@k, recall@k, nDCG@k (comma-separated)
#[arg(long, value_delimiter = ',', default_values_t = vec![5, 10])]
k: Vec<usize>,
},

/// Derive a deterministic vocabulary list from a built index.
Expand Down
66 changes: 64 additions & 2 deletions src/commands_text.rs
Original file line number Diff line number Diff line change
Expand Up @@ -645,6 +645,7 @@ pub(crate) fn cmd_eval(
questions_path: &Path,
index_dir: &Path,
json: bool,
k_values: &[usize],
) -> Result<(), Box<dyn std::error::Error>> {
// Load questions from JSONL file
let questions_content = fs::read_to_string(questions_path)?;
Expand Down Expand Up @@ -676,6 +677,9 @@ pub(crate) fn cmd_eval(
// Run assemble internally (capture output as string)
let primary_sections = search_relevant_sections(&question.q, &forward_index, 20);

// Compute ranked doc list from initial BM25 retrieval
let ranked_docs = unique_doc_ranking(&primary_sections);

if primary_sections.is_empty() {
results.push(EvalResult {
id: question.id,
Expand All @@ -684,6 +688,9 @@ pub(crate) fn cmd_eval(
total: question.expect.len(),
passed: false,
tokens: 0,
ranked_docs,
ranking: None,
digest: String::new(),
});
continue;
}
Expand Down Expand Up @@ -742,13 +749,22 @@ pub(crate) fn cmd_eval(
let passed = hits >= min_hits;
let tokens = estimate_tokens(&digest);

// Compute ranking metrics if relevant_docs is provided
let ranking = question.relevant_docs.as_ref().map(|rel_docs| {
let relevant_set: HashSet<String> = rel_docs.iter().cloned().collect();
compute_ranking_metrics(&ranked_docs, &relevant_set, k_values)
});

results.push(EvalResult {
id: question.id,
question: question.q.clone(),
hits,
total: question.expect.len(),
passed,
tokens,
ranked_docs,
ranking,
digest,
});
}

Expand All @@ -757,6 +773,15 @@ pub(crate) fn cmd_eval(
let total = results.len();
let pass_rate_pct = passed_count as f64 / total as f64 * 100.0;

// Compute aggregate ranking metrics across questions that have relevance data
let per_question_rankings: Vec<RankingMetrics> =
results.iter().filter_map(|r| r.ranking.clone()).collect();
let aggregate = if per_question_rankings.is_empty() {
None
} else {
Some(aggregate_ranking_metrics(&per_question_rankings, k_values))
};

if json {
let json_results: Vec<EvalQuestionResult> = results
.iter()
Expand All @@ -766,14 +791,15 @@ pub(crate) fn cmd_eval(
.find(|q| q.id == r.id)
.map(|q| q.expect.clone())
.unwrap_or_default();
let digest_lower = r.digest.to_lowercase();
let found: Vec<String> = expected
.iter()
.filter(|e| r.question.to_lowercase().contains(&e.to_lowercase()))
.filter(|e| digest_lower.contains(&e.to_lowercase()))
.cloned()
.collect();
let missing: Vec<String> = expected
.iter()
.filter(|e| !r.question.to_lowercase().contains(&e.to_lowercase()))
.filter(|e| !digest_lower.contains(&e.to_lowercase()))
.cloned()
.collect();
EvalQuestionResult {
Expand All @@ -782,6 +808,7 @@ pub(crate) fn cmd_eval(
expected,
found,
missing,
ranking: r.ranking.clone(),
}
})
.collect();
Expand All @@ -793,6 +820,7 @@ pub(crate) fn cmd_eval(
failed: total - passed_count,
pass_rate: pass_rate_pct,
results: json_results,
ranking_metrics: aggregate,
};
println!("{}", serde_json::to_string_pretty(&output)?);
return Ok(());
Expand All @@ -813,6 +841,20 @@ pub(crate) fn cmd_eval(
println!("[{}] {}", result.id, result.question.white().bold());
println!(" - hits: {}/{} {}", result.hits, result.total, status);
println!(" - size: {} tokens", result.tokens);

if let Some(ranking) = &result.ranking {
println!(" - MRR: {:.3}", ranking.mrr);
for m in &ranking.precision_at_k {
println!(" - P@{}: {:.3}", m.k, m.value);
}
for m in &ranking.recall_at_k {
println!(" - R@{}: {:.3}", m.k, m.value);
}
for m in &ranking.ndcg_at_k {
println!(" - nDCG@{}: {:.3}", m.k, m.value);
}
}

println!();
}

Expand All @@ -821,6 +863,26 @@ pub(crate) fn cmd_eval(
println!("{}", "Summary".cyan().bold());
println!(" Passed: {passed_count}/{total} ({pass_rate_pct:.0}%)");
println!(" Failed: {}/{}", total - passed_count, total);

if let Some(agg) = &aggregate {
println!();
println!("{}", "Ranking Metrics (aggregate)".cyan().bold());
println!(
" Questions with relevance data: {}",
agg.questions_with_relevance
);
println!(" Mean MRR: {:.3}", agg.mean_mrr);
for m in &agg.mean_precision_at_k {
println!(" Mean P@{}: {:.3}", m.k, m.value);
}
for m in &agg.mean_recall_at_k {
println!(" Mean R@{}: {:.3}", m.k, m.value);
}
for m in &agg.mean_ndcg_at_k {
println!(" Mean nDCG@{}: {:.3}", m.k, m.value);
}
}

println!();

if passed_count < total {
Expand Down
3 changes: 2 additions & 1 deletion src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -370,7 +370,8 @@ fn run() -> Result<(), Box<dyn std::error::Error>> {
questions,
index,
json,
} => cmd_eval(&questions, &index, json),
k,
} => cmd_eval(&questions, &index, json, &k),
Commands::Vocabulary {
index,
limit,
Expand Down
Loading
Loading