rahulrajaram · rahulrajaram · Mar 27, 2026 · Mar 27, 2026
diff --git a/README.md b/README.md
@@ -231,6 +231,7 @@ Run the evaluation harness against a test set of questions:
 
 ```bash
 yore eval --questions questions.jsonl --index docs/.index
+yore eval --questions questions.jsonl --index docs/.index --json --k 3,5,10
 ```
 
 ### 6.6 Link and structure analysis
@@ -652,7 +653,7 @@ yore mcp fetch-context ctx_d76396f763601873 \
 Evaluates the retrieval pipeline against a set of test questions.
 
 ```bash
-yore eval --questions <jsonl-file> --index <index-dir>
+yore eval --questions <jsonl-file> --index <index-dir> [--k <k1,k2,...>]
 ```
 
 Each line in the JSONL file represents a test question:
@@ -661,18 +662,26 @@ Each line in the JSONL file represents a test question:
 {"id": 1, "q": "How does auth work?", "expect": ["session", "token"], "min_hits": 2}
 ```
 
-Yore assembles context for each question, checks for expected substrings, and reports per‑question hits and an overall pass rate.
+To measure ranked retrieval quality, add `relevant_docs` to a question:
+
+```json
+{"id": 2, "q": "deployment steps", "expect": ["docker"], "relevant_docs": ["docs/guides/deployment.md"]}
+```
+
+Yore assembles context for each question, checks for expected substrings, and reports per‑question hits and an overall pass rate. When a question includes `relevant_docs`, Yore also computes precision@k, recall@k, MRR, and nDCG@k over the initial BM25 retrieval ranking. Questions without `relevant_docs` produce the same output as before, so existing question files remain backward compatible.
 
 **Key options**
 
 * `--questions` – Path to questions JSONL file (default: `questions.jsonl`)
 * `--index` – Index directory (default: `.yore`)
 * `--json` – Emit JSON output
+* `--k` – Comma‑separated k values for precision@k, recall@k, and nDCG@k (default: `5,10`)
 
 **Example**
 
 ```bash
 yore eval --questions questions.jsonl --index docs/.index
+yore eval --questions questions.jsonl --index docs/.index --json --k 3,5,10
 ```
 
 ---

diff --git a/src/cli.rs b/src/cli.rs
@@ -603,6 +603,10 @@ pub enum Commands {
         /// Output as JSON
         #[arg(long)]
         json: bool,
+
+        /// Values of k for precision@k, recall@k, nDCG@k (comma-separated)
+        #[arg(long, value_delimiter = ',', default_values_t = vec![5, 10])]
+        k: Vec<usize>,
     },
 
     /// Derive a deterministic vocabulary list from a built index.

diff --git a/src/commands_text.rs b/src/commands_text.rs
@@ -645,6 +645,7 @@ pub(crate) fn cmd_eval(
     questions_path: &Path,
     index_dir: &Path,
     json: bool,
+    k_values: &[usize],
 ) -> Result<(), Box<dyn std::error::Error>> {
     // Load questions from JSONL file
     let questions_content = fs::read_to_string(questions_path)?;
@@ -676,6 +677,9 @@ pub(crate) fn cmd_eval(
         // Run assemble internally (capture output as string)
         let primary_sections = search_relevant_sections(&question.q, &forward_index, 20);
 
+        // Compute ranked doc list from initial BM25 retrieval
+        let ranked_docs = unique_doc_ranking(&primary_sections);
+
         if primary_sections.is_empty() {
             results.push(EvalResult {
                 id: question.id,
@@ -684,6 +688,9 @@ pub(crate) fn cmd_eval(
                 total: question.expect.len(),
                 passed: false,
                 tokens: 0,
+                ranked_docs,
+                ranking: None,
+                digest: String::new(),
             });
             continue;
         }
@@ -742,13 +749,22 @@ pub(crate) fn cmd_eval(
         let passed = hits >= min_hits;
         let tokens = estimate_tokens(&digest);
 
+        // Compute ranking metrics if relevant_docs is provided
+        let ranking = question.relevant_docs.as_ref().map(|rel_docs| {
+            let relevant_set: HashSet<String> = rel_docs.iter().cloned().collect();
+            compute_ranking_metrics(&ranked_docs, &relevant_set, k_values)
+        });
+
         results.push(EvalResult {
             id: question.id,
             question: question.q.clone(),
             hits,
             total: question.expect.len(),
             passed,
             tokens,
+            ranked_docs,
+            ranking,
+            digest,
         });
     }
 
@@ -757,6 +773,15 @@ pub(crate) fn cmd_eval(
     let total = results.len();
     let pass_rate_pct = passed_count as f64 / total as f64 * 100.0;
 
+    // Compute aggregate ranking metrics across questions that have relevance data
+    let per_question_rankings: Vec<RankingMetrics> =
+        results.iter().filter_map(|r| r.ranking.clone()).collect();
+    let aggregate = if per_question_rankings.is_empty() {
+        None
+    } else {
+        Some(aggregate_ranking_metrics(&per_question_rankings, k_values))
+    };
+
     if json {
         let json_results: Vec<EvalQuestionResult> = results
             .iter()
@@ -766,14 +791,15 @@ pub(crate) fn cmd_eval(
                     .find(|q| q.id == r.id)
                     .map(|q| q.expect.clone())
                     .unwrap_or_default();
+                let digest_lower = r.digest.to_lowercase();
                 let found: Vec<String> = expected
                     .iter()
-                    .filter(|e| r.question.to_lowercase().contains(&e.to_lowercase()))
+                    .filter(|e| digest_lower.contains(&e.to_lowercase()))
                     .cloned()
                     .collect();
                 let missing: Vec<String> = expected
                     .iter()
-                    .filter(|e| !r.question.to_lowercase().contains(&e.to_lowercase()))
+                    .filter(|e| !digest_lower.contains(&e.to_lowercase()))
                     .cloned()
                     .collect();
                 EvalQuestionResult {
@@ -782,6 +808,7 @@ pub(crate) fn cmd_eval(
                     expected,
                     found,
                     missing,
+                    ranking: r.ranking.clone(),
                 }
             })
             .collect();
@@ -793,6 +820,7 @@ pub(crate) fn cmd_eval(
             failed: total - passed_count,
             pass_rate: pass_rate_pct,
             results: json_results,
+            ranking_metrics: aggregate,
         };
         println!("{}", serde_json::to_string_pretty(&output)?);
         return Ok(());
@@ -813,6 +841,20 @@ pub(crate) fn cmd_eval(
         println!("[{}] {}", result.id, result.question.white().bold());
         println!("  - hits: {}/{} {}", result.hits, result.total, status);
         println!("  - size: {} tokens", result.tokens);
+
+        if let Some(ranking) = &result.ranking {
+            println!("  - MRR: {:.3}", ranking.mrr);
+            for m in &ranking.precision_at_k {
+                println!("  - P@{}: {:.3}", m.k, m.value);
+            }
+            for m in &ranking.recall_at_k {
+                println!("  - R@{}: {:.3}", m.k, m.value);
+            }
+            for m in &ranking.ndcg_at_k {
+                println!("  - nDCG@{}: {:.3}", m.k, m.value);
+            }
+        }
+
         println!();
     }
 
@@ -821,6 +863,26 @@ pub(crate) fn cmd_eval(
     println!("{}", "Summary".cyan().bold());
     println!("  Passed: {passed_count}/{total} ({pass_rate_pct:.0}%)");
     println!("  Failed: {}/{}", total - passed_count, total);
+
+    if let Some(agg) = &aggregate {
+        println!();
+        println!("{}", "Ranking Metrics (aggregate)".cyan().bold());
+        println!(
+            "  Questions with relevance data: {}",
+            agg.questions_with_relevance
+        );
+        println!("  Mean MRR: {:.3}", agg.mean_mrr);
+        for m in &agg.mean_precision_at_k {
+            println!("  Mean P@{}: {:.3}", m.k, m.value);
+        }
+        for m in &agg.mean_recall_at_k {
+            println!("  Mean R@{}: {:.3}", m.k, m.value);
+        }
+        for m in &agg.mean_ndcg_at_k {
+            println!("  Mean nDCG@{}: {:.3}", m.k, m.value);
+        }
+    }
+
     println!();
 
     if passed_count < total {

diff --git a/src/main.rs b/src/main.rs
@@ -370,7 +370,8 @@ fn run() -> Result<(), Box<dyn std::error::Error>> {
             questions,
             index,
             json,
-        } => cmd_eval(&questions, &index, json),
+            k,
+        } => cmd_eval(&questions, &index, json, &k),
         Commands::Vocabulary {
             index,
             limit,