Skip to content

Commit efade6e

Browse files
unamedkr and claude committed
perf: retrieval precision harness — BM25 normalization, coherence boost, score blending
Retrieval precision improvements:
- BM25 min-max normalization instead of 1.0 cap (preserves score discrimination)
- Document coherence boost: chunks from same document get collective boost (+5%/extra)
- Reranker score blending (0.7 reranker + 0.3 fusion) preserves triple-index signal

Generation quality improvements:
- Citation mapping handles [Source N] format, validates range, logs out-of-range
- Sentence-boundary-aware context truncation (Korean + English)

Fact verification improvements:
- Finance metric cross-check: detects numeric mismatches between answer and facts
- Graduated logging: warning level for 2+ hallucinations, info for 1

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent f42fe42 commit efade6e

File tree

5 files changed

+162
-42
lines changed

5 files changed

+162
-42
lines changed

quantumrag/core/generate/fact_verifier.py

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -169,16 +169,38 @@ def verify_against_facts(
169169
if metric and value:
170170
fact_values[metric] = value
171171

172+
# Cross-check: if answer mentions a metric with a different value
173+
for metric, fact_value in fact_values.items():
174+
if metric in answer:
175+
# Extract numbers near the metric mention in the answer
176+
metric_idx = answer.find(metric)
177+
window = answer[max(0, metric_idx - 50) : metric_idx + len(metric) + 100]
178+
answer_amounts = _AMOUNT_RE.findall(window)
179+
fact_amounts = _AMOUNT_RE.findall(fact_value)
180+
if answer_amounts and fact_amounts:
181+
# Normalize: strip commas for comparison
182+
answer_num = answer_amounts[0].replace(",", "")
183+
fact_num = fact_amounts[0].replace(",", "")
184+
if answer_num != fact_num:
185+
warnings.append(
186+
f"'{metric}' 수치 불일치: 답변 '{answer_amounts[0]}' vs "
187+
f"팩트 '{fact_amounts[0]}'"
188+
)
189+
hallucinated.append(metric)
190+
172191
if warnings:
173-
logger.info(
174-
"fact_verification_failed",
192+
log_level = "warning" if len(hallucinated) >= 2 else "info"
193+
getattr(logger, log_level)(
194+
"fact_verification_issues",
175195
warnings=warnings,
176196
hallucinated=hallucinated,
197+
count=len(hallucinated),
177198
)
178199

179-
# Conservative threshold: only flag as invalid when multiple
180-
# hallucinations detected. A single unknown entity is more likely
181-
# a Fact-Index gap than a real hallucination.
200+
# Graduated threshold:
201+
# - 0 hallucinations: valid, no warnings
202+
# - 1 hallucination: valid but with warnings (logged for monitoring)
203+
# - 2+ hallucinations: invalid, triggers re-generation
182204
return VerificationResult(
183205
is_valid=len(hallucinated) < 2,
184206
warnings=warnings,

quantumrag/core/generate/generator.py

Lines changed: 45 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,9 @@ def _build_context(self, chunks: list[ScoredChunk]) -> str:
232232
if parts and total_chars + len(part) > budget:
233233
remaining = budget - total_chars - len(header) - 10
234234
if remaining > 100:
235-
part = f"{header}\n{content[:remaining]}..."
235+
# Truncate at sentence boundary instead of mid-word
236+
truncated = _truncate_at_sentence(content, remaining)
237+
part = f"{header}\n{truncated}"
236238
parts.append(part)
237239
break
238240
parts.append(part)
@@ -279,13 +281,24 @@ def _clean_answer(self, text: str) -> str:
279281
return text.strip()
280282

281283
def _map_citations(self, answer: str, sources: list[Source]) -> list[Source]:
282-
"""Map [1], [2] citations in answer to sources."""
283-
cited_numbers = set(int(m) for m in re.findall(r"\[(\d+)\]", answer))
284-
cited_sources = []
285-
for i, src in enumerate(sources, 1):
286-
if i in cited_numbers:
287-
cited_sources.append(src)
288-
# If no citations found, include all sources
284+
"""Map [1], [2] or [Source 1] citations in answer to sources."""
285+
# Match both [1] and [Source 1] formats
286+
cited_numbers = set(int(m) for m in re.findall(r"\[(?:Source\s+)?(\d+)\]", answer))
287+
if not cited_numbers:
288+
return sources # No citations found → return all
289+
290+
# Filter out-of-range citations
291+
max_valid = len(sources)
292+
cited_sources = [
293+
src for i, src in enumerate(sources, 1) if i in cited_numbers and i <= max_valid
294+
]
295+
out_of_range = cited_numbers - set(range(1, max_valid + 1))
296+
if out_of_range:
297+
logger.warning(
298+
"citation_out_of_range",
299+
cited=sorted(out_of_range),
300+
max_sources=max_valid,
301+
)
289302
return cited_sources if cited_sources else sources
290303

291304
def _insufficient_evidence(
@@ -308,6 +321,30 @@ def _get_insufficient_text(self, query: str, n_docs: int) -> str:
308321
return _INSUFFICIENT_TEMPLATE_EN.format(n_docs=n_docs)
309322

310323

324+
def _truncate_at_sentence(text: str, max_chars: int) -> str:
325+
"""Truncate text at the last sentence boundary within max_chars.
326+
327+
Avoids cutting mid-word or mid-sentence, which can confuse the LLM.
328+
Handles Korean (다/요/음/임 endings) and English (./!/?).
329+
"""
330+
if len(text) <= max_chars:
331+
return text
332+
# Cut to max_chars, then find last sentence boundary
333+
cut = text[:max_chars]
334+
# Look for sentence-ending patterns (Korean & English)
335+
# Search backwards from the cut point
336+
last_boundary = -1
337+
for m in re.finditer(r"[.!?。]\s|[다요음임니까]\.\s|[다요음임니까]\s", cut):
338+
last_boundary = m.end()
339+
if last_boundary > max_chars * 0.3: # At least 30% of content preserved
340+
return cut[:last_boundary].rstrip() + "..."
341+
# Fallback: cut at last whitespace
342+
last_space = cut.rfind(" ")
343+
if last_space > max_chars * 0.3:
344+
return cut[:last_space].rstrip() + "..."
345+
return cut.rstrip() + "..."
346+
347+
311348
def _format_fact_block(facts: list[dict[str, Any]] | None) -> str:
312349
"""Format structured facts into a verified-data block for context injection.
313350

quantumrag/core/retrieve/fusion.py

Lines changed: 73 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ def __init__(
4141
# LRU embedding cache — avoids redundant embedding calls for similar queries
4242
self._embed_cache: dict[str, list[float]] = {}
4343
self._cache_max = 32
44+
# Chunk→document mapping built during search for coherence boost
45+
self._chunk_doc_map: dict[str, str] = {}
4446

4547
async def search(
4648
self,
@@ -89,6 +91,17 @@ async def search(
8991
original_task, hype_task, bm25_task
9092
)
9193

94+
# Build chunk→document map from search results for coherence boost
95+
self._chunk_doc_map.clear()
96+
for result in original_results:
97+
doc_id = (result.metadata or {}).get("document_id", "")
98+
if doc_id:
99+
self._chunk_doc_map[result.id] = doc_id
100+
for result in bm25_results: # type: ignore[union-attr]
101+
doc_id = (getattr(result, "metadata", None) or {}).get("document_id", "")
102+
if doc_id:
103+
self._chunk_doc_map[result.id] = doc_id
104+
92105
# Map HyPE results back to chunk IDs
93106
hype_chunk_results = self._map_hype_to_chunks(hype_results)
94107

@@ -197,10 +210,25 @@ def _reciprocal_rank_fusion(
197210
raw_score = max(result.score, 0.0)
198211
scores[result.id] = scores.get(result.id, 0.0) + w_hype * raw_score / (k + rank + 1)
199212

213+
# Normalize BM25 scores to [0,1] using min-max within this result set
214+
# instead of capping at 1.0 (which loses signal discrimination)
215+
bm25_scores = [
216+
max(getattr(r, "score", 1.0), 0.0)
217+
for r in bm25 # type: ignore[union-attr]
218+
]
219+
bm25_max = max(bm25_scores) if bm25_scores else 1.0
220+
bm25_min = min(bm25_scores) if bm25_scores else 0.0
221+
bm25_range = bm25_max - bm25_min if bm25_max > bm25_min else 1.0
222+
200223
for rank, result in enumerate(bm25): # type: ignore[assignment]
201-
# BM25 scores are not normalized to [0,1], so cap at 1.0
202-
raw_score = min(max(result.score, 0.0), 1.0) if hasattr(result, "score") else 1.0
203-
scores[result.id] = scores.get(result.id, 0.0) + w_bm25 * raw_score / (k + rank + 1)
224+
raw = max(getattr(result, "score", 1.0), 0.0)
225+
# Min-max normalization preserves relative differences
226+
# When all scores are identical (or single result), use raw/max
227+
if bm25_max > bm25_min:
228+
normalized = (raw - bm25_min) / bm25_range
229+
else:
230+
normalized = min(raw / bm25_max, 1.0) if bm25_max > 0 else 1.0
231+
scores[result.id] = scores.get(result.id, 0.0) + w_bm25 * normalized / (k + rank + 1)
204232

205233
# Normalize to [0, 1]
206234
max_score = max(scores.values()) if scores else 1.0
@@ -217,29 +245,52 @@ def _apply_document_coherence_boost(
217245
) -> list[tuple[str, float]]:
218246
"""Boost scores when multiple chunks from the same document appear in results.
219247
220-
If a document has N chunks in the top candidates, each gets a boost
221-
proportional to N. This helps when the correct answer is spread across
222-
multiple chunks of the same source document.
248+
If a document has N chunks in the top candidates, each gets a small
249+
boost proportional to N. This helps when the correct answer is spread
250+
across multiple chunks of the same source document.
251+
252+
Uses chunk metadata from BM25 results (which carry document_id) and
253+
the vector store's stored metadata to map chunk_id → document_id.
254+
Falls back to a prefix heuristic when metadata is unavailable.
223255
"""
224-
# Count chunks per document in the top candidates (look at 3x top_k)
225-
candidate_pool = fused[: top_k * 3]
226-
doc_counts: dict[str, int] = {}
256+
if len(fused) < 2:
257+
return fused
258+
259+
candidate_pool = fused[: top_k * 4]
227260
chunk_to_doc: dict[str, str] = {}
228261

262+
# Try to resolve document_id via document_store metadata
229263
for chunk_id, _ in candidate_pool:
230-
# chunk_id contains document info — extract via document_store
231-
# Use a simple heuristic: chunk IDs from same ingest share a doc prefix
232-
# The actual document_id is stored in chunk metadata, but we don't have
233-
# it here. Instead, use the score pattern: if multiple chunks score well,
234-
# they likely share a document.
235-
doc_counts[chunk_id] = doc_counts.get(chunk_id, 0)
236-
237-
# Without document_id in the fusion result, we can't do document-level
238-
# grouping here. Instead, apply a simpler heuristic: if a chunk_id
239-
# appears in multiple indexes (original + hype + bm25), it gets a
240-
# natural boost from RRF already. The score-weighting above handles
241-
# the precision issue. Return unchanged.
242-
return fused
264+
# Use stored _chunk_doc_map if available (populated during search)
265+
doc_id = self._chunk_doc_map.get(chunk_id, "")
266+
if not doc_id:
267+
# Heuristic fallback: first 12 chars of chunk_id often share
268+
# a common prefix for chunks from the same document, but this
269+
# is unreliable. Skip boost for unknown chunks.
270+
continue
271+
chunk_to_doc[chunk_id] = doc_id
272+
273+
if not chunk_to_doc:
274+
return fused
275+
276+
# Count chunks per document in candidate pool
277+
doc_counts: dict[str, int] = {}
278+
for doc_id in chunk_to_doc.values():
279+
doc_counts[doc_id] = doc_counts.get(doc_id, 0) + 1
280+
281+
# Apply boost: +5% per additional chunk from same document (max +20%)
282+
boosted: list[tuple[str, float]] = []
283+
for chunk_id, score in fused:
284+
doc_id = chunk_to_doc.get(chunk_id, "")
285+
if doc_id and doc_counts.get(doc_id, 1) > 1:
286+
n_extra = min(doc_counts[doc_id] - 1, 4) # Cap at 4 extras
287+
boost = 1.0 + n_extra * 0.05
288+
boosted.append((chunk_id, score * boost))
289+
else:
290+
boosted.append((chunk_id, score))
291+
292+
boosted.sort(key=lambda x: x[1], reverse=True)
293+
return boosted
243294

244295

245296
class ScoredChunk:

quantumrag/core/retrieve/reranker.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -60,12 +60,14 @@ async def rerank(
6060
request = RerankRequest(query=query, passages=passages)
6161
results = ranker.rerank(request)
6262

63-
# Map back to ScoredChunks with new scores
63+
# Blend reranker scores with original fusion scores (0.7/0.3)
64+
# to preserve signal from triple-index agreement
6465
reranked = []
6566
for result in results[:top_k]:
6667
idx = int(result["id"])
6768
original = chunks[idx]
68-
reranked.append(ScoredChunk(chunk=original.chunk, score=result["score"]))
69+
blended = 0.7 * result["score"] + 0.3 * original.score
70+
reranked.append(ScoredChunk(chunk=original.chunk, score=blended))
6971

7072
logger.debug("reranked", query_len=len(query), input=len(chunks), output=len(reranked))
7173
return reranked
@@ -122,7 +124,8 @@ async def rerank(
122124
reranked: list[ScoredChunk] = []
123125
for result in response.results:
124126
idx = result.index
125-
reranked.append(ScoredChunk(chunk=chunks[idx].chunk, score=result.relevance_score))
127+
blended = 0.7 * result.relevance_score + 0.3 * chunks[idx].score
128+
reranked.append(ScoredChunk(chunk=chunks[idx].chunk, score=blended))
126129

127130
logger.debug(
128131
"cohere_reranked", query_len=len(query), input=len(chunks), output=len(reranked)
@@ -184,10 +187,11 @@ async def rerank(
184187
reranked: list[ScoredChunk] = []
185188
for result in data["results"]:
186189
idx = result["index"]
190+
blended = 0.7 * result["relevance_score"] + 0.3 * chunks[idx].score
187191
reranked.append(
188192
ScoredChunk(
189193
chunk=chunks[idx].chunk,
190-
score=result["relevance_score"],
194+
score=blended,
191195
)
192196
)
193197

@@ -247,7 +251,10 @@ async def rerank(
247251

248252
indexed = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)
249253
reranked = [
250-
ScoredChunk(chunk=chunks[idx].chunk, score=float(score))
254+
ScoredChunk(
255+
chunk=chunks[idx].chunk,
256+
score=0.7 * float(score) + 0.3 * chunks[idx].score,
257+
)
251258
for idx, score in indexed[:top_k]
252259
]
253260

tests/unit/test_rerankers_and_budget.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,9 @@ async def test_rerank_calls_api_and_returns_reranked(self) -> None:
6464

6565
assert len(result) == 2
6666
assert result[0].chunk.id == "chunk-3"
67-
assert result[0].score == pytest.approx(0.95)
67+
# Score is blended: 0.7 * reranker_score + 0.3 * original_score
68+
# chunk-3 original=0.7, reranker=0.95 → 0.7*0.95 + 0.3*0.7 = 0.875
69+
assert result[0].score == pytest.approx(0.875)
6870
assert result[1].chunk.id == "chunk-1"
6971

7072
mock_client.rerank.assert_called_once()
@@ -134,7 +136,8 @@ async def test_rerank_calls_api_and_returns_reranked(self) -> None:
134136

135137
assert len(result) == 2
136138
assert result[0].chunk.id == "chunk-2"
137-
assert result[0].score == pytest.approx(0.99)
139+
# Blended: 0.7 * 0.99 + 0.3 * 0.8 = 0.933
140+
assert result[0].score == pytest.approx(0.933)
138141
assert result[1].chunk.id == "chunk-0"
139142

140143
@pytest.mark.asyncio

0 commit comments

Comments
 (0)