Skip to content

Commit efade6e

Browse files
unamedkr and claude committed
perf: retrieval precision harness — BM25 normalization, coherence boost, score blending
Retrieval precision improvements:
- BM25 min-max normalization instead of 1.0 cap (preserves score discrimination)
- Document coherence boost: chunks from same document get collective boost (+5%/extra)
- Reranker score blending (0.7 reranker + 0.3 fusion) preserves triple-index signal

Generation quality improvements:
- Citation mapping handles [Source N] format, validates range, logs out-of-range
- Sentence-boundary-aware context truncation (Korean + English)

Fact verification improvements:
- Finance metric cross-check: detects numeric mismatches between answer and facts
- Graduated logging: warning level for 2+ hallucinations, info for 1

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent f42fe42 commit efade6e

File tree

5 files changed

+162
-42
lines changed

5 files changed

+162
-42
lines changed

quantumrag/core/generate/fact_verifier.py

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -169,16 +169,38 @@ def verify_against_facts(
169169
if metric and value:
170170
fact_values[metric] = value
171171

172+
# Cross-check: if answer mentions a metric with a different value
173+
for metric, fact_value in fact_values.items():
174+
if metric in answer:
175+
# Extract numbers near the metric mention in the answer
176+
metric_idx = answer.find(metric)
177+
window = answer[max(0, metric_idx - 50) : metric_idx + len(metric) + 100]
178+
answer_amounts = _AMOUNT_RE.findall(window)
179+
fact_amounts = _AMOUNT_RE.findall(fact_value)
180+
if answer_amounts and fact_amounts:
181+
# Normalize: strip commas for comparison
182+
answer_num = answer_amounts[0].replace(",", "")
183+
fact_num = fact_amounts[0].replace(",", "")
184+
if answer_num != fact_num:
185+
warnings.append(
186+
f"'{metric}' 수치 불일치: 답변 '{answer_amounts[0]}' vs "
187+
f"팩트 '{fact_amounts[0]}'"
188+
)
189+
hallucinated.append(metric)
190+
172191
if warnings:
173-
logger.info(
174-
"fact_verification_failed",
192+
log_level = "warning" if len(hallucinated) >= 2 else "info"
193+
getattr(logger, log_level)(
194+
"fact_verification_issues",
175195
warnings=warnings,
176196
hallucinated=hallucinated,
197+
count=len(hallucinated),
177198
)
178199

179-
# Conservative threshold: only flag as invalid when multiple
180-
# hallucinations detected. A single unknown entity is more likely
181-
# a Fact-Index gap than a real hallucination.
200+
# Graduated threshold:
201+
# - 0 hallucinations: valid, no warnings
202+
# - 1 hallucination: valid but with warnings (logged for monitoring)
203+
# - 2+ hallucinations: invalid, triggers re-generation
182204
return VerificationResult(
183205
is_valid=len(hallucinated) < 2,
184206
warnings=warnings,

quantumrag/core/generate/generator.py

Lines changed: 45 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,9 @@ def _build_context(self, chunks: list[ScoredChunk]) -> str:
232232
if parts and total_chars + len(part) > budget:
233233
remaining = budget - total_chars - len(header) - 10
234234
if remaining > 100:
235-
part = f"{header}\n{content[:remaining]}..."
235+
# Truncate at sentence boundary instead of mid-word
236+
truncated = _truncate_at_sentence(content, remaining)
237+
part = f"{header}\n{truncated}"
236238
parts.append(part)
237239
break
238240
parts.append(part)
@@ -279,13 +281,24 @@ def _clean_answer(self, text: str) -> str:
279281
return text.strip()
280282

281283
def _map_citations(self, answer: str, sources: list[Source]) -> list[Source]:
282-
"""Map [1], [2] citations in answer to sources."""
283-
cited_numbers = set(int(m) for m in re.findall(r"\[(\d+)\]", answer))
284-
cited_sources = []
285-
for i, src in enumerate(sources, 1):
286-
if i in cited_numbers:
287-
cited_sources.append(src)
288-
# If no citations found, include all sources
284+
"""Map [1], [2] or [Source 1] citations in answer to sources."""
285+
# Match both [1] and [Source 1] formats
286+
cited_numbers = set(int(m) for m in re.findall(r"\[(?:Source\s+)?(\d+)\]", answer))
287+
if not cited_numbers:
288+
return sources # No citations found → return all
289+
290+
# Filter out-of-range citations
291+
max_valid = len(sources)
292+
cited_sources = [
293+
src for i, src in enumerate(sources, 1) if i in cited_numbers and i <= max_valid
294+
]
295+
out_of_range = cited_numbers - set(range(1, max_valid + 1))
296+
if out_of_range:
297+
logger.warning(
298+
"citation_out_of_range",
299+
cited=sorted(out_of_range),
300+
max_sources=max_valid,
301+
)
289302
return cited_sources if cited_sources else sources
290303

291304
def _insufficient_evidence(
@@ -308,6 +321,30 @@ def _get_insufficient_text(self, query: str, n_docs: int) -> str:
308321
return _INSUFFICIENT_TEMPLATE_EN.format(n_docs=n_docs)
309322

310323

324+
def _truncate_at_sentence(text: str, max_chars: int) -> str:
325+
"""Truncate text at the last sentence boundary within max_chars.
326+
327+
Avoids cutting mid-word or mid-sentence, which can confuse the LLM.
328+
Handles Korean (다/요/음/임 endings) and English (./!/?).
329+
"""
330+
if len(text) <= max_chars:
331+
return text
332+
# Cut to max_chars, then find last sentence boundary
333+
cut = text[:max_chars]
334+
# Look for sentence-ending patterns (Korean & English)
335+
# Search backwards from the cut point
336+
last_boundary = -1
337+
for m in re.finditer(r"[.!?。]\s|[다요음임니까]\.\s|[다요음임니까]\s", cut):
338+
last_boundary = m.end()
339+
if last_boundary > max_chars * 0.3: # At least 30% of content preserved
340+
return cut[:last_boundary].rstrip() + "..."
341+
# Fallback: cut at last whitespace
342+
last_space = cut.rfind(" ")
343+
if last_space > max_chars * 0.3:
344+
return cut[:last_space].rstrip() + "..."
345+
return cut.rstrip() + "..."
346+
347+
311348
def _format_fact_block(facts: list[dict[str, Any]] | None) -> str:
312349
"""Format structured facts into a verified-data block for context injection.
313350

quantumrag/core/retrieve/fusion.py

Lines changed: 73 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ def __init__(
4141
# LRU embedding cache — avoids redundant embedding calls for similar queries
4242
self._embed_cache: dict[str, list[float]] = {}
4343
self._cache_max = 32
44+
# Chunk→document mapping built during search for coherence boost
45+
self._chunk_doc_map: dict[str, str] = {}
4446

4547
async def search(
4648
self,
@@ -89,6 +91,17 @@ async def search(
8991
original_task, hype_task, bm25_task
9092
)
9193

94+
# Build chunk→document map from search results for coherence boost
95+
self._chunk_doc_map.clear()
96+
for result in original_results:
97+
doc_id = (result.metadata or {}).get("document_id", "")
98+
if doc_id:
99+
self._chunk_doc_map[result.id] = doc_id
100+
for result in bm25_results: # type: ignore[union-attr]
101+
doc_id = (getattr(result, "metadata", None) or {}).get("document_id", "")
102+
if doc_id:
103+
self._chunk_doc_map[result.id] = doc_id
104+
92105
# Map HyPE results back to chunk IDs
93106
hype_chunk_results = self._map_hype_to_chunks(hype_results)
94107

@@ -197,10 +210,25 @@ def _reciprocal_rank_fusion(
197210
raw_score = max(result.score, 0.0)
198211
scores[result.id] = scores.get(result.id, 0.0) + w_hype * raw_score / (k + rank + 1)
199212

213+
# Normalize BM25 scores to [0,1] using min-max within this result set
214+
# instead of capping at 1.0 (which loses signal discrimination)
215+
bm25_scores = [
216+
max(getattr(r, "score", 1.0), 0.0)
217+
for r in bm25 # type: ignore[union-attr]
218+
]
219+
bm25_max = max(bm25_scores) if bm25_scores else 1.0
220+
bm25_min = min(bm25_scores) if bm25_scores else 0.0
221+
bm25_range = bm25_max - bm25_min if bm25_max > bm25_min else 1.0
222+
200223
for rank, result in enumerate(bm25): # type: ignore[assignment]
201-
# BM25 scores are not normalized to [0,1], so cap at 1.0
202-
raw_score = min(max(result.score, 0.0), 1.0) if hasattr(result, "score") else 1.0
203-
scores[result.id] = scores.get(result.id, 0.0) + w_bm25 * raw_score / (k + rank + 1)
224+
raw = max(getattr(result, "score", 1.0), 0.0)
225+
# Min-max normalization preserves relative differences
226+
# When all scores are identical (or single result), use raw/max
227+
if bm25_max > bm25_min:
228+
normalized = (raw - bm25_min) / bm25_range
229+
else:
230+
normalized = min(raw / bm25_max, 1.0) if bm25_max > 0 else 1.0
231+
scores[result.id] = scores.get(result.id, 0.0) + w_bm25 * normalized / (k + rank + 1)
204232

205233
# Normalize to [0, 1]
206234
max_score = max(scores.values()) if scores else 1.0
@@ -217,29 +245,52 @@ def _apply_document_coherence_boost(
217245
) -> list[tuple[str, float]]:
218246
"""Boost scores when multiple chunks from the same document appear in results.
219247
220-
If a document has N chunks in the top candidates, each gets a boost
221-
proportional to N. This helps when the correct answer is spread across
222-
multiple chunks of the same source document.
248+
If a document has N chunks in the top candidates, each gets a small
249+
boost proportional to N. This helps when the correct answer is spread
250+
across multiple chunks of the same source document.
251+
252+
Uses chunk metadata from BM25 results (which carry document_id) and
253+
the vector store's stored metadata to map chunk_id → document_id.
254+
Falls back to a prefix heuristic when metadata is unavailable.
223255
"""
224-
# Count chunks per document in the top candidates (look at 3x top_k)
225-
candidate_pool = fused[: top_k * 3]
226-
doc_counts: dict[str, int] = {}
256+
if len(fused) < 2:
257+
return fused
258+
259+
candidate_pool = fused[: top_k * 4]
227260
chunk_to_doc: dict[str, str] = {}
228261

262+
# Try to resolve document_id via document_store metadata
229263
for chunk_id, _ in candidate_pool:
230-
# chunk_id contains document info — extract via document_store
231-
# Use a simple heuristic: chunk IDs from same ingest share a doc prefix
232-
# The actual document_id is stored in chunk metadata, but we don't have
233-
# it here. Instead, use the score pattern: if multiple chunks score well,
234-
# they likely share a document.
235-
doc_counts[chunk_id] = doc_counts.get(chunk_id, 0)
236-
237-
# Without document_id in the fusion result, we can't do document-level
238-
# grouping here. Instead, apply a simpler heuristic: if a chunk_id
239-
# appears in multiple indexes (original + hype + bm25), it gets a
240-
# natural boost from RRF already. The score-weighting above handles
241-
# the precision issue. Return unchanged.
242-
return fused
264+
# Use stored _chunk_doc_map if available (populated during search)
265+
doc_id = self._chunk_doc_map.get(chunk_id, "")
266+
if not doc_id:
267+
# Heuristic fallback: first 12 chars of chunk_id often share
268+
# a common prefix for chunks from the same document, but this
269+
# is unreliable. Skip boost for unknown chunks.
270+
continue
271+
chunk_to_doc[chunk_id] = doc_id
272+
273+
if not chunk_to_doc:
274+
return fused
275+
276+
# Count chunks per document in candidate pool
277+
doc_counts: dict[str, int] = {}
278+
for doc_id in chunk_to_doc.values():
279+
doc_counts[doc_id] = doc_counts.get(doc_id, 0) + 1
280+
281+
# Apply boost: +5% per additional chunk from same document (max +20%)
282+
boosted: list[tuple[str, float]] = []
283+
for chunk_id, score in fused:
284+
doc_id = chunk_to_doc.get(chunk_id, "")
285+
if doc_id and doc_counts.get(doc_id, 1) > 1:
286+
n_extra = min(doc_counts[doc_id] - 1, 4) # Cap at 4 extras
287+
boost = 1.0 + n_extra * 0.05
288+
boosted.append((chunk_id, score * boost))
289+
else:
290+
boosted.append((chunk_id, score))
291+
292+
boosted.sort(key=lambda x: x[1], reverse=True)
293+
return boosted
243294

244295

245296
class ScoredChunk:

quantumrag/core/retrieve/reranker.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -60,12 +60,14 @@ async def rerank(
6060
request = RerankRequest(query=query, passages=passages)
6161
results = ranker.rerank(request)
6262

63-
# Map back to ScoredChunks with new scores
63+
# Blend reranker scores with original fusion scores (0.7/0.3)
64+
# to preserve signal from triple-index agreement
6465
reranked = []
6566
for result in results[:top_k]:
6667
idx = int(result["id"])
6768
original = chunks[idx]
68-
reranked.append(ScoredChunk(chunk=original.chunk, score=result["score"]))
69+
blended = 0.7 * result["score"] + 0.3 * original.score
70+
reranked.append(ScoredChunk(chunk=original.chunk, score=blended))
6971

7072
logger.debug("reranked", query_len=len(query), input=len(chunks), output=len(reranked))
7173
return reranked
@@ -122,7 +124,8 @@ async def rerank(
122124
reranked: list[ScoredChunk] = []
123125
for result in response.results:
124126
idx = result.index
125-
reranked.append(ScoredChunk(chunk=chunks[idx].chunk, score=result.relevance_score))
127+
blended = 0.7 * result.relevance_score + 0.3 * chunks[idx].score
128+
reranked.append(ScoredChunk(chunk=chunks[idx].chunk, score=blended))
126129

127130
logger.debug(
128131
"cohere_reranked", query_len=len(query), input=len(chunks), output=len(reranked)
@@ -184,10 +187,11 @@ async def rerank(
184187
reranked: list[ScoredChunk] = []
185188
for result in data["results"]:
186189
idx = result["index"]
190+
blended = 0.7 * result["relevance_score"] + 0.3 * chunks[idx].score
187191
reranked.append(
188192
ScoredChunk(
189193
chunk=chunks[idx].chunk,
190-
score=result["relevance_score"],
194+
score=blended,
191195
)
192196
)
193197

@@ -247,7 +251,10 @@ async def rerank(
247251

248252
indexed = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)
249253
reranked = [
250-
ScoredChunk(chunk=chunks[idx].chunk, score=float(score))
254+
ScoredChunk(
255+
chunk=chunks[idx].chunk,
256+
score=0.7 * float(score) + 0.3 * chunks[idx].score,
257+
)
251258
for idx, score in indexed[:top_k]
252259
]
253260

tests/unit/test_rerankers_and_budget.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,9 @@ async def test_rerank_calls_api_and_returns_reranked(self) -> None:
6464

6565
assert len(result) == 2
6666
assert result[0].chunk.id == "chunk-3"
67-
assert result[0].score == pytest.approx(0.95)
67+
# Score is blended: 0.7 * reranker_score + 0.3 * original_score
68+
# chunk-3 original=0.7, reranker=0.95 → 0.7*0.95 + 0.3*0.7 = 0.875
69+
assert result[0].score == pytest.approx(0.875)
6870
assert result[1].chunk.id == "chunk-1"
6971

7072
mock_client.rerank.assert_called_once()
@@ -134,7 +136,8 @@ async def test_rerank_calls_api_and_returns_reranked(self) -> None:
134136

135137
assert len(result) == 2
136138
assert result[0].chunk.id == "chunk-2"
137-
assert result[0].score == pytest.approx(0.99)
139+
# Blended: 0.7 * 0.99 + 0.3 * 0.8 = 0.933
140+
assert result[0].score == pytest.approx(0.933)
138141
assert result[1].chunk.id == "chunk-0"
139142

140143
@pytest.mark.asyncio

0 commit comments

Comments
 (0)