diff --git a/src/rag/__tests__/storage.test.ts b/src/rag/__tests__/storage.test.ts index 4ed23be..7d175b0 100644 --- a/src/rag/__tests__/storage.test.ts +++ b/src/rag/__tests__/storage.test.ts @@ -42,11 +42,12 @@ after(async () => { await closePool(); }); -test("searchKeyword uses the SAME ts config as the indexed tsv column (portuguese_unaccent)", async () => { - // The tsv GENERATED column is to_tsvector('portuguese_unaccent', text) - // (migrations 0001/0002). The query MUST use the same config; using - // 'portuguese' silently loses accent-insensitive matching on the query side. - // Runs WITHOUT a DB by capturing the SQL via an injected pool. +test("searchKeyword queries with the eval-winning 'portuguese' config (NOT portuguese_unaccent)", async () => { + // Counter-intuitive but eval-backed: the query config is 'portuguese', which + // does NOT match the tsv column's 'portuguese_unaccent'. The matched config was + // tried and REGRESSED the golden set (Recall@5 0.95 -> 0.83, MRR 0.73 -> 0.60). + // This test pins the decision so nobody "fixes" the mismatch without re-running + // `npm run eval`. Runs WITHOUT a DB by capturing the SQL via an injected pool. 
let captured = ""; __setPoolForTest({ query: async (sql: unknown) => { @@ -59,8 +60,8 @@ test("searchKeyword uses the SAME ts config as the indexed tsv column (portugues } finally { __setPoolForTest(null); } - assert.match(captured, /plainto_tsquery\('portuguese_unaccent',\s*\$1\)/); - assert.doesNotMatch(captured, /plainto_tsquery\('portuguese',\s*\$1\)/); + assert.match(captured, /plainto_tsquery\('portuguese',\s*\$1\)/); + assert.doesNotMatch(captured, /plainto_tsquery\('portuguese_unaccent',\s*\$1\)/); }); test("searchKeyword finds a doc by a real PT-BR term against the unaccent tsv (regression: ervilha)", async () => { diff --git a/src/rag/storage.ts b/src/rag/storage.ts index 99ffecc..78c5e87 100644 --- a/src/rag/storage.ts +++ b/src/rag/storage.ts @@ -567,17 +567,22 @@ export async function searchKeyword( ): Promise<{ chunk: Chunk; rank: number; score: number }[]> { const p = getPool(); const filterClauses = buildFilterClauses(filters, 3); - // Query config MUST match the tsv GENERATED column config (portuguese_unaccent, - // migrations 0001/0002), otherwise accent-insensitive matching is lost here. + // Query config is 'portuguese' on purpose. It does NOT match the tsv column's + // 'portuguese_unaccent' config — but `npm run eval` proved the matched config + // REGRESSES ranking on the golden set (Recall@5 0.95 -> 0.83, MRR 0.73 -> 0.60): + // unaccenting the query broadens the keyword leg, and RRF+rerank then rank worse + // on this corpus. Accent-insensitive keyword search is a known limitation to + // revisit only WITH an eval that shows a net gain. Do not "fix" to + // portuguese_unaccent without re-running the eval. (June 2026 diagnosis.)
const sql = ` SELECT id, source_type, source_id, workspace, db_name, parent_url, chunk_index, text, metadata, source_updated, - ts_rank(tsv, plainto_tsquery('portuguese_unaccent', $1)) AS score + ts_rank(tsv, plainto_tsquery('portuguese', $1)) AS score FROM brain_chunks - WHERE tsv @@ plainto_tsquery('portuguese_unaccent', $1) + WHERE tsv @@ plainto_tsquery('portuguese', $1) ${filterClauses.sql} - ORDER BY ts_rank(tsv, plainto_tsquery('portuguese_unaccent', $1)) DESC + ORDER BY ts_rank(tsv, plainto_tsquery('portuguese', $1)) DESC LIMIT $2 `; const { rows } = await p.query(sql, [