From 09afb7ff57be39834c7b35a4c60bd0f44904640c Mon Sep 17 00:00:00 2001
From: bruno moniz
Date: Thu, 18 Jun 2026 21:04:40 -0300
Subject: [PATCH] =?UTF-8?q?fix(rag):=20reverte=20config=20FTS=20p/=20'port?=
 =?UTF-8?q?uguese'=20=E2=80=94=20eval=20provou=20regress=C3=A3o?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

O PR #145 casou a config do query keyword com o índice
(portuguese_unaccent), teoricamente correto. Mas `npm run eval` na VPS
mostrou REGRESSÃO no golden set: Recall@5 0.95 -> 0.83, MRR 0.73 -> 0.60.
Unaccentuar o query alarga o leg keyword e, via RRF + rerank, piora o
ranking neste corpus (o "mismatch" era load-bearing).

Volta as 3 ocorrências para plainto_tsquery('portuguese', $1) e fixa a
decisão num teste + comentário (não re-"corrigir" sem rodar o eval). O fix
nunca foi necessário para o caso 'ervilha' (palavra ASCII casa nas duas
configs); a causa real do 'ervilha vazio' era o header ausente no
re-index, já corrigida e mantida.

build+test verdes (1267/1267).

Co-Authored-By: Claude Opus 4.8 (1M context)
---
 src/rag/__tests__/storage.test.ts | 15 ++++++++-------
 src/rag/storage.ts                | 15 ++++++++++-----
 2 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/src/rag/__tests__/storage.test.ts b/src/rag/__tests__/storage.test.ts
index 4ed23be..7d175b0 100644
--- a/src/rag/__tests__/storage.test.ts
+++ b/src/rag/__tests__/storage.test.ts
@@ -42,11 +42,12 @@ after(async () => {
   await closePool();
 });
 
-test("searchKeyword uses the SAME ts config as the indexed tsv column (portuguese_unaccent)", async () => {
-  // The tsv GENERATED column is to_tsvector('portuguese_unaccent', text)
-  // (migrations 0001/0002). The query MUST use the same config; using
-  // 'portuguese' silently loses accent-insensitive matching on the query side.
-  // Runs WITHOUT a DB by capturing the SQL via an injected pool.
+test("searchKeyword queries with the eval-winning 'portuguese' config (NOT portuguese_unaccent)", async () => {
+  // Counter-intuitive but eval-backed: the query config is 'portuguese', which
+  // does NOT match the tsv column's 'portuguese_unaccent'. The matched config was
+  // tried and REGRESSED the golden set (Recall@5 0.95 -> 0.83, MRR 0.73 -> 0.60).
+  // This test pins the decision so nobody "fixes" the mismatch without re-running
+  // `npm run eval`. Runs WITHOUT a DB by capturing the SQL via an injected pool.
   let captured = "";
   __setPoolForTest({
     query: async (sql: unknown) => {
@@ -59,8 +60,8 @@ test("searchKeyword uses the SAME ts config as the indexed tsv column (portugues
   } finally {
     __setPoolForTest(null);
   }
-  assert.match(captured, /plainto_tsquery\('portuguese_unaccent',\s*\$1\)/);
-  assert.doesNotMatch(captured, /plainto_tsquery\('portuguese',\s*\$1\)/);
+  assert.match(captured, /plainto_tsquery\('portuguese',\s*\$1\)/);
+  assert.doesNotMatch(captured, /plainto_tsquery\('portuguese_unaccent',\s*\$1\)/);
 });
 
 test("searchKeyword finds a doc by a real PT-BR term against the unaccent tsv (regression: ervilha)", async () => {
diff --git a/src/rag/storage.ts b/src/rag/storage.ts
index 99ffecc..78c5e87 100644
--- a/src/rag/storage.ts
+++ b/src/rag/storage.ts
@@ -567,17 +567,22 @@ export async function searchKeyword(
 ): Promise<{ chunk: Chunk; rank: number; score: number }[]> {
   const p = getPool();
   const filterClauses = buildFilterClauses(filters, 3);
-  // Query config MUST match the tsv GENERATED column config (portuguese_unaccent,
-  // migrations 0001/0002), otherwise accent-insensitive matching is lost here.
+  // Query config is 'portuguese' on purpose. It does NOT match the tsv column's
+  // 'portuguese_unaccent' config — but `npm run eval` proved the matched config
+  // REGRESSES ranking on the golden set (Recall@5 0.95 -> 0.83, MRR 0.73 -> 0.60):
+  // unaccenting the query broadens the keyword leg, and RRF+rerank then rank worse
+  // on this corpus. Accent-insensitive keyword search is a known limitation to
+  // revisit only WITH an eval that shows a net gain. Do not "fix" to
+  // portuguese_unaccent without re-running the eval. (jun/2026 diagnosis.)
   const sql = `
     SELECT id, source_type, source_id, workspace,
            db_name, parent_url, chunk_index, text,
            metadata, source_updated,
-           ts_rank(tsv, plainto_tsquery('portuguese_unaccent', $1)) AS score
+           ts_rank(tsv, plainto_tsquery('portuguese', $1)) AS score
     FROM brain_chunks
-    WHERE tsv @@ plainto_tsquery('portuguese_unaccent', $1)
+    WHERE tsv @@ plainto_tsquery('portuguese', $1)
       ${filterClauses.sql}
-    ORDER BY ts_rank(tsv, plainto_tsquery('portuguese_unaccent', $1)) DESC
+    ORDER BY ts_rank(tsv, plainto_tsquery('portuguese', $1)) DESC
     LIMIT $2
   `;
   const { rows } = await p.query(sql, [