From 09afb7ff57be39834c7b35a4c60bd0f44904640c Mon Sep 17 00:00:00 2001
From: bruno moniz <brunoomoniz@gmail.com>
Date: Thu, 18 Jun 2026 21:04:40 -0300
Subject: [PATCH] =?UTF-8?q?fix(rag):=20reverte=20config=20FTS=20p/=20'port?=
 =?UTF-8?q?uguese'=20=E2=80=94=20eval=20provou=20regress=C3=A3o?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

O PR #145 alinhou a config da query keyword com a do índice (portuguese_unaccent),
o que é teoricamente correto. Mas `npm run eval` na VPS mostrou REGRESSÃO no golden
set: Recall@5 0.95 -> 0.83, MRR 0.73 -> 0.60. Remover os acentos da query alarga o
leg keyword e, via RRF + rerank, piora o ranking neste corpus (ou seja, o "mismatch"
era load-bearing: o resultado do eval dependia dele).

Reverte as 3 ocorrências em searchKeyword para plainto_tsquery('portuguese', $1) e
fixa a decisão num teste + comentário (não re-"corrigir" sem rodar o eval de novo).
O fix do PR #145 nunca foi necessário para o caso 'ervilha' (uma palavra ASCII casa
nas duas configs); a causa real do 'ervilha vazio' era o header ausente no re-index,
correção essa que já foi feita e é mantida por este commit.

build+test verdes (1267/1267).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/rag/__tests__/storage.test.ts | 15 ++++++++-------
 src/rag/storage.ts                | 15 ++++++++++-----
 2 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/src/rag/__tests__/storage.test.ts b/src/rag/__tests__/storage.test.ts
index 4ed23be..7d175b0 100644
--- a/src/rag/__tests__/storage.test.ts
+++ b/src/rag/__tests__/storage.test.ts
@@ -42,11 +42,12 @@ after(async () => {
   await closePool();
 });
 
-test("searchKeyword uses the SAME ts config as the indexed tsv column (portuguese_unaccent)", async () => {
-  // The tsv GENERATED column is to_tsvector('portuguese_unaccent', text)
-  // (migrations 0001/0002). The query MUST use the same config; using
-  // 'portuguese' silently loses accent-insensitive matching on the query side.
-  // Runs WITHOUT a DB by capturing the SQL via an injected pool.
+test("searchKeyword queries with the eval-winning 'portuguese' config (NOT portuguese_unaccent)", async () => {
+  // Counter-intuitive but eval-backed: the query config is 'portuguese', which
+  // does NOT match the tsv column's 'portuguese_unaccent'. The matched config was
+  // tried and REGRESSED the golden set (Recall@5 0.95 -> 0.83, MRR 0.73 -> 0.60).
+  // This test pins the decision so nobody "fixes" the mismatch without re-running
+  // `npm run eval`. Runs WITHOUT a DB by capturing the SQL via an injected pool.
   let captured = "";
   __setPoolForTest({
     query: async (sql: unknown) => {
@@ -59,8 +60,8 @@ test("searchKeyword uses the SAME ts config as the indexed tsv column (portugues
   } finally {
     __setPoolForTest(null);
   }
-  assert.match(captured, /plainto_tsquery\('portuguese_unaccent',\s*\$1\)/);
-  assert.doesNotMatch(captured, /plainto_tsquery\('portuguese',\s*\$1\)/);
+  assert.match(captured, /plainto_tsquery\('portuguese',\s*\$1\)/);
+  assert.doesNotMatch(captured, /plainto_tsquery\('portuguese_unaccent',\s*\$1\)/);
 });
 
 test("searchKeyword finds a doc by a real PT-BR term against the unaccent tsv (regression: ervilha)", async () => {
diff --git a/src/rag/storage.ts b/src/rag/storage.ts
index 99ffecc..78c5e87 100644
--- a/src/rag/storage.ts
+++ b/src/rag/storage.ts
@@ -567,17 +567,22 @@ export async function searchKeyword(
 ): Promise<{ chunk: Chunk; rank: number; score: number }[]> {
   const p = getPool();
   const filterClauses = buildFilterClauses(filters, 3);
-  // Query config MUST match the tsv GENERATED column config (portuguese_unaccent,
-  // migrations 0001/0002), otherwise accent-insensitive matching is lost here.
+  // Query config is 'portuguese' on purpose. It does NOT match the tsv column's
+  // 'portuguese_unaccent' config — but `npm run eval` proved the matched config
+  // REGRESSES ranking on the golden set (Recall@5 0.95 -> 0.83, MRR 0.73 -> 0.60):
+  // unaccenting the query broadens the keyword leg, and RRF+rerank then rank worse
+  // on this corpus. The missing accent-insensitive matching on the query side is a
+  // known limitation; revisit it only WITH an eval that shows a net gain. Do not
+  // "fix" this to portuguese_unaccent without re-running the eval. (Jun 2026 diagnosis.)
   const sql = `
     SELECT
       id, source_type, source_id, workspace, db_name, parent_url, chunk_index,
       text, metadata, source_updated,
-      ts_rank(tsv, plainto_tsquery('portuguese_unaccent', $1)) AS score
+      ts_rank(tsv, plainto_tsquery('portuguese', $1)) AS score
     FROM brain_chunks
-    WHERE tsv @@ plainto_tsquery('portuguese_unaccent', $1)
+    WHERE tsv @@ plainto_tsquery('portuguese', $1)
       ${filterClauses.sql}
-    ORDER BY ts_rank(tsv, plainto_tsquery('portuguese_unaccent', $1)) DESC
+    ORDER BY ts_rank(tsv, plainto_tsquery('portuguese', $1)) DESC
     LIMIT $2
   `;
   const { rows } = await p.query<QueryRow>(sql, [