Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 8 additions & 7 deletions src/rag/__tests__/storage.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,12 @@ after(async () => {
await closePool();
});

test("searchKeyword uses the SAME ts config as the indexed tsv column (portuguese_unaccent)", async () => {
// The tsv GENERATED column is to_tsvector('portuguese_unaccent', text)
// (migrations 0001/0002). The query MUST use the same config; using
// 'portuguese' silently loses accent-insensitive matching on the query side.
// Runs WITHOUT a DB by capturing the SQL via an injected pool.
test("searchKeyword queries with the eval-winning 'portuguese' config (NOT portuguese_unaccent)", async () => {
// Counter-intuitive but eval-backed: the query config is 'portuguese', which
// does NOT match the tsv column's 'portuguese_unaccent'. The matched config was
// tried and REGRESSED the golden set (Recall@5 0.95 -> 0.83, MRR 0.73 -> 0.60).
// This test pins the decision so nobody "fixes" the mismatch without re-running
// `npm run eval`. Runs WITHOUT a DB by capturing the SQL via an injected pool.
let captured = "";
__setPoolForTest({
query: async (sql: unknown) => {
Expand All @@ -59,8 +60,8 @@ test("searchKeyword uses the SAME ts config as the indexed tsv column (portugues
} finally {
__setPoolForTest(null);
}
assert.match(captured, /plainto_tsquery\('portuguese_unaccent',\s*\$1\)/);
assert.doesNotMatch(captured, /plainto_tsquery\('portuguese',\s*\$1\)/);
assert.match(captured, /plainto_tsquery\('portuguese',\s*\$1\)/);
assert.doesNotMatch(captured, /plainto_tsquery\('portuguese_unaccent',\s*\$1\)/);
});

test("searchKeyword finds a doc by a real PT-BR term against the unaccent tsv (regression: ervilha)", async () => {
Expand Down
15 changes: 10 additions & 5 deletions src/rag/storage.ts
Original file line number Diff line number Diff line change
Expand Up @@ -567,17 +567,22 @@ export async function searchKeyword(
): Promise<{ chunk: Chunk; rank: number; score: number }[]> {
const p = getPool();
const filterClauses = buildFilterClauses(filters, 3);
// Query config MUST match the tsv GENERATED column config (portuguese_unaccent,
// migrations 0001/0002), otherwise accent-insensitive matching is lost here.
// Query config is 'portuguese' on purpose. It does NOT match the tsv column's
// 'portuguese_unaccent' config (migrations 0001/0002), but `npm run eval` showed
// that the matched config REGRESSES ranking on the golden set (Recall@5
// 0.95 -> 0.83, MRR 0.73 -> 0.60): unaccenting the query broadens the keyword
// leg, and RRF + rerank then rank worse on this corpus. Known limitation: the
// query side of keyword search is not accent-insensitive. Revisit this only with
// an eval run that shows a net gain; do not "fix" it to portuguese_unaccent
// without re-running the eval. (Diagnosis: June 2026.)
const sql = `
SELECT
id, source_type, source_id, workspace, db_name, parent_url, chunk_index,
text, metadata, source_updated,
ts_rank(tsv, plainto_tsquery('portuguese_unaccent', $1)) AS score
ts_rank(tsv, plainto_tsquery('portuguese', $1)) AS score
FROM brain_chunks
WHERE tsv @@ plainto_tsquery('portuguese_unaccent', $1)
WHERE tsv @@ plainto_tsquery('portuguese', $1)
${filterClauses.sql}
ORDER BY ts_rank(tsv, plainto_tsquery('portuguese_unaccent', $1)) DESC
ORDER BY ts_rank(tsv, plainto_tsquery('portuguese', $1)) DESC
LIMIT $2
`;
const { rows } = await p.query<QueryRow>(sql, [
Expand Down
Loading