From 87beda77d9d7bfd605e7911497b328866e811e88 Mon Sep 17 00:00:00 2001 From: bruno moniz Date: Thu, 18 Jun 2026 20:46:37 -0300 Subject: [PATCH 1/2] =?UTF-8?q?chore(ci):=20job=20db-test=20com=20Postgres?= =?UTF-8?q?=20real=20+=20regress=C3=A3o=20FTS=20e=20paridade=20da=20instru?= =?UTF-8?q?=C3=A7=C3=A3o?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A suíte tinha 1248+ testes mas o CI rodava `npm test` SEM Postgres/Voyage: a fatia DB-dependente se auto-pulava (cobertura fantasma). Foi exatamente o que deixou o bug do FTS (config divergente) passar despercebido. - ci.yml: novo job `db-test` (não obrigatório ainda) com service pgvector:pg16 + `npm run migrate` (cria unaccent + a config portuguese_unaccent) rodando a suíte de storage contra Postgres real. Promover a obrigatório quando estável. - storage.test.ts: teste de regressão real `searchKeyword('ervilha')` recupera o doc (gateado por HAS_PG; roda no db-test). Guarda o "ervilha vazio". - mcp-account-config.test.ts: paridade — owner E friend advertem brain_get_document e proíbem remontar via brain_search. build+test verdes (1267/1267). Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/workflows/ci.yml | 34 ++++++++++++++++++++++++ src/__tests__/mcp-account-config.test.ts | 10 +++++++ src/rag/__tests__/storage.test.ts | 31 +++++++++++++++++++++ 3 files changed, 75 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1101a5d..9285f9c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -23,6 +23,40 @@ jobs: # Unit tests run without a DB or API keys (DB/Voyage-dependent cases self-skip). run: npm test + # Real Postgres (pgvector) so the DB/FTS slice of the suite actually runs, + # instead of self-skipping (ghost coverage). Not a required check yet: it + # exercises the storage layer (incl. the portuguese_unaccent FTS path that the + # jun/2026 diagnosis found broken and unguarded). Promote to required once stable. 
+ db-test: + runs-on: ubuntu-latest + services: + postgres: + image: pgvector/pgvector:pg16 + env: + POSTGRES_PASSWORD: postgres + POSTGRES_DB: zinom_test + ports: + - 5432:5432 + options: >- + --health-cmd "pg_isready -U postgres" + --health-interval 10s + --health-timeout 5s + --health-retries 5 + env: + POSTGRES_URL: postgres://postgres:postgres@localhost:5432/zinom_test + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-node@v4 + with: + node-version: "22" + cache: "npm" + - name: Install + run: npm ci + - name: Migrate (creates pgvector/unaccent + portuguese_unaccent config) + run: npm run migrate + - name: DB-backed RAG tests (storage + FTS) + run: npx tsx --test src/rag/__tests__/storage.test.ts + web-build: runs-on: ubuntu-latest defaults: diff --git a/src/__tests__/mcp-account-config.test.ts b/src/__tests__/mcp-account-config.test.ts index 4c185f4..01ff1af 100644 --- a/src/__tests__/mcp-account-config.test.ts +++ b/src/__tests__/mcp-account-config.test.ts @@ -14,3 +14,13 @@ test("owner e friend instructions trazem a regra Zinom-first e os links", () => assert.match(s, /zinom_setup_tasks/); } }); + +test("owner e friend instructions ensinam brain_get_document p/ conteúdo íntegro (anti-gambiarra)", () => { + // Diagnóstico jun/2026: ambas as superfícies precisam advertir a tool de + // documento inteiro e proibir remontar via brain_search — senão o modelo + // recai na gambiarra de somar chunks. 
+ for (const s of [OWNER_INSTRUCTIONS, FRIEND_INSTRUCTIONS]) { + assert.match(s, /brain_get_document/); + assert.match(s, /NUNCA reconstrua um documento somando resultados de brain_search/); + } +}); diff --git a/src/rag/__tests__/storage.test.ts b/src/rag/__tests__/storage.test.ts index cb167f9..4ed23be 100644 --- a/src/rag/__tests__/storage.test.ts +++ b/src/rag/__tests__/storage.test.ts @@ -63,6 +63,37 @@ test("searchKeyword uses the SAME ts config as the indexed tsv column (portugues assert.doesNotMatch(captured, /plainto_tsquery\('portuguese',\s*\$1\)/); }); +test("searchKeyword finds a doc by a real PT-BR term against the unaccent tsv (regression: ervilha)", async () => { + // The bug that started the jun/2026 diagnosis: keyword 'ervilha' returned + // empty. Against a real Postgres, searchKeyword MUST find a doc whose text + // contains 'ervilha'. Runs only in the db-test CI job (real pgvector). + if (!HAS_PG) { + console.log("skipping: no POSTGRES_URL"); + return; + } + const sid = `${TEST_PREFIX}-ervilha`; + await upsertChunks([ + { + id: `${sid}-0`, + source_type: "notion", + source_id: sid, + workspace: "personal", + db_name: "Roteiros", + parent_url: "https://notion.so/ervilha", + chunk_index: 0, + text: "[Roteiros · personal] A Ervilha\n\nVocê conhece a história da Princesa e a Ervilha?", + embedding: fakeEmbed(7), + metadata: {}, + source_updated: null, + }, + ]); + const hits = await searchKeyword("ervilha", undefined, 10); + assert.ok( + hits.some((h) => h.chunk.source_id === sid), + "searchKeyword('ervilha') deveria recuperar o doc indexado", + ); +}); + test("getDocumentChunks pins account_id, orders by chunk_index, and maps rows", async () => { // Full-document fetch (powers brain_get_document). Multi-tenant: account_id is // ALWAYS in the WHERE (never from input). Captures SQL via an injected pool. 
From fcbed6de65d02a5946f6d7ea5ff3b2eb6768ec79 Mon Sep 17 00:00:00 2001 From: bruno moniz Date: Thu, 18 Jun 2026 20:57:24 -0300 Subject: [PATCH 2/2] fix(migrations): 0002 roda do zero (dropa coluna tsv antes da config que ela depende) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit O CI com Postgres revelou que o migrate do zero falhava: 0001 (versão atual) já cria a config portuguese_unaccent + a coluna tsv que depende dela; 0002 então tentava DROP da config com a coluna ainda existindo -> erro de dependência. Em prod nunca apareceu porque o runner rastreia schema_migrations e não re-roda 0002. Reordena: dropa índice+coluna tsv ANTES de dropar/recriar a config. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../migrations/0002_hnsw_unaccent_pgtrgm.sql | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/scripts/migrations/0002_hnsw_unaccent_pgtrgm.sql b/scripts/migrations/0002_hnsw_unaccent_pgtrgm.sql index 3d75d20..d9389e6 100644 --- a/scripts/migrations/0002_hnsw_unaccent_pgtrgm.sql +++ b/scripts/migrations/0002_hnsw_unaccent_pgtrgm.sql @@ -8,7 +8,15 @@ BEGIN; CREATE EXTENSION IF NOT EXISTS unaccent; CREATE EXTENSION IF NOT EXISTS pg_trgm; --- 2. IMMUTABLE accent-insensitive TS config via dictionary mapping +-- 2. Drop the generated tsv column + its index FIRST. On a fresh DB, 0001 already +-- created BOTH the portuguese_unaccent config AND a tsv column that depends on +-- it, so the DROP CONFIGURATION below would fail with a dependency error if the +-- column still existed. Dropping it here makes a from-scratch migrate run work +-- (prod already recorded 0002 in schema_migrations and never re-runs it). +DROP INDEX IF EXISTS brain_chunks_tsv_idx; +ALTER TABLE brain_chunks DROP COLUMN IF EXISTS tsv; + +-- 3. IMMUTABLE accent-insensitive TS config via dictionary mapping -- (NOT a direct unaccent() call, which is only STABLE). 
DROP TEXT SEARCH CONFIGURATION IF EXISTS portuguese_unaccent; CREATE TEXT SEARCH CONFIGURATION portuguese_unaccent ( COPY = portuguese ); @@ -16,17 +24,15 @@ ALTER TEXT SEARCH CONFIGURATION portuguese_unaccent ALTER MAPPING FOR hword, hword_part, word WITH unaccent, portuguese_stem; --- 3. Swap ivfflat -> HNSW (cosine) +-- 4. Swap ivfflat -> HNSW (cosine) DROP INDEX IF EXISTS brain_chunks_embedding_idx; CREATE INDEX brain_chunks_embedding_idx ON brain_chunks USING hnsw (embedding vector_cosine_ops) WITH (m = 16, ef_construction = 200); --- 4. Rebuild the generated tsv column to use portuguese_unaccent. --- A generated column's expression CANNOT be altered in place, so drop & --- re-add (this REWRITES the table; fine for the small corpus). -DROP INDEX IF EXISTS brain_chunks_tsv_idx; -ALTER TABLE brain_chunks DROP COLUMN IF EXISTS tsv; +-- 5. Re-add the generated tsv column using portuguese_unaccent. +-- A generated column's expression CANNOT be altered in place, so the column was +-- dropped in step 2 and is re-added here (this REWRITES the table; fine for the small corpus). ALTER TABLE brain_chunks ADD COLUMN tsv tsvector GENERATED ALWAYS AS (to_tsvector('portuguese_unaccent', text)) STORED;