From 87beda77d9d7bfd605e7911497b328866e811e88 Mon Sep 17 00:00:00 2001
From: bruno moniz <brunoomoniz@gmail.com>
Date: Thu, 18 Jun 2026 20:46:37 -0300
Subject: [PATCH 1/2] =?UTF-8?q?chore(ci):=20job=20db-test=20com=20Postgres?=
 =?UTF-8?q?=20real=20+=20regress=C3=A3o=20FTS=20e=20paridade=20da=20instru?=
 =?UTF-8?q?=C3=A7=C3=A3o?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A suíte tinha 1248+ testes mas o CI rodava `npm test` SEM Postgres/Voyage: a fatia
DB-dependente se auto-pulava (cobertura fantasma). Foi exatamente o que deixou o
bug do FTS (config divergente) passar despercebido.

- ci.yml: novo job `db-test` (não obrigatório ainda) com service pgvector:pg16 +
  `npm run migrate` (cria unaccent + a config portuguese_unaccent) rodando a suíte
  de storage contra Postgres real. Promover a obrigatório quando estável.
- storage.test.ts: teste de regressão real `searchKeyword('ervilha')` recupera o
  doc (gateado por HAS_PG; roda no db-test). Guarda o "ervilha vazio".
- mcp-account-config.test.ts: paridade — owner E friend anunciam brain_get_document
  e proíbem remontar via brain_search.

build+test verdes (1267/1267).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/ci.yml                 | 34 ++++++++++++++++++++++++
 src/__tests__/mcp-account-config.test.ts | 10 +++++++
 src/rag/__tests__/storage.test.ts        | 31 +++++++++++++++++++++
 3 files changed, 75 insertions(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 1101a5d..9285f9c 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -23,6 +23,40 @@ jobs:
         # Unit tests run without a DB or API keys (DB/Voyage-dependent cases self-skip).
         run: npm test
 
+  # Runs the DB-backed slice of the suite against a real Postgres (pgvector)
+  # service so those tests execute instead of self-skipping (ghost coverage).
+  # Covers the storage layer, incl. the portuguese_unaccent FTS path that the
+  # jun/2026 diagnosis found broken. Not a required check yet; promote once stable.
+  db-test:
+    runs-on: ubuntu-latest
+    services:
+      postgres:
+        image: pgvector/pgvector:pg16
+        env:
+          POSTGRES_PASSWORD: postgres
+          POSTGRES_DB: zinom_test
+        ports:
+          - 5432:5432  # published on the runner; steps connect via localhost:5432
+        options: >-
+          --health-cmd "pg_isready -U postgres"
+          --health-interval 10s
+          --health-timeout 5s
+          --health-retries 5
+    env:
+      POSTGRES_URL: postgres://postgres:postgres@localhost:5432/zinom_test  # must match the service env above
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-node@v4
+        with:
+          node-version: "22"
+          cache: "npm"
+      - name: Install
+        run: npm ci
+      - name: Migrate (creates pgvector/unaccent + portuguese_unaccent config)
+        run: npm run migrate
+      - name: DB-backed RAG tests (storage + FTS)
+        run: npx tsx --test src/rag/__tests__/storage.test.ts
+
   web-build:
     runs-on: ubuntu-latest
     defaults:
diff --git a/src/__tests__/mcp-account-config.test.ts b/src/__tests__/mcp-account-config.test.ts
index 4c185f4..01ff1af 100644
--- a/src/__tests__/mcp-account-config.test.ts
+++ b/src/__tests__/mcp-account-config.test.ts
@@ -14,3 +14,13 @@ test("owner e friend instructions trazem a regra Zinom-first e os links", () =>
     assert.match(s, /zinom_setup_tasks/);
   }
 });
+
+test("owner e friend instructions ensinam brain_get_document p/ conteúdo íntegro (anti-gambiarra)", () => {
+  // jun/2026 diagnosis: both instruction surfaces must advertise the
+  // whole-document tool and forbid reassembling a document via brain_search;
+  // otherwise the model falls back to the hack of stitching chunks together.
+  for (const s of [OWNER_INSTRUCTIONS, FRIEND_INSTRUCTIONS]) {
+    assert.match(s, /brain_get_document/);
+    assert.match(s, /NUNCA reconstrua um documento somando resultados de brain_search/);
+  }
+});
diff --git a/src/rag/__tests__/storage.test.ts b/src/rag/__tests__/storage.test.ts
index cb167f9..4ed23be 100644
--- a/src/rag/__tests__/storage.test.ts
+++ b/src/rag/__tests__/storage.test.ts
@@ -63,6 +63,37 @@ test("searchKeyword uses the SAME ts config as the indexed tsv column (portugues
   assert.doesNotMatch(captured, /plainto_tsquery\('portuguese',\s*\$1\)/);
 });
 
+test("searchKeyword finds a doc by a real PT-BR term against the unaccent tsv (regression: ervilha)", async (t) => {
+  // Regression for the jun/2026 diagnosis: keyword 'ervilha' returned empty.
+  // Against a real Postgres, searchKeyword MUST find a doc whose text contains
+  // 'ervilha'. Without POSTGRES_URL the test is reported as SKIPPED, not passed.
+  if (!HAS_PG) {
+    t.skip("no POSTGRES_URL");
+    return;
+  }
+  const sid = `${TEST_PREFIX}-ervilha`;
+  await upsertChunks([
+    {
+      id: `${sid}-0`,
+      source_type: "notion",
+      source_id: sid,
+      workspace: "personal",
+      db_name: "Roteiros",
+      parent_url: "https://notion.so/ervilha",
+      chunk_index: 0,
+      text: "[Roteiros · personal] A Ervilha\n\nVocê conhece a história da Princesa e a Ervilha?",
+      embedding: fakeEmbed(7),
+      metadata: {},
+      source_updated: null,
+    },
+  ]);
+  const hits = await searchKeyword("ervilha", undefined, 10);
+  assert.ok(
+    hits.some((h) => h.chunk.source_id === sid),
+    "searchKeyword('ervilha') deveria recuperar o doc indexado",
+  );
+});
+
 test("getDocumentChunks pins account_id, orders by chunk_index, and maps rows", async () => {
   // Full-document fetch (powers brain_get_document). Multi-tenant: account_id is
   // ALWAYS in the WHERE (never from input). Captures SQL via an injected pool.

From fcbed6de65d02a5946f6d7ea5ff3b2eb6768ec79 Mon Sep 17 00:00:00 2001
From: bruno moniz <brunoomoniz@gmail.com>
Date: Thu, 18 Jun 2026 20:57:24 -0300
Subject: [PATCH 2/2] fix(migrations): 0002 roda do zero (dropa coluna tsv
 antes da config que ela depende)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

O CI com Postgres revelou que o migrate do zero falhava: 0001 (versão atual) já
cria a config portuguese_unaccent + a coluna tsv que depende dela; 0002 então
tentava DROP da config com a coluna ainda existindo -> erro de dependência. Em
prod nunca apareceu porque o runner rastreia schema_migrations e não re-roda 0002.
Reordena: dropa índice+coluna tsv ANTES de dropar/recriar a config.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../migrations/0002_hnsw_unaccent_pgtrgm.sql  | 20 ++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/scripts/migrations/0002_hnsw_unaccent_pgtrgm.sql b/scripts/migrations/0002_hnsw_unaccent_pgtrgm.sql
index 3d75d20..d9389e6 100644
--- a/scripts/migrations/0002_hnsw_unaccent_pgtrgm.sql
+++ b/scripts/migrations/0002_hnsw_unaccent_pgtrgm.sql
@@ -8,7 +8,15 @@ BEGIN;
 CREATE EXTENSION IF NOT EXISTS unaccent;
 CREATE EXTENSION IF NOT EXISTS pg_trgm;
 
--- 2. IMMUTABLE accent-insensitive TS config via dictionary mapping
+-- 2. Drop the generated tsv column + its index FIRST. On a fresh DB, 0001 already
+--    created BOTH the portuguese_unaccent config AND a tsv column that depends on
+--    it, so DROP TEXT SEARCH CONFIGURATION below would hit a dependency error while
+--    the column exists. Dropping it first lets a from-scratch migrate run; step 5
+--    re-adds it. (Prod already recorded 0002 in schema_migrations; never re-run.)
+DROP INDEX IF EXISTS brain_chunks_tsv_idx;
+ALTER TABLE brain_chunks DROP COLUMN IF EXISTS tsv;
+
+-- 3. IMMUTABLE accent-insensitive TS config via dictionary mapping
 --    (NOT a direct unaccent() call, which is only STABLE).
 DROP TEXT SEARCH CONFIGURATION IF EXISTS portuguese_unaccent;
 CREATE TEXT SEARCH CONFIGURATION portuguese_unaccent ( COPY = portuguese );
@@ -16,17 +24,15 @@ ALTER TEXT SEARCH CONFIGURATION portuguese_unaccent
   ALTER MAPPING FOR hword, hword_part, word
   WITH unaccent, portuguese_stem;
 
--- 3. Swap ivfflat -> HNSW (cosine)
+-- 4. Swap ivfflat -> HNSW (cosine)
 DROP INDEX IF EXISTS brain_chunks_embedding_idx;
 CREATE INDEX brain_chunks_embedding_idx
   ON brain_chunks USING hnsw (embedding vector_cosine_ops)
   WITH (m = 16, ef_construction = 200);
 
--- 4. Rebuild the generated tsv column to use portuguese_unaccent.
---    A generated column's expression CANNOT be altered in place, so drop &
---    re-add (this REWRITES the table; fine for the small corpus).
-DROP INDEX IF EXISTS brain_chunks_tsv_idx;
-ALTER TABLE brain_chunks DROP COLUMN IF EXISTS tsv;
+-- 5. Re-add the generated tsv column using portuguese_unaccent.
+--    A generated column's expression CANNOT be altered in place, so it is dropped
+--    in step 2 and re-added here (this REWRITES the table; fine for the small corpus).
 ALTER TABLE brain_chunks
   ADD COLUMN tsv tsvector
   GENERATED ALWAYS AS (to_tsvector('portuguese_unaccent', text)) STORED;