Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,40 @@ jobs:
# Unit tests run without a DB or API keys (DB/Voyage-dependent cases self-skip).
run: npm test

# Real Postgres (pgvector) so the DB/FTS slice of the suite actually runs,
# instead of self-skipping (ghost coverage). Not a required check yet: it
# exercises the storage layer (incl. the portuguese_unaccent FTS path that the
# jun/2026 diagnosis found broken and unguarded). Promote to required once stable.
db-test:
runs-on: ubuntu-latest
# Service container: the pgvector Postgres 16 image, with container port 5432
# mapped to port 5432 on the runner so the steps can reach it via localhost.
services:
postgres:
image: pgvector/pgvector:pg16
env:
POSTGRES_PASSWORD: postgres
POSTGRES_DB: zinom_test
ports:
- 5432:5432
# Container health check: pg_isready every 10s, 5s timeout, up to 5 retries.
options: >-
--health-cmd "pg_isready -U postgres"
--health-interval 10s
--health-timeout 5s
--health-retries 5
# Job-level connection string; user, password, port and database match the
# service definition above. Setting POSTGRES_URL is what keeps the DB-backed
# tests from self-skipping.
env:
POSTGRES_URL: postgres://postgres:postgres@localhost:5432/zinom_test
steps:
- uses: actions/checkout@v4
- uses: actions/setup-node@v4
with:
node-version: "22"
cache: "npm"
- name: Install
run: npm ci
# Migrations run before the tests so the extensions and the
# portuguese_unaccent text search configuration exist when the suite starts.
- name: Migrate (creates pgvector/unaccent + portuguese_unaccent config)
run: npm run migrate
# Only the storage test file runs in this job (not the whole suite).
- name: DB-backed RAG tests (storage + FTS)
run: npx tsx --test src/rag/__tests__/storage.test.ts

web-build:
runs-on: ubuntu-latest
defaults:
Expand Down
20 changes: 13 additions & 7 deletions scripts/migrations/0002_hnsw_unaccent_pgtrgm.sql
Original file line number Diff line number Diff line change
Expand Up @@ -8,25 +8,31 @@ BEGIN;
CREATE EXTENSION IF NOT EXISTS unaccent;
CREATE EXTENSION IF NOT EXISTS pg_trgm;

-- 2. IMMUTABLE accent-insensitive TS config via dictionary mapping
-- 2. Drop the generated tsv column + its index FIRST. On a fresh DB, 0001 already
-- created BOTH the portuguese_unaccent config AND a tsv column that depends on
-- it, so the DROP CONFIGURATION below would fail with a dependency error if the
-- column still existed. Dropping it here makes a from-scratch migrate run work
-- (prod already recorded 0002 in schema_migrations and never re-runs it).
DROP INDEX IF EXISTS brain_chunks_tsv_idx;
ALTER TABLE brain_chunks DROP COLUMN IF EXISTS tsv;

-- 3. IMMUTABLE accent-insensitive TS config via dictionary mapping
-- (NOT a direct unaccent() call, which is only STABLE).
DROP TEXT SEARCH CONFIGURATION IF EXISTS portuguese_unaccent;
CREATE TEXT SEARCH CONFIGURATION portuguese_unaccent ( COPY = portuguese );
ALTER TEXT SEARCH CONFIGURATION portuguese_unaccent
ALTER MAPPING FOR hword, hword_part, word
WITH unaccent, portuguese_stem;

-- 3. Swap ivfflat -> HNSW (cosine)
-- 4. Swap ivfflat -> HNSW (cosine)
DROP INDEX IF EXISTS brain_chunks_embedding_idx;
CREATE INDEX brain_chunks_embedding_idx
ON brain_chunks USING hnsw (embedding vector_cosine_ops)
WITH (m = 16, ef_construction = 200);

-- 4. Rebuild the generated tsv column to use portuguese_unaccent.
-- A generated column's expression CANNOT be altered in place, so drop &
-- re-add (this REWRITES the table; fine for the small corpus).
DROP INDEX IF EXISTS brain_chunks_tsv_idx;
ALTER TABLE brain_chunks DROP COLUMN IF EXISTS tsv;
-- 5. Re-add the generated tsv column using portuguese_unaccent.
-- A generated column's expression CANNOT be altered in place, so drop (above)
-- & re-add (this REWRITES the table; fine for the small corpus).
ALTER TABLE brain_chunks
ADD COLUMN tsv tsvector
GENERATED ALWAYS AS (to_tsvector('portuguese_unaccent', text)) STORED;
Expand Down
10 changes: 10 additions & 0 deletions src/__tests__/mcp-account-config.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,13 @@ test("owner e friend instructions trazem a regra Zinom-first e os links", () =>
assert.match(s, /zinom_setup_tasks/);
}
});

test("owner e friend instructions ensinam brain_get_document p/ conteúdo íntegro (anti-gambiarra)", () => {
  // Jun/2026 diagnosis: both instruction surfaces must advertise the
  // whole-document tool and forbid reassembling a document via brain_search;
  // otherwise the model falls back to the hack of stitching chunks together.
  const requiredPatterns = [
    /brain_get_document/,
    /NUNCA reconstrua um documento somando resultados de brain_search/,
  ];
  [OWNER_INSTRUCTIONS, FRIEND_INSTRUCTIONS].forEach((instructions) => {
    // Same check order as before: tool mention first, then the prohibition.
    requiredPatterns.forEach((pattern) => {
      assert.match(instructions, pattern);
    });
  });
});
31 changes: 31 additions & 0 deletions src/rag/__tests__/storage.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,37 @@ test("searchKeyword uses the SAME ts config as the indexed tsv column (portugues
assert.doesNotMatch(captured, /plainto_tsquery\('portuguese',\s*\$1\)/);
});

test("searchKeyword finds a doc by a real PT-BR term against the unaccent tsv (regression: ervilha)", async () => {
  // Regression guard for the bug that kicked off the jun/2026 diagnosis: a
  // keyword search for 'ervilha' came back empty. With a real Postgres behind
  // it, searchKeyword has to return the document whose text contains the word.
  // Without POSTGRES_URL the test logs and returns early (it is meant to run in
  // the db-test CI job, which provides a real pgvector database).
  if (!HAS_PG) {
    console.log("skipping: no POSTGRES_URL");
    return;
  }

  const sourceId = `${TEST_PREFIX}-ervilha`;

  // Index a single chunk whose text mentions the term (capitalised, twice).
  // The literal stays inline so it is typed against upsertChunks' parameter.
  await upsertChunks([
    {
      id: `${sourceId}-0`,
      source_type: "notion",
      source_id: sourceId,
      workspace: "personal",
      db_name: "Roteiros",
      parent_url: "https://notion.so/ervilha",
      chunk_index: 0,
      text: "[Roteiros · personal] A Ervilha\n\nVocê conhece a história da Princesa e a Ervilha?",
      embedding: fakeEmbed(7),
      metadata: {},
      source_updated: null,
    },
  ]);

  // Lower-case query against the chunk indexed above; pass if any hit is ours.
  const results = await searchKeyword("ervilha", undefined, 10);
  const foundIndexedDoc = results.some(({ chunk }) => chunk.source_id === sourceId);
  assert.ok(foundIndexedDoc, "searchKeyword('ervilha') deveria recuperar o doc indexado");
});

test("getDocumentChunks pins account_id, orders by chunk_index, and maps rows", async () => {
// Full-document fetch (powers brain_get_document). Multi-tenant: account_id is
// ALWAYS in the WHERE (never from input). Captures SQL via an injected pool.
Expand Down
Loading