BrunooMoniz · BrunooMoniz · Jun 18, 2026 · Jun 18, 2026
diff --git a/src/index.ts b/src/index.ts
@@ -18,6 +18,7 @@ import { registerBrainStatusTool, setReindexInFlightSet } from "./rag/brain-stat
 import { registerBrainReindexTool, setReindexSet } from "./rag/brain-reindex-tool.js";
 import { registerBrainTodayTool } from "./rag/brain-today-tool.js";
 import { registerBrainListDocumentsTool } from "./rag/brain-list-documents-tool.js";
+import { registerBrainGetDocumentTool } from "./rag/brain-get-document-tool.js";
 import { registerBrainFeedbackTool } from "./rag/brain-feedback-tool.js";
 import { createOAuthRouter, getAccessTokenInfo } from "./oauth.js";
 import { createGoogleRouter } from "./google/routes.js";
@@ -467,6 +468,7 @@ app.post("/mcp", async (req, res) => {
     registerBrainReindexTool(server);
     registerBrainTodayTool(server);
     registerBrainListDocumentsTool(server);
+    registerBrainGetDocumentTool(server);
     registerBrainFeedbackTool(server);
   } else {
     registerBrainSearchTool(server);
@@ -481,6 +483,7 @@ app.post("/mcp", async (req, res) => {
     registerBrainReindexTool(server);
     registerBrainTodayTool(server);
     registerBrainListDocumentsTool(server);
+    registerBrainGetDocumentTool(server);
     registerBrainFeedbackTool(server);
   }
   await server.connect(transport);

diff --git a/src/mcp-account-config.ts b/src/mcp-account-config.ts
@@ -136,6 +136,7 @@ You have access to a Notion MCP server that manages three separate workspaces. E
 - **brain_reindex** — Dispara reindexação assíncrona de todas as fontes. Use quando brain_status mostrar fontes stale/com erro, ou o usuário pedir "indexar agora".
 - **brain_today** — Retorna os eventos do dia, contexto do cérebro para cada reunião, e as tarefas abertas prioritárias. Use para "agenda do dia", "briefing de hoje".
 - **brain_list_documents** — Lista documentos indexados (um por source_id). Filtros: source_type, q (substring). Não usa quota de busca.
+- **brain_get_document** — Retorna o TEXTO COMPLETO de um documento já indexado, remontado de todos os chunks na ordem original. Use SEMPRE que precisar reproduzir, transformar, resumir fielmente ou editar um documento CONHECIDO por inteiro (ex.: "pegue meu roteiro X no Notion e crie um vídeo"). brain_search amostra por relevância e devolve fragmentos: **NUNCA reconstrua um documento somando resultados de brain_search — use brain_get_document.** Informe source_url (URL do Notion) ou source_id (de brain_list_documents). Não usa quota de busca.
 
 ## Calendário
 
@@ -185,6 +186,7 @@ Ferramentas disponíveis:
 - **brain_reindex** — dispara a reindexação do cérebro em segundo plano. Use quando brain_status mostrar problemas ou a pessoa pedir "atualiza agora", "indexar agora". Avise que o processo roda em segundo plano e pode levar alguns minutos.
 - **brain_today** — retorna os eventos do dia, contexto do cérebro para cada reunião e tarefas abertas. Use para "agenda de hoje", "o que tenho hoje?".
 - **brain_list_documents** — lista documentos indexados no cérebro (um por source_id). Filtros: source_type, q. Não usa quota de busca.
+- **brain_get_document** — devolve o TEXTO COMPLETO de um documento já indexado, remontado de todos os chunks na ordem original. Use SEMPRE que precisar reproduzir, transformar, resumir fielmente ou editar um documento CONHECIDO por inteiro (ex.: "pegue meu roteiro X no Notion e crie um vídeo"). brain_search amostra por relevância e devolve fragmentos: **NUNCA reconstrua um documento somando resultados de brain_search — use brain_get_document.** Informe source_url (a URL do Notion) ou source_id (de brain_list_documents). Não usa quota de busca.
 - **list_calendars** / **list_events** — vê as agendas e os eventos das contas Google que a pessoa conectou no portal.
 - **create_calendar_event** / **update_calendar_event** / **delete_calendar_event** — cria, edita e exclui eventos diretamente na agenda do Google da pessoa. Sempre confirme antes de excluir; delete_calendar_event só executa com confirm=true. Converta "amanhã 15h" em ISO 8601 absoluto usando a data atual.
 - **rubrix_send_document** — envia um documento para assinatura digital pela Rubrix. NÃO recebe o arquivo: devolve um **upload_url** seguro que a pessoa abre no navegador e onde solta o PDF; só após o upload o fluxo é disparado e os signatários são notificados. Colete email + CPF/CNPJ de cada signatário antes de chamar e mostre o upload_url. Requer a Rubrix conectada no portal. Acompanhe com **rubrix_check_status** (passe o id) e **rubrix_list_documents**; quando SIGNED, entregue o download_url.

diff --git a/src/rag/__tests__/brain-get-document-tool.test.ts b/src/rag/__tests__/brain-get-document-tool.test.ts
@@ -0,0 +1,77 @@
+// src/rag/__tests__/brain-get-document-tool.test.ts
+import { test } from "node:test";
+import assert from "node:assert/strict";
+import { handleBrainGetDocument } from "../brain-get-document-tool.js";
+import type { Chunk } from "../types.js";
+
+const HEADER = "[Roteiros · nora · 2026-06-10] A Ervilha";
+
+function chunk(idx: number, text: string): Chunk { // Test fixture: builds one Notion chunk of document "page-1" for position idx.
+  return {
+    id: `page-1-${idx}`, // id is derived from the chunk position
+    source_type: "notion",
+    source_id: "page-1",
+    workspace: "nora",
+    db_name: "Roteiros",
+    parent_url: "https://notion.so/page-1",
+    chunk_index: idx, // same idx that is embedded in the id above
+    text, // chunk body exactly as supplied by the test
+    metadata: {},
+    source_updated: null,
+  };
+}
+
+function deps(chunks: Chunk[], captured?: { opts?: unknown }) { // Stub dependencies handed to handleBrainGetDocument as its third argument.
+  return {
+    getDocumentChunks: async (opts: unknown) => {
+      if (captured) captured.opts = opts; // record the query options so a test can inspect them
+      return chunks; // always answer with the canned chunks, whatever was asked
+    },
+    getAllowedWorkspaces: () => null, // always null; presumably means "no workspace restriction" (TODO confirm)
+  };
+}
+
+test("handleBrainGetDocument stitches all chunks into one faithful document", async () => {
+  const chunks = [
+    chunk(0, `${HEADER}\n\n# A Ervilha\n\n## CENA 1\n\nabertura.`),
+    chunk(1, `${HEADER}\n\n## CENA 2\n\nfim.`),
+  ];
+  const res = await handleBrainGetDocument("acct-1", { source_url: "https://app.notion.com/p/A-Ervilha-32607ba5bee88138a5a5d662c9b1f4d2" }, deps(chunks));
+  assert.equal(res.ok, true);
+  assert.equal(res.chunk_count, 2);
+  assert.equal(res.source_type, "notion");
+  assert.equal(res.title, "A Ervilha");
+  assert.ok(res.full_text.includes("## CENA 1"), "deve trazer CENA 1");
+  assert.ok(res.full_text.includes("## CENA 2"), "deve trazer CENA 2");
+  assert.equal(res.full_text.includes(HEADER), false, "header repetido não deve sobrar");
+});
+
+test("handleBrainGetDocument fences untrusted notion content against prompt-injection", async () => {
+  const chunks = [chunk(0, `${HEADER}\n\nconteudo do notion.`)];
+  const res = await handleBrainGetDocument("acct-1", { source_id: "page-1" }, deps(chunks));
+  assert.ok(res.full_text.includes("<<<untrusted>>>"), "conteúdo notion deve ir cercado");
+});
+
+test("handleBrainGetDocument resolves a Notion URL to a source_id", async () => {
+  const captured: { opts?: any } = {};
+  const chunks = [chunk(0, `${HEADER}\n\nx`)];
+  await handleBrainGetDocument(
+    "acct-1",
+    { source_url: "https://app.notion.com/p/A-Ervilha-32607ba5bee88138a5a5d662c9b1f4d2" },
+    deps(chunks, captured),
+  );
+  assert.equal(captured.opts.sourceId, "32607ba5-bee8-8138-a5a5-d662c9b1f4d2");
+  assert.equal(captured.opts.accountId, "acct-1");
+});
+
+test("handleBrainGetDocument returns not_found when nothing is indexed", async () => {
+  const res = await handleBrainGetDocument("acct-1", { source_id: "missing" }, deps([]));
+  assert.equal(res.ok, false);
+  assert.equal(res.error, "not_found");
+});
+
+test("handleBrainGetDocument requires source_id or source_url", async () => {
+  const res = await handleBrainGetDocument("acct-1", {}, deps([]));
+  assert.equal(res.ok, false);
+  assert.equal(res.error, "missing_argument");
+});
diff --git a/src/rag/__tests__/brain-index-url-tool.test.ts b/src/rag/__tests__/brain-index-url-tool.test.ts
@@ -15,9 +15,8 @@ process.env.NOTION_PERSONAL_TOKEN ??= "ntn_test_stub";
 process.env.NOTION_NORA_TOKEN ??= "ntn_test_stub";
 process.env.OAUTH_PASSWORD_HASH ??= "stub-hash";
 
-const { registerBrainIndexUrlTool, buildFriendWorkspaceParam } = await import(
-  "../brain-index-url-tool.js"
-);
+const { registerBrainIndexUrlTool, buildFriendWorkspaceParam, pageToIndexableDocument } =
+  await import("../brain-index-url-tool.js");
 const { requestContext } = await import("../../context.js");
 
 // ---------- helpers ---------------------------------------------------------
@@ -180,3 +179,33 @@ test("handler mantém o gate assertWorkspaceScope no write", async () => {
     /Access denied/,
   );
 });
+
+// June 2026 diagnosis: the on-demand re-index called raw chunkText, without the
+// context header that the scheduled indexer prepends — re-indexed chunks lost
+// the document title (the cause of the "empty ervilha" case). pageToIndexableDocument
+// builds the SAME IndexableDocument as the scheduled indexer, so indexSinglePage
+// now delegates to indexDocument (identical header on every chunk).
+test("pageToIndexableDocument builds the same IndexableDocument shape as the scheduled indexer", () => {
+  const page = {
+    id: "32607ba5-bee8-8138-a5a5-d662c9b1f4d2",
+    url: "https://www.notion.so/A-Ervilha-32607ba5bee88138a5a5d662c9b1f4d2",
+    last_edited_time: "2026-06-10T12:00:00.000Z",
+    properties: {},
+  };
+  const doc = pageToIndexableDocument(
+    page,
+    "nora",
+    "Roteiros",
+    "# A Ervilha\n\n## Roteiro\n\nconteudo",
+    "acct-1",
+  );
+  assert.equal(doc.source_type, "notion");
+  assert.equal(doc.source_id, page.id);
+  assert.equal(doc.workspace, "nora");
+  assert.equal(doc.db_name, "Roteiros");
+  assert.equal(doc.parent_url, page.url);
+  assert.equal(doc.account_id, "acct-1");
+  assert.equal(doc.text, "# A Ervilha\n\n## Roteiro\n\nconteudo");
+  assert.ok(doc.source_updated instanceof Date);
+  assert.equal(doc.source_updated.toISOString(), "2026-06-10T12:00:00.000Z");
+});
diff --git a/src/rag/__tests__/chunker.test.ts b/src/rag/__tests__/chunker.test.ts
@@ -29,10 +29,32 @@ test("chunkText applies overlap between chunks", () => {
   assert.ok(out[1].includes(tail.split(" ")[2]) || out[1].length > 0);
 });
 
-test("chunkText breaks at headings", () => {
+test("chunkText coalesces small adjacent heading sections into one chunk", () => {
+  // Behavior change (June 2026 diagnosis): a structured doc (roteiro = script) with many
+  // short headings used to explode into one tiny chunk PER heading, making a page
+  // unrecoverable via search. Small adjacent sections must now merge up to target.
   const text = "intro paragrafo.\n\n## Heading 1\n\nconteudo.\n\n## Heading 2\n\nmais conteudo.";
   const out = chunkText(text, { targetTokens: 1000 });
-  assert.equal(out.length, 3);
-  assert.ok(out[1].startsWith("## Heading 1"));
-  assert.ok(out[2].startsWith("## Heading 2"));
+  assert.equal(out.length, 1, "três seções minúsculas devem coalescer em 1 chunk");
+  assert.ok(out[0].includes("## Heading 1"));
+  assert.ok(out[0].includes("## Heading 2"));
+});
+
+test("chunkText coalesces a roteiro of many short CENA headings into few chunks", () => {
+  // 14 short '### CENA N' sections (~the A Ervilha case). Must NOT become 14 chunks.
+  const cenas = Array.from(
+    { length: 14 },
+    (_, i) => `### CENA ${i + 1}\n\numa fala curta da cena numero ${i + 1} aqui.`,
+  ).join("\n\n");
+  const out = chunkText(cenas, { targetTokens: 500 });
+  assert.ok(out.length < 14, `esperava << 14 chunks, veio ${out.length}`);
+  assert.ok(out.length <= 4, `coalescência insuficiente: ${out.length} chunks`);
+});
+
+test("chunkText still breaks a section that exceeds target", () => {
+  // A single heading section larger than target must still split (no giant chunk).
+  const big = Array.from({ length: 40 }, (_, i) => `paragrafo ${i} com bastante conteudo textual para encher.`).join("\n\n");
+  const text = `## Grande\n\n${big}`;
+  const out = chunkText(text, { targetTokens: 50, maxTokens: 80 });
+  assert.ok(out.length >= 2, "seção grande deve quebrar em vários chunks");
 });
diff --git a/src/rag/__tests__/search.test.ts b/src/rag/__tests__/search.test.ts
@@ -278,3 +278,43 @@ test("brainSearch ignores a caller-supplied _accountId (set server-side only)",
   await brainSearch("q", { topK: 3, filters: { _accountId: "acme" } as never });
   assert.equal(semFilters?._accountId, "bruno");
 });
+
+// --- Regression anchor: brain_search is SAMPLING, not whole-document retrieval ----
+const mkDoc = (idx: number): Chunk => ({ // Fixture: chunk number idx of the single Notion document "doc-ervilha".
+  id: `ervilha-${idx}`,
+  source_type: "notion",
+  source_id: "doc-ervilha",
+  workspace: "personal",
+  db_name: "Roteiros",
+  parent_url: "https://notion.so/ervilha",
+  chunk_index: idx,
+  text: `CENA ${idx}`,
+  metadata: {},
+  source_updated: null,
+});
+
+test("brain_search NÃO recupera um documento inteiro: no máx maxPerUrl chunks por página (use brain_get_document)", async () => {
+  // Regression anchor for the 'A Ervilha' workaround. A 14-chunk page is NOT
+  // recoverable in full via brain_search BY DESIGN (dedup + diversify maxPerUrl).
+  // Deliberate invariant: brain_get_document exists for the whole document.
+  // If this test changes, that was a deliberate design decision — not an accident.
+  const ranked = Array.from({ length: 14 }, (_, i) => ({
+    chunk: mkDoc(i),
+    rank: i + 1,
+    score: 1 - i * 0.01,
+  }));
+  __setSearchDepsForTest({
+    searchSemantic: async () => ranked,
+    searchKeyword: async () => ranked,
+    embedQuery: async () => [0.1, 0.2],
+    rerankDocuments: async () => [],
+    getAllowedWorkspaces: () => null,
+  });
+  const out = await brainSearch("ervilha", { topK: 12, rerank: false });
+  const fromDoc = out.filter((h) => h.chunk.source_id === "doc-ervilha");
+  assert.ok(
+    fromDoc.length <= 3,
+    `brain_search devolveu ${fromDoc.length} chunks do doc; esperado <= maxPerUrl (3)`,
+  );
+  assert.ok(fromDoc.length < 14, "jamais recupera os 14 chunks — por isso existe brain_get_document");
+});
diff --git a/src/rag/__tests__/stitch-document.test.ts b/src/rag/__tests__/stitch-document.test.ts
@@ -0,0 +1,48 @@
+// src/rag/__tests__/stitch-document.test.ts
+import { test } from "node:test";
+import assert from "node:assert/strict";
+import { stitchDocument } from "../stitch-document.js";
+
+const HEADER = "[Roteiros · nora · 2026-06-10] A Ervilha — vlog Nora Finance";
+
+test("stitchDocument returns '' for no chunks", () => {
+  assert.equal(stitchDocument([]), "");
+});
+
+test("stitchDocument returns a single chunk trimmed", () => {
+  assert.equal(stitchDocument(["  só um chunk pequeno.  "]), "só um chunk pequeno.");
+});
+
+test("stitchDocument strips the repeated provenance header shared by every chunk", () => {
+  const chunks = [
+    `${HEADER}\n\n# A Ervilha\n\n## CENA 1\n\nabertura.`,
+    `${HEADER}\n\n## CENA 2\n\ndesenvolvimento.`,
+  ];
+  const out = stitchDocument(chunks);
+  // The header line must appear ZERO times in the body (it was repeated noise).
+  assert.equal(out.includes(HEADER), false, "header repetido não deve sobrar no corpo");
+  assert.ok(out.includes("# A Ervilha"));
+  assert.ok(out.includes("## CENA 1"));
+  assert.ok(out.includes("## CENA 2"));
+});
+
+test("stitchDocument removes the overlap the chunker repeats between consecutive chunks", () => {
+  // Chunk B starts with the trailing paragraph of chunk A (chunker overlap).
+  const overlap = "este paragrafo se repete na borda entre os dois chunks.";
+  const a = `${HEADER}\n\nparagrafo inicial do documento.\n\n${overlap}`;
+  const b = `${HEADER}\n\n${overlap}\n\nparagrafo final do documento.`;
+  const out = stitchDocument([a, b]);
+  // The overlap text must appear exactly once after stitching.
+  const occurrences = out.split(overlap).length - 1;
+  assert.equal(occurrences, 1, `overlap deveria aparecer 1x, apareceu ${occurrences}x`);
+  assert.ok(out.includes("paragrafo inicial"));
+  assert.ok(out.includes("paragrafo final"));
+});
+
+test("stitchDocument keeps order and joins non-overlapping chunks with a blank line", () => {
+  const a = `${HEADER}\n\nprimeira parte distinta.`;
+  const b = `${HEADER}\n\nsegunda parte distinta.`;
+  const out = stitchDocument([a, b]);
+  assert.ok(out.indexOf("primeira parte") < out.indexOf("segunda parte"));
+  assert.ok(out.includes("primeira parte distinta.\n\nsegunda parte distinta."));
+});
diff --git a/src/rag/__tests__/storage.test.ts b/src/rag/__tests__/storage.test.ts
@@ -15,6 +15,7 @@ import {
   getStatus,
   getBrainCounts,
   listBrainDocuments,
+  getDocumentChunks,
   titleFromHeaderLine,
   __setPoolForTest,
 } from "../storage.js";
@@ -41,6 +42,92 @@ after(async () => {
   await closePool();
 });
 
+test("searchKeyword uses the SAME ts config as the indexed tsv column (portuguese_unaccent)", async () => {
+  // The tsv GENERATED column is to_tsvector('portuguese_unaccent', text)
+  // (migrations 0001/0002). The query MUST use the same config; using
+  // 'portuguese' silently loses accent-insensitive matching on the query side.
+  // Runs WITHOUT a DB by capturing the SQL via an injected pool.
+  let captured = "";
+  __setPoolForTest({
+    query: async (sql: unknown) => {
+      captured = String(sql);
+      return { rows: [] } as never;
+    },
+  });
+  try {
+    await searchKeyword("ervilha", undefined, 10);
+  } finally {
+    __setPoolForTest(null);
+  }
+  assert.match(captured, /plainto_tsquery\('portuguese_unaccent',\s*\$1\)/);
+  assert.doesNotMatch(captured, /plainto_tsquery\('portuguese',\s*\$1\)/);
+});
+
+test("getDocumentChunks pins account_id, orders by chunk_index, and maps rows", async () => {
+  // Full-document fetch (powers brain_get_document). Multi-tenant: account_id is
+  // ALWAYS in the WHERE (never from input). Captures SQL via an injected pool.
+  let sql = "";
+  let params: unknown[] = [];
+  __setPoolForTest({
+    query: async (q: unknown, p: unknown) => {
+      sql = String(q);
+      params = p as unknown[];
+      return {
+        rows: [
+          {
+            id: "doc-0",
+            source_type: "notion",
+            source_id: "page-1",
+            workspace: "nora",
+            db_name: "Roteiros",
+            parent_url: "https://notion.so/page-1",
+            chunk_index: 0,
+            text: "primeiro",
+            metadata: {},
+            source_updated: null,
+          },
+        ],
+      } as never;
+    },
+  });
+  let chunks;
+  try {
+    chunks = await getDocumentChunks({
+      sourceId: "page-1",
+      accountId: "acct-1",
+      allowedWorkspaces: ["nora"] as never,
+    });
+  } finally {
+    __setPoolForTest(null);
+  }
+  assert.match(sql, /account_id\s*=\s*\$/i, "account_id deve estar no WHERE");
+  assert.match(sql, /order by\s+chunk_index/i, "deve ordenar por chunk_index");
+  assert.ok(params.includes("acct-1"), "accountId deve ir nos params");
+  assert.equal(chunks.length, 1);
+  assert.equal(chunks[0].source_id, "page-1");
+  assert.equal(chunks[0].text, "primeiro");
+});
+
+test("getDocumentChunks with an empty allowed-workspace scope yields zero rows (no leak)", async () => {
+  let sql = "";
+  __setPoolForTest({
+    query: async (q: unknown) => {
+      sql = String(q);
+      return { rows: [] } as never;
+    },
+  });
+  try {
+    await getDocumentChunks({
+      sourceId: "page-1",
+      accountId: "acct-1",
+      allowedWorkspaces: [] as never,
+    });
+  } finally {
+    __setPoolForTest(null);
+  }
+  assert.match(sql, /workspace\s*=\s*any/i, "scope vazio deve compilar para workspace = ANY (zero rows)");
+});
+
 test("upsertChunks inserts and re-upsert updates", async () => {
   if (!HAS_PG) {
     console.log("skipping: no POSTGRES_URL");