diff --git a/docs/issue-692-ast-chunking-design.md b/docs/issue-692-ast-chunking-design.md new file mode 100644 index 00000000..11653348 --- /dev/null +++ b/docs/issue-692-ast-chunking-design.md @@ -0,0 +1,256 @@ +# Issue #692 — AST-based Semantic Chunking for Code Blocks + +**Status:** Designed +**Repo:** `memory-lancedb-pro` +**Created:** 2026-05-04 +**Source:** https://github.com/CortexReach/memory-lancedb-pro/issues/692 + +--- + +## Problem Summary + +`chunker.ts` 的 `smartChunk()` 使用純 character-based split,split 邏輯在 `findSplitEnd()`: +- 先找 sentence ending(`.!!?`) +- 找不到 → 找 `\n` +- 找不到 → 找 whitespace + +這對自然語言有效,但對程式碼是災難。JS/TS 函式結尾是 `;` 和 `}`,兩者都不在 target set,導致 function declaration 在 `{` / `}` 之間被隨機切斷。 + +**真實破壞案例:** +``` +Chunk A(~3800字): +"async function handleUserLogin(userId: string, credentials: LoginCredentials): Promise {\n" +" const user = await this.userRepository.findById(userId);\n" +" if (!user) {\n" +" return { success: false, error: 'USER_NOT_FOUND' };" + +Chunk B(~900字): +" }\n" +" const passwordValid = await this.verifyPassword(...);" // verifyPassword 跨 Chunk A 和 B +``` + +**問題:** +- Chunk A 結尾在 `return { success: false, error: 'USER_NOT_FOUND' };` — 不完整的 if-block +- Chunk B 開頭是 `}` — 脫離語境的 closing brace +- `verifyPassword` 函式定義被切成兩段 + +--- + +## Verified Facts (gitnexus + source reading) + +### Call Graph (gitnexus verified) +``` +smartChunk (chunker.ts:263-281) + ├─ calls: getCjkRatio (174-183), chunkDocument (194-255) + │ + └─ called by: + ├─ embedSingle (embedder.ts) + ├─ embedMany (embedder.ts) + ├─ testCjkAwareChunkSizing (test/cjk-recursion-regression.test.mjs) + └─ testSmallContextChunking (test/cjk-recursion-regression.test.mjs) + +chunkDocument (chunker.ts:194-255) + ├─ calls: findSplitEnd (97-143), sliceTrimWithIndices (146-163) + └─ called by: smartChunk + +findSplitEnd (chunker.ts:97-143) ← 問題根因所在 +``` + +### Existing Coverage +- **測試:** 只有 `test/cjk-recursion-regression.test.mjs` 呼叫 `smartChunk`,**沒有任何專門測試 chunker 破壞案例的測試檔案** +- **依賴:** 無 tree-sitter +- **Config:** `maxChunkSize`, `overlapSize`, `minChunkSize`, `semanticSplit`, `maxLinesPerChunk` + +--- + +## Solution: astChunk() + +### Architecture + +``` +smartChunk(text) + ├─ detectCodeLanguage(text) === null → chunkDocument() [現有 character split] + └─ detectCodeLanguage(text) === 'js'/'ts' → astChunk(text, lang, config) + === 'py' → astChunk(text, 'python', config) + === 其他 → chunkDocument() [fallback] +``` + +### 1. `detectCodeLanguage(text) → CodeLanguage | null` + +取前 200 字做偵測: + +| 語言 | Pattern | +|------|---------| +| JS/TS | `/\b(function\|const\s\|let\s\|var\s\|=>\|import\s\|export\s\|interface\s\|type\s\|class\s)/` | +| Python | `/\bdef\s\|class\s\|import\s\|from\s\|print\(/` | +| Go | `/\bfunc\s\|package\s\|import\s"/` | +| Rust | `/\bfn\s\|impl\s\|pub\s\|let\s+mut\s/` | + +### 2. `astChunk(code, language, config) → ChunkResult` + +```typescript +import Parser from 'tree-sitter'; +import JavaScript from 'tree-sitter-javascript'; + +export function astChunk( + code: string, + language: CodeLanguage, + config: ChunkerConfig +): ChunkResult { + const parser = new Parser(); + switch (language) { + case 'javascript': + case 'typescript': + parser.setLanguage(JavaScript); + break; + case 'python': + parser.setLanguage(Python); + break; + default: + return chunkDocument(code, config); + } + + const tree = parser.parse(code); + const chunks: string[] = []; + const metadatas: ChunkMetadata[] = []; + + // Walk top-level nodes + const root = tree.rootNode; + for (const child of root.children) { + if (!isDeclarationNode(child)) continue; + const text = code.slice(child.startIndex, child.endIndex); + if (text.length <= config.maxChunkSize) { + chunks.push(text); + metadatas.push({ startIndex: child.startIndex, endIndex: child.endIndex, length: text.length }); + } else { + // Sub-split within this declaration at statement level + const subResult = subChunk(text, config); + chunks.push(...subResult.chunks); + metadatas.push(...subResult.metadatas); + } + } + + return { chunks, metadatas, totalOriginalLength: code.length, chunkCount: chunks.length }; +} +``` + +### 3. Supported Node Types (Phase 1) + +| 語言 | P0 節點 | +|------|---------| +| JS/TS | `function_declaration`, `arrow_function`, `class_declaration`, `method_definition`, `export_statement`, `interface_declaration`, `type_alias_declaration`, `lexical_declaration` | +| Python | `function_definition`, `class_definition`, `decorated_definition` | +| Go | `function_declaration`, `method_declaration` (P2) | +| Rust | `function_item`, `impl_item` (P2) | + +### 4. Config Extension + +```typescript +interface ChunkerConfig { + // ... 現有五個欄位 ... + astAwareCodeSplit?: boolean; // NEW: default true +} +``` + +### 5. Dependency Changes + +```json +{ + "dependencies": { + "tree-sitter": "^0.21.1", + "tree-sitter-javascript": "^0.21.0", + "tree-sitter-python": "^0.21.0" + } +} +``` + +--- + +## Files to Change + +| 檔案 | 變更 | +|------|------| +| `src/chunker.ts` | + `detectCodeLanguage()`, + `astChunk()`, + `subChunk()`, 修改 `smartChunk()` 路由, + `astAwareCodeSplit` config | +| `src/chunker.test.ts` | **全新建立**(從破壞案例反轉)| +| `package.json` | + tree-sitter, tree-sitter-javascript, tree-sitter-python | + +--- + +## Tests (New File) + +```typescript +describe('AST-aware code chunking', () => { + it('should keep { and } balanced in every chunk', () => { + const code = `async function handleUserLogin(userId: string) { + const user = await this.userRepository.findById(userId); + if (!user) { return { success: false }; } + const session = await this.createSession(user); + return { success: true, session }; +} +async function verifyPassword(input: string): Promise { + return bcrypt.compare(input, this.hash); +}`; + const result = smartChunk(code, 'jina-embeddings-v5'); + for (const chunk of result.chunks) { + const opens = (chunk.match(/{/g) || []).length; + const closes = (chunk.match(/}/g) || []).length; + expect(opens).toBe(closes); + } + }); + + it('should not split function mid-body', () => { + const result = smartChunk(code, 'jina-embeddings-v5'); + const hasMiddleOfFunction = result.chunks.some(c => + c.startsWith('}') || c.endsWith('{') + ); + expect(hasMiddleOfFunction).toBe(false); + }); + + it('should keep complete function as one chunk', () => { + const result = smartChunk(code, 'jina-embeddings-v5'); + const verifyFn = result.chunks.find(c => c.includes('verifyPassword')); + expect(verifyFn).toBeDefined(); + expect(verifyFn).toContain('bcrypt.compare'); + expect(verifyFn).not.toContain('handleUserLogin'); + }); +}); +``` + +--- + +## Phase Plan + +``` +Phase 1(P0 — MVP): + ├─ detectCodeLanguage()(JS/TS/Python) + ├─ astChunk() — JS/TS only + ├─ astChunk() — Python + ├─ Unit tests(破壞案例 → 通過案例) + └─ Config: astAwareCodeSplit default = true + +Phase 2(P1): + ├─ Sub-split within oversized declarations(statement level) + ├─ Go、Rust support + └─ Benchmark: 向量品質 vs. character split + +Phase 3(P2): + └─ Embedding quality evaluation(問答對比) +``` + +--- + +## Q&A + +| Q | A | +|---|---| +| tree-sitter 值得嗎? | **值得**。~1MB runtime,sub-ms parse,能處理巢狀結構/decorator/subclass,比 regex 精準一個數量級。 | +| 預設開? | **預設開**。破壞案例太明確,等使用者手動開等於功能永遠不被用。`astAwareCodeSplit: false` 保留給需要復現舊行為的測試。 | +| 非主流語言? | **Phase 1 fallback**。現有 sentence-ending split 對自然語言有效;非主流語言佔比低,Phase 1 fallback 合理。 | + +--- + +## Reference + +- Issue: https://github.com/CortexReach/memory-lancedb-pro/issues/692 +- Reference impl: `zilliztech/claude-context` ast-splitter.ts +- Existing chunker: `src/chunker.ts` (284 lines) diff --git a/package-lock.json b/package-lock.json index ee4ecef2..c5d6f5d2 100644 --- a/package-lock.json +++ b/package-lock.json @@ -14,7 +14,10 @@ "apache-arrow": "18.1.0", "json5": "^2.2.3", "openai": "^6.21.0", - "proper-lockfile": "^4.1.2" + "proper-lockfile": "^4.1.2", + "tree-sitter": "^0.25.0", + "tree-sitter-javascript": "^0.25.0", + "tree-sitter-python": "^0.25.0" }, "devDependencies": { "commander": "^14.0.0", @@ -430,6 +433,26 @@ "integrity": "sha512-TwuEnCnxbc3rAvhf/LbG7tJUDzhqXyFnv3dtzLOPgCG/hODL7WFnsbwktkD7yUV0RrreP/l1PALq/YSg6VvjlA==", "license": "MIT" }, + "node_modules/node-addon-api": { + "version": "8.7.0", + "resolved": "https://registry.npmjs.org/node-addon-api/-/node-addon-api-8.7.0.tgz", + "integrity": "sha512-9MdFxmkKaOYVTV+XVRG8ArDwwQ77XIgIPyKASB1k3JPq3M8fGQQQE3YpMOrKm6g//Ktx8ivZr8xo1Qmtqub+GA==", + "license": "MIT", + "engines": { + "node": "^18 || ^20 || >= 21" + } + }, + "node_modules/node-gyp-build": { + "version": "4.8.4", + "resolved": "https://registry.npmjs.org/node-gyp-build/-/node-gyp-build-4.8.4.tgz", + "integrity": "sha512-LA4ZjwlnUblHVgq0oBF3Jl/6h/Nvs5fzBLwdEF4nuxnFdsfajde4WfxtJr3CaiH+F6ewcIB/q4jQ4UzPyid+CQ==", + "license": "MIT", + "bin": { + "node-gyp-build": "bin.js", + "node-gyp-build-optional": "optional.js", + "node-gyp-build-test": "build-test.js" + } + }, "node_modules/openai": { "version": "6.22.0", "resolved": "https://registry.npmjs.org/openai/-/openai-6.22.0.tgz", @@ -517,6 +540,55 @@ "node": ">=12.17" } }, + "node_modules/tree-sitter": { + "version": "0.25.0", + "resolved": "https://registry.npmjs.org/tree-sitter/-/tree-sitter-0.25.0.tgz", + "integrity": "sha512-PGZZzFW63eElZJDe/b/R/LbsjDDYJa5UEjLZJB59RQsMX+fo0j54fqBPn1MGKav/QNa0JR0zBiVaikYDWCj5KQ==", + "hasInstallScript": true, + "license": "MIT", + "dependencies": { + "node-addon-api": "^8.3.0", + "node-gyp-build": "^4.8.4" + } + }, + "node_modules/tree-sitter-javascript": { + "version": "0.25.0", + "resolved": "https://registry.npmjs.org/tree-sitter-javascript/-/tree-sitter-javascript-0.25.0.tgz", + "integrity": "sha512-1fCbmzAskZkxcZzN41sFZ2br2iqTYP3tKls1b/HKGNPQUVOpsUxpmGxdN/wMqAk3jYZnYBR1dd/y/0avMeU7dw==", + "hasInstallScript": true, + "license": "MIT", + "dependencies": { + "node-addon-api": "^8.3.1", + "node-gyp-build": "^4.8.4" + }, + "peerDependencies": { + "tree-sitter": "^0.25.0" + }, + "peerDependenciesMeta": { + "tree-sitter": { + "optional": true + } + } + }, + "node_modules/tree-sitter-python": { + "version": "0.25.0", + "resolved": "https://registry.npmjs.org/tree-sitter-python/-/tree-sitter-python-0.25.0.tgz", + "integrity": "sha512-eCmJx6zQa35GxaCtQD+wXHOhYqBxEL+bp71W/s3fcDMu06MrtzkVXR437dRrCrbrDbyLuUDJpAgycs7ncngLXw==", + "hasInstallScript": true, + "license": "MIT", + "dependencies": { + "node-addon-api": "^8.5.0", + "node-gyp-build": "^4.8.4" + }, + "peerDependencies": { + "tree-sitter": "^0.25.0" + }, + "peerDependenciesMeta": { + "tree-sitter": { + "optional": true + } + } + }, "node_modules/tslib": { "version": "2.8.1", "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz", diff --git a/package.json b/package.json index fbcb9d98..c091d619 100644 --- a/package.json +++ b/package.json @@ -43,7 +43,10 @@ "apache-arrow": "18.1.0", "json5": "^2.2.3", "openai": "^6.21.0", - "proper-lockfile": "^4.1.2" + "proper-lockfile": "^4.1.2", + "tree-sitter": "^0.25.0", + "tree-sitter-javascript": "^0.25.0", + "tree-sitter-python": "^0.25.0" }, "openclaw": { "extensions": [ diff --git a/scripts/ci-test-manifest.mjs b/scripts/ci-test-manifest.mjs index fc6435dc..16ea8ac9 100644 --- a/scripts/ci-test-manifest.mjs +++ b/scripts/ci-test-manifest.mjs @@ -48,6 +48,8 @@ export const CI_TEST_MANIFEST = [ { group: "core-regression", runner: "node", file: "test/temporal-awareness.test.mjs", args: ["--test"] }, // Issue #598 regression tests { group: "core-regression", runner: "node", file: "test/store-serialization.test.mjs" }, + // Issue #692: AST-based semantic chunking + { group: "core-regression", runner: "node", file: "test/ast-code-chunking.test.mjs" }, { group: "core-regression", runner: "node", file: "test/access-tracker-retry.test.mjs" }, { group: "core-regression", runner: "node", file: "test/embedder-cache.test.mjs" }, // Issue #629 batch embedding fix @@ -62,6 +64,9 @@ export const CI_TEST_MANIFEST = [ // Issue #492 agentId validation tests { group: "core-regression", runner: "node", file: "test/agentid-validation.test.mjs", args: ["--test"] }, { group: "core-regression", runner: "node", file: "test/command-reflection-guard.test.mjs", args: ["--test"] }, + // Issue #693 extraction write validation tests + { group: "core-regression", runner: "node", file: "test/extraction-validation.test.mjs", args: ["--test"] }, + { group: "core-regression", runner: "node", file: "test/dedup-false-alarm.test.mjs", args: ["--test"] }, ]; export function getEntriesForGroup(group) { diff --git a/src/chunker.ts b/src/chunker.ts index 8bb4dee6..a6aca426 100644 --- a/src/chunker.ts +++ b/src/chunker.ts @@ -37,6 +37,8 @@ export interface ChunkerConfig { semanticSplit: boolean; /** Max lines per chunk before we try to split earlier on a line boundary. */ maxLinesPerChunk: number; + /** Use AST-aware splitting for code blocks (default: true). */ + astAwareCodeSplit?: boolean; } // Common embedding context limits (provider/model specific). These are typically @@ -188,6 +190,237 @@ function getCjkRatio(text: string): number { const CJK_CHAR_TOKEN_DIVISOR = 2.5; const CJK_RATIO_THRESHOLD = 0.3; +// ============================================================================ +// AST-aware Code Chunking +// ============================================================================ + +export type CodeLanguage = 'javascript' | 'typescript' | 'python' | 'go' | 'rust'; + +const CODE_LANGUAGE_PATTERNS: Array<{ pattern: RegExp; lang: CodeLanguage }> = [ + // Python: must check before JS (def/class are specific) + { + pattern: /\b(def\s|class\s|import\s|from\s|async\s+def\s|print\()/, + lang: 'python', + }, + // Go: func and package keywords + { + pattern: /\b(func\s|package\s|import\s")/, + lang: 'go', + }, + // Rust: fn/impl/pub are distinct + { + pattern: /\bfn\s|impl\s|pub\s|let\s+mut\s/, + lang: 'rust', + }, + // TypeScript: interface / type alias / : type annotations (check before JS 'function') + { + pattern: /\b(interface\s|type\s+|:\s*(?:string|number|boolean|unknown|never|any|void|object|Error|Promise|Record|Array|Map|Set)\b)/, + lang: 'typescript', + }, + // JavaScript / TypeScript: function, const/let/var, arrow, import/export, class + { + pattern: /\b(function|const\s|let\s|var\s|=>|import\s|export\s|class\s)/, + lang: 'javascript', + }, +]; + +/** + * Detect if text is code and return the language, or null if not code. + * Uses only the first 200 chars to avoid being misled by comments. + */ +export function detectCodeLanguage(text: string): CodeLanguage | null { + const sample = text.slice(0, 400); + for (const { pattern, lang } of CODE_LANGUAGE_PATTERNS) { + if (pattern.test(sample)) return lang; + } + return null; +} + +// Supported top-level declaration node types per language +const JS_DECLARATION_TYPES = new Set([ + 'function_declaration', + 'class_declaration', + 'method_definition', + 'arrow_function', + 'export_statement', + 'export_default_declaration', + 'interface_declaration', + 'type_alias_declaration', + 'lexical_declaration', // const/let declarations + 'variable_declaration', +]); + +const PYTHON_DECLARATION_TYPES = new Set([ + 'function_definition', + 'class_definition', + 'decorated_definition', +]); + +function isDeclarationNode(node: { type: string }, lang: CodeLanguage): boolean { + if (lang === 'javascript' || lang === 'typescript') { + return JS_DECLARATION_TYPES.has(node.type); + } + if (lang === 'python') return PYTHON_DECLARATION_TYPES.has(node.type); + return false; +} + +/** + * Sub-split an oversized declaration at the statement level. + * Falls back to chunkDocument for the sub-split logic. + */ +function subChunk(text: string, config: ChunkerConfig): ChunkResult { + // For now, fall back to the character-based chunker within an oversized declaration. + // This preserves the existing behavior for sub-chunks while ensuring top-level + // declarations (functions/classes) are kept intact. + return chunkDocument(text, config); +} + +/** + * AST-aware chunker for code. Parses the code with tree-sitter and splits + * on top-level declaration boundaries (function, class, etc.) instead of + * arbitrary character positions. + * + * NOTE: This function is synchronous to match the sync signature of smartChunk. + * tree-sitter is loaded via require() with a try-catch fallback. + */ +export function astChunk( + code: string, + language: CodeLanguage, + config: ChunkerConfig +): ChunkResult { + // Attempt to load tree-sitter and language grammars + let LanguageMap: Record; + // tree-sitter exports Parser as the default export (module.exports = Parser) + // eslint-disable-next-line @typescript-eslint/no-var-requires + let TreeSitterParser: any; + + try { + TreeSitterParser = require('tree-sitter'); + + if (language === 'javascript' || language === 'typescript') { + // eslint-disable-next-line @typescript-eslint/no-var-requires + const JavaScript = require('tree-sitter-javascript'); + LanguageMap = { javascript: JavaScript, typescript: JavaScript }; + } else if (language === 'python') { + // eslint-disable-next-line @typescript-eslint/no-var-requires + const Python = require('tree-sitter-python'); + LanguageMap = { python: Python }; + } else { + // Unsupported language — fall back + return chunkDocument(code, config); + } + } catch { + // tree-sitter not installed — fall back to character-based chunking + return chunkDocument(code, config); + } + + const parser = new TreeSitterParser(); + const chunks: string[] = []; + const metadatas: ChunkMetadata[] = []; + + // Set language on the parser + let languageSet = false; + for (const [, langModule] of Object.entries(LanguageMap)) { + try { + parser.setLanguage(langModule); + languageSet = true; + break; + } catch { + // try next language + } + } + + if (!languageSet) { + return chunkDocument(code, config); + } + + let tree: any; + try { + tree = parser.parse(code); + } catch { + return chunkDocument(code, config); + } + + const root = tree.rootNode; + + // If there are ERROR nodes at the top level, the language parser likely does not + // support this syntax (e.g., TypeScript interface parsed by tree-sitter-javascript). + // Fall back to chunkDocument to avoid producing broken/incomplete chunks. + const hasErrorNodes = root.children.some(c => c.type === 'ERROR'); + if (hasErrorNodes) { + return chunkDocument(code, config); + } + + // Collect non-declaration content (comments, imports, etc.) that would otherwise be lost. + // These are prepended to the next declaration chunk to preserve no-content-left-behind semantics. + let pendingNonDecl = ''; + + // Walk top-level children + for (const child of root.children) { + // Skip non-named nodes and ERROR nodes + if (!child.type || child.type === 'ERROR') continue; + + if (!isDeclarationNode(child, language)) { + // Collect non-declaration content (comments, imports, exports, etc.) + const text = code.slice(child.startIndex, child.endIndex); + if (text.length > 0) { + pendingNonDecl += (pendingNonDecl.length > 0 ? '\n' : '') + text; + } + continue; + } + + const text = code.slice(child.startIndex, child.endIndex); + + if (text.length === 0) continue; + + // Prepend any pending non-declaration content to this declaration chunk + const fullText = pendingNonDecl.length > 0 ? pendingNonDecl + '\n' + text : text; + pendingNonDecl = ''; // reset + + if (fullText.length <= config.maxChunkSize) { + chunks.push(fullText); + metadatas.push({ + startIndex: child.startIndex, + endIndex: child.endIndex, + length: fullText.length, + }); + } else { + // Oversized declaration with prepended content. + // We accept that this chunk may exceed maxChunkSize — splitting + // mid-declaration would break { } balance (Issue #692). + // Sub-splitting at statement level is Phase 2 work. + chunks.push(fullText); + metadatas.push({ + startIndex: child.startIndex, + endIndex: child.endIndex, + length: fullText.length, + }); + } + } + + // If there is trailing non-declaration content (e.g., trailing comments with no following decl), + // emit it as its own chunk (fall back to chunkDocument to handle sizing). + if (pendingNonDecl.length > 0) { + const trailing = chunkDocument(pendingNonDecl, config); + for (let i = 0; i < trailing.chunks.length; i++) { + chunks.push(trailing.chunks[i]); + metadatas.push(trailing.metadatas[i]); + } + } + + // If we got nothing (e.g. empty file, parse error), fall back + if (chunks.length === 0) { + return chunkDocument(code, config); + } + + return { + chunks, + metadatas, + totalOriginalLength: code.length, + chunkCount: chunks.length, + }; +} + // ============================================================================ // Chunking Core // ============================================================================ @@ -276,8 +509,17 @@ export function smartChunk(text: string, embedderModel?: string): ChunkResult { minChunkSize: Math.max(100, Math.floor(base * 0.1 / divisor)), semanticSplit: true, maxLinesPerChunk: 50, + astAwareCodeSplit: true, }; + // AST-aware code path: only activate when explicitly enabled + if (config.astAwareCodeSplit === true) { + const lang = detectCodeLanguage(text); + if (lang !== null) { + return astChunk(text, lang, config); + } + } + return chunkDocument(text, config); } diff --git a/src/memory-categories.ts b/src/memory-categories.ts index 7edc7f53..af6604e9 100644 --- a/src/memory-categories.ts +++ b/src/memory-categories.ts @@ -76,6 +76,24 @@ export type ExtractionStats = { superseded?: number; // temporal fact replacements }; +/** + * Payload delivered to `ExtractPersistOptions.onExtractionValidationFailed` + * when the number of entries actually written to the store differs from + * the number of candidates produced by the LLM. + * + * @see ExtractPersistOptions.onExtractionValidationFailed + */ +export type ExtractionValidation = { + /** Number of candidates the LLM intended to create (createEntries.length) */ + expected: number; + /** Number of rows actually written (countAfter - countBefore) */ + actual: number; + /** expected - actual; positive = under-write, negative = over-write (concurrent delete) */ + mismatch: number; + /** Session key passed to extractAndPersist */ + sessionKey: string; +}; + /** Validate and normalize a category string. */ export function normalizeCategory(raw: string): MemoryCategory | null { const lower = raw.toLowerCase().trim(); diff --git a/src/smart-extractor.ts b/src/smart-extractor.ts index 11354ae6..e8e4da16 100644 --- a/src/smart-extractor.ts +++ b/src/smart-extractor.ts @@ -273,6 +273,23 @@ export interface ExtractPersistOptions { * - pass a non-empty array to restrict reads to those scopes */ scopeFilter?: string[]; + /** + * Callback invoked when the number of entries actually written to the store + * differs from the number of LLM-generated candidates. + * + * This detects write-path anomalies including: + * - SIGKILL / OOM during bulkStore (partial write) + * - Concurrent compactor deletions (count window between bulkStore and count()) + * + * The callback is NOT invoked when createEntries is empty (no-op write). + * The mismatch field: positive = under-write, negative = over-write (rare). + */ + onExtractionValidationFailed?: (validation: { + expected: number; + actual: number; + mismatch: number; + sessionKey: string; + }) => void; } export class SmartExtractor { @@ -441,7 +458,30 @@ export class SmartExtractor { } if (createEntries.length > 0) { + // Phase 1: Verify write success via count-before/after. + // bulkStore is atomic per entry (randomUUID + table.add), so any + // discrepancy here indicates partial failure (SIGKILL/OOM). + // LIMITATION: This count-diff approach is only reliable under single- + // process assumption (no concurrent compactor/session writes during + // extractAndPersist). Concurrent writes inflate actualCreated and can + // cause false negatives (mismatch not detected). This is acceptable for + // Phase 1 since plugin architecture guarantees single-process extraction. + // Phase 2 will address concurrent-safe validation (UUID list compare). + const countBefore = await this.store.count(); await this.store.bulkStore(createEntries); + const countAfter = await this.store.count(); + const actualCreated = countAfter - countBefore; + const expectedCreated = createEntries.length; + + if (actualCreated !== expectedCreated) { + const mismatch = expectedCreated - actualCreated; + options.onExtractionValidationFailed?.({ + expected: expectedCreated, + actual: actualCreated, + mismatch, + sessionKey, + }); + } } return stats; diff --git a/test/ast-code-chunking.test.mjs b/test/ast-code-chunking.test.mjs new file mode 100644 index 00000000..6d0fa2ca --- /dev/null +++ b/test/ast-code-chunking.test.mjs @@ -0,0 +1,296 @@ +/** + * AST-aware Code Chunking Tests (Issue #692) + * + * Verifies that code declarations (functions, classes) are NOT split mid- + * declaration, which was breaking { } balance when the old character-based + * splitter cut through the middle of a function body. + */ + +import { describe, it, mock, beforeEach } from 'node:test'; +import assert from 'node:assert/strict'; +import jitiFactory from 'jiti'; + +const jiti = jitiFactory(import.meta.url, { interopDefault: true }); +const { detectCodeLanguage, astChunk, smartChunk, chunkDocument, DEFAULT_CHUNKER_CONFIG } = jiti('../src/chunker.ts'); + +// ============================================================================ +// detectCodeLanguage +// ============================================================================ + +describe('detectCodeLanguage', () => { + it('detects JavaScript function', () => { + const code = 'async function handleUserLogin(userId, password) {'; + assert.equal(detectCodeLanguage(code), 'javascript'); + }); + + it('detects TypeScript interface', () => { + const code = 'interface UserProfile { name: string; age: number; }'; + assert.equal(detectCodeLanguage(code), 'typescript'); + }); + + it('detects Python function', () => { + const code = 'def verify_password(password: str, hashed: bytes) -> bool:'; + assert.equal(detectCodeLanguage(code), 'python'); + }); + + it('detects Go function', () => { + const code = 'func handleLogin(w http.ResponseWriter, r *http.Request) {'; + assert.equal(detectCodeLanguage(code), 'go'); + }); + + it('detects Rust function', () => { + const code = 'fn verify_password(password: &str, hash: &str) -> bool {'; + assert.equal(detectCodeLanguage(code), 'rust'); + }); + + it('returns null for plain text', () => { + const text = 'This is a plain English sentence with no code markers.'; + assert.equal(detectCodeLanguage(text), null); + }); + + it('returns null for Markdown prose', () => { + const md = '# Heading\n\nThis is a paragraph with **bold** text.'; + assert.equal(detectCodeLanguage(md), null); + }); + + it('uses only first 400 chars to avoid comment noise', () => { + // Short comment so 'function' appears within first 400 chars of the sample + const commentLine = '// This is a comment\n'; // 20 chars + const code = commentLine.repeat(15) + 'function foo() {}'; // ~300 + function + assert.equal(detectCodeLanguage(code), 'javascript'); + }); +}); + +// ============================================================================ +// Brace balance helper +// ============================================================================ + +/** Count net open braces inside a string. */ +function braceDelta(s) { + let d = 0; + for (const ch of s) { + if (ch === '{') d++; + else if (ch === '}') d--; + } + return d; +} + +/** Check all chunks are brace-balanced. */ +function assertBraceBalanced(chunks, label) { + const deltas = chunks.map(c => braceDelta(c)); + const total = deltas.reduce((a, b) => a + b, 0); + assert.equal(total, 0, `${label}: unbalanced braces across chunks (net=${total}, deltas=${JSON.stringify(deltas)})`); + for (let i = 0; i < deltas.length; i++) { + assert(deltas[i] >= 0, + `${label}: chunk[${i}] closes more braces than it opens (delta=${deltas[i]})`); + } +} + +// ============================================================================ +// Issue #692 — core destructive cases +// ============================================================================ + +describe('Issue #692: code functions must not be split mid-declaration', () => { + + it('verifies that a simple async function is kept whole', () => { + const code = `async function verifyPassword(password, hash) { + const match = await bcrypt.compare(password, hash); + return match; +}`; + + // Very small maxChunkSize to force splitting — old splitter would cut mid-function + const config = { ...DEFAULT_CHUNKER_CONFIG, maxChunkSize: 60, minChunkSize: 10, semanticSplit: false }; + const result = astChunk(code, 'javascript', config); + + // Function body should not be split mid-declaration + const splitInsideFunction = result.chunks.some(chunk => { + // Should not have "{" without corresponding "}" + const d = braceDelta(chunk); + return d > 0; // opens braces but never closes + }); + assert.ok(!splitInsideFunction, 'Should not split inside a function declaration'); + assertBraceBalanced(result.chunks, 'verifyPassword'); + }); + + it('verifies that a long function is NOT split mid-function (maxChunkSize < function length)', () => { + // This function is ~250 chars — set maxChunkSize=120 to force the issue. + // Oversized functions are kept as ONE atomic chunk (no mid-function split). + const code = `async function handleUserLogin(userId, password) { + const user = await db.users.findOne({ id: userId }); + if (!user) throw new Error('User not found'); + const match = await bcrypt.compare(password, user.hash); + return match; +}`; + + const config = { ...DEFAULT_CHUNKER_CONFIG, maxChunkSize: 120, minChunkSize: 40, semanticSplit: false }; + const result = astChunk(code, 'javascript', config); + + assertBraceBalanced(result.chunks, 'handleUserLogin'); + // Should be 1 chunk — entire function kept intact + assert.ok(result.chunks.length === 1, `Expected 1 chunk (entire function), got ${result.chunks.length}`); + }); + + it('verifies that multiple small functions are each kept whole', () => { + const code = `async function verifyPassword(password, hash) { + return await bcrypt.compare(password, hash); +} + +async function hashPassword(password) { + return await bcrypt.hash(password, 10); +} + +export async function createUser(name, email, password) { + const hash = await hashPassword(password); + return await db.users.create({ name, email, hash }); +}`; + + const config = { ...DEFAULT_CHUNKER_CONFIG, maxChunkSize: 150, minChunkSize: 40, semanticSplit: false }; + const result = astChunk(code, 'javascript', config); + + assertBraceBalanced(result.chunks, 'multiple functions'); + // All three functions should appear intact in some chunk + assert.ok(result.chunks.some(c => c.includes('function verifyPassword')), 'verifyPassword missing'); + assert.ok(result.chunks.some(c => c.includes('function hashPassword')), 'hashPassword missing'); + assert.ok(result.chunks.some(c => c.includes('function createUser')), 'createUser missing'); + }); + + it('smartChunk: entire JavaScript file with functions stays brace-balanced', () => { + const code = `const SPEC = { + name: 'auth', + version: '1.0.0', +}; + +async function login(email, password) { + const user = await db.findUser(email); + const ok = await bcrypt.compare(password, user.hash); + if (!ok) throw new Error('Invalid credentials'); + return { token: signToken(user.id) }; +} + +async function logout(token) { + invalidateToken(token); +}`; + + const result = smartChunk(code, 'text-embedding-3-small'); + assertBraceBalanced(result.chunks, 'smartChunk JS'); + }); + + it('smartChunk: Python function stays syntactically coherent', () => { + const code = `def verify_password(password: str, hashed: bytes) -> bool: + return pwd_context.verify(password, hashed) + +def hash_password(password: str) -> str: + return pwd_context.hash(password)`; + + const result = smartChunk(code, 'text-embedding-3-small'); + assert.ok(result.chunks.length >= 1, 'Should produce at least one chunk'); + // Python chunks should contain complete function definitions + assert.ok(result.chunks.every(c => c.trim().length > 0), 'No empty chunks'); + }); +}); + +// ============================================================================ +// astChunk — fallback & edge cases +// ============================================================================ + +describe('astChunk fallback behavior', () => { + + it('falls back to chunkDocument when tree-sitter throws', () => { + // Pass an empty string to force parse error + const code = ''; + const config = { ...DEFAULT_CHUNKER_CONFIG, maxChunkSize: 50 }; + const result = astChunk(code, 'javascript', config); + // Should return a valid ChunkResult (fallback path) + assert.ok('chunks' in result); + assert.ok('chunkCount' in result); + }); + + it('returns chunkDocument result when language is unsupported', () => { + const code = 'fn main() {}'; // not JS/TS/Python + const config = { ...DEFAULT_CHUNKER_CONFIG, maxChunkSize: 50 }; + const result = astChunk(code, 'rust', config); + // Rust is not yet supported in astChunk — falls back + assert.ok('chunks' in result); + }); + + it('handles an oversized single declaration as one atomic chunk (brace-balanced)', () => { + // A very long function that exceeds maxChunkSize — should stay as ONE chunk + const body = ' return x + y;\n'.repeat(200); + const code = `function processData(x, y) {\n${body}}`; + + const config = { ...DEFAULT_CHUNKER_CONFIG, maxChunkSize: 200, minChunkSize: 50, semanticSplit: false }; + const result = astChunk(code, 'javascript', config); + + // Should be 1 chunk — entire function kept as one + assert.ok(result.chunks.length === 1, `Expected 1 chunk, got ${result.chunks.length}`); + assertBraceBalanced(result.chunks, 'oversized function atomic chunk'); + }); +}); + +// ============================================================================ +// smartChunk — non-code text unchanged +// ============================================================================ + +describe('smartChunk preserves non-code behavior', () => { + + it('passes plain English text to chunkDocument (not astChunk)', () => { + const text = 'This is a plain English paragraph. It has sentences. They end with periods. '.repeat(30); + + const result = smartChunk(text, 'text-embedding-3-small'); + + assert.ok(result.chunks.length >= 1, 'Should produce chunks'); + // Plain text should be split on sentence boundaries (semanticSplit=true default) + }); + + it('passes Markdown prose to chunkDocument', () => { + const md = '# Title\n\nThis is a paragraph.\n\n## Section\n\nAnother paragraph here.\n'.repeat(20); + + const result = smartChunk(md, 'text-embedding-3-small'); + + assert.ok(result.chunks.length >= 1, 'Should produce chunks'); + assert.equal(detectCodeLanguage(md), null, 'Markdown should not be detected as code'); + }); +}); + +// ============================================================================ +// TypeScript interface chunking +// ============================================================================ + +describe('TypeScript interfaces and types', () => { + + it('smartChunk: TypeScript interface stays balanced (via smartChunk, not direct astChunk)', () => { + // Note: tree-sitter-javascript cannot fully parse TS interface declarations as one unit. + // When astChunk falls back to chunkDocument for an oversized TS interface, + // it may produce multiple chunks. smartChunk avoids this by using a large + // enough maxChunkSize that the whole interface fits in one chunk. + const code = `interface UserProfile { + id: string; + name: string; + email: string; + createdAt: Date; + metadata?: Record; +}`; + + const result = smartChunk(code, 'text-embedding-3-small'); + // The interface declaration should produce at least one chunk + assert.ok(result.chunks.length >= 1, 'Should produce at least one chunk'); + assertBraceBalanced(result.chunks, 'smartChunk TS interface'); + assert.ok(result.chunks.some(c => c.includes('interface UserProfile')), 'interface should be present'); + }); + + it('smartChunk on TypeScript stays balanced', () => { + const code = `type UserID = string; + +interface Config { + apiKey: string; + timeout: number; +} + +function getConfig(): Config { + return { apiKey: process.env.KEY, timeout: 5000 }; +}`; + + const result = smartChunk(code, 'text-embedding-3-small'); + assertBraceBalanced(result.chunks, 'smartChunk TS'); + }); +}); diff --git a/test/dedup-false-alarm.test.mjs b/test/dedup-false-alarm.test.mjs new file mode 100644 index 00000000..80d77ee0 --- /dev/null +++ b/test/dedup-false-alarm.test.mjs @@ -0,0 +1,165 @@ +/** + * P0 驗證測試:確認 bulkStore 不會因 batchDedup 去重 + * + * 問題:是否 bulkStore 內部呼叫 batchDedup,導致 near-duplicate entries + * 被錯誤過濾,造成 countAfter - countBefore < createEntries.length? + * + * 測試策略: + * 1. 直接構造兩個 cosine similarity = 0.95 的向量(保證是 near-duplicate) + * 2. 用 batchDedup 確認它們確實被視為 duplicates + * 3. 透過 bulkStore 寫入這兩個 entry + * 4. 驗證 count 增加 2(而非 1)→ 確認 bulkStore 不去重 + */ + +import { describe, it, afterEach } from "node:test"; +import assert from "node:assert/strict"; +import jitiFactory from "jiti"; + +const jiti = jitiFactory(import.meta.url, { interopDefault: true }); +const { MemoryStore } = jiti("../src/store.ts"); +const { batchDedup } = jiti("../src/batch-dedup.ts"); + +/** 256-dim constant vector with a controlled cosine similarity to a base vector */ +function makeNearDuplicateVector(baseVec, similarity = 0.95) { + const dim = baseVec.length; + // Scale factor: cosine = scale * |base| / |target| = scale (since |base|=|target|=1) + // Actually: cos(base, target) = scale when target = scale * base + orth + const scale = similarity; + return baseVec.map(v => v * scale); +} + +/** Two identical vectors — maximum similarity */ +function makeIdenticalVector(dim = 256) { + const rng = makeRng(42); + return Array.from({ length: dim }, () => rng()); +} + +/** Seeded LCG RNG for deterministic vectors */ +function makeRng(seed) { + let s = seed >>> 0; + return () => { + s = (Math.imul(1664525, s) + 1013904223) >>> 0; + return s / 0x100000000; + }; +} + +/** Cosine similarity between two vectors */ +function cosineSimilarity(a, b) { + let dot = 0, normA = 0, normB = 0; + for (let i = 0; i < a.length; i++) { + dot += a[i] * b[i]; + normA += a[i] * a[i]; + normB += b[i] * b[i]; + } + return dot / (Math.sqrt(normA) * Math.sqrt(normB)); +} + +const TEST_DB_PREFIX = "/tmp/test-dedup-p0-"; + +describe("P0: bulkStore does NOT deduplicate near-duplicate entries", () => { + /** @type {MemoryStore} */ + let store; + let dbPath; + + afterEach(async () => { + if (store) { + try { await store.deleteAll("test-session"); } catch {} + try { await store.destroy(); } catch {} + } + }); + + it("bulkStore writes both near-duplicate entries (cosine = 0.95)", async () => { + // Step 1: Create base vector and a near-duplicate (cosine = 0.95) + const dim = 256; + const baseVec = makeIdenticalVector(dim); + const dupVec = makeNearDuplicateVector(baseVec, 0.95); + + const cosSim = cosineSimilarity(baseVec, dupVec); + console.log(`[P0] cosine similarity between base and near-duplicate: ${cosSim.toFixed(4)}`); + assert(cosSim > 0.94, `Near-duplicate should have cosine > 0.94, got ${cosSim}`); + + // Step 2: Verify batchDedup marks one as duplicate + const dedupResult = batchDedup( + ["abstract one", "abstract two"], + [baseVec, dupVec], + 0.85 // default threshold + ); + console.log(`[P0] batchDedup: ${dedupResult.inputCount} → ${dedupResult.outputCount}, duplicates=${JSON.stringify(dedupResult.duplicateIndices)}`); + assert(dedupResult.outputCount < dedupResult.inputCount, + `batchDedup should mark one as duplicate (input=${dedupResult.inputCount}, output=${dedupResult.outputCount})`); + + // Step 3: Create MemoryStore and write both via bulkStore + dbPath = TEST_DB_PREFIX + Date.now() + "-1"; + store = new MemoryStore({ dbPath, vectorDim: dim }); + const countBefore = await store.count(); + + await store.bulkStore([ + { + text: "Meeting attendance — quarterly business review", + vector: baseVec, + category: "fact", + scope: "test-session", + importance: 0.5, + metadata: JSON.stringify({ l0_abstract: "abstract one" }), + }, + { + text: "Quarterly business review with team lead", + vector: dupVec, + category: "fact", + scope: "test-session", + importance: 0.5, + metadata: JSON.stringify({ l0_abstract: "abstract two" }), + }, + ]); + + const countAfter = await store.count(); + const delta = countAfter - countBefore; + + console.log(`[P0 result] countBefore=${countBefore}, countAfter=${countAfter}, delta=${delta}`); + + // KEY ASSERTION: delta should be exactly 2 — bulkStore does NOT dedupe + assert.strictEqual(delta, 2, + `bulkStore should write both entries (delta=2), got delta=${delta}. ` + + `If delta=1, bulkStore is internally deduplicating near-duplicate entries — this is a P0 bug.`); + }); + + it("bulkStore writes all 5 entries even when batchDedup would reduce them to 1", async () => { + const dim = 256; + + // Create 5 identical vectors — batchDedup with threshold 0.85 will keep only 1 + const baseVec = makeIdenticalVector(dim); + const vectors = Array.from({ length: 5 }, () => baseVec); + + const dedupResult = batchDedup( + Array(5).fill("abstract"), + vectors, + 0.85 + ); + console.log(`[P0 batch] batchDedup: 5 identical vectors → ${dedupResult.outputCount} survivors`); + assert(dedupResult.outputCount < 5, + "Sanity: 5 identical vectors should produce < 5 survivors in batchDedup"); + + // Now write all 5 via bulkStore + dbPath = TEST_DB_PREFIX + Date.now() + "-2"; + store = new MemoryStore({ dbPath, vectorDim: dim }); + const countBefore = await store.count(); + + await store.bulkStore(vectors.map((v, i) => ({ + text: `Event ${i + 1}`, + vector: v, + category: "fact", + scope: "test-session", + importance: 0.5, + metadata: JSON.stringify({ l0_abstract: `abstract ${i}` }), + }))); + + const countAfter = await store.count(); + const delta = countAfter - countBefore; + + console.log(`[P0 batch result] countBefore=${countBefore}, countAfter=${countAfter}, delta=${delta}`); + + // KEY ASSERTION: all 5 should be written despite batchDedup saying they're duplicates + assert.strictEqual(delta, 5, + `bulkStore should write all 5 entries even if batchDedup would drop 4 of them (delta=5), got delta=${delta}`); + }); +}); diff --git a/test/extraction-validation.test.mjs b/test/extraction-validation.test.mjs new file mode 100644 index 00000000..1bd913ed --- /dev/null +++ b/test/extraction-validation.test.mjs @@ -0,0 +1,445 @@ +/** + * Test: Extraction Write Validation (Issue #693) + * + * Tests the countBefore/countAfter validation logic in extractAndPersist(). + * + * Key challenge: SmartExtractor's `batchDedup` uses cosine similarity (threshold 0.85) + * on embedded candidate abstracts to filter near-duplicates BEFORE the dedup pipeline. + * Simple charCodeAt-based vectors all score >0.85 similarity (common English letters). + * + * Solution: `DeterministicRandomEmbedder` generates 256-dim vectors using a + * seeded RNG (Mulberry32) keyed on the text content. Each distinct text produces + * a unique vector with cosine similarity < 0.85, ensuring candidates survive batchDedup. + * + * Validates: + * T1. Normal extraction: expected === actual, mismatch = 0, callback NOT triggered + * T2. Empty extraction: skipped, mismatch undefined (validation skipped) + * T3. Partial bulkStore failure: actual < expected → mismatch > 0, callback triggered + * T4. Post-write deletion (compactor race): actual < expected → mismatch > 0 + * T5. Callback is optional — no error if omitted even on mismatch + * T6. Multiple extractions each get independent validation state + */ + +import { describe, it } from "node:test"; +import assert from "node:assert/strict"; +import jitiFactory from "jiti"; + +const jiti = jitiFactory(import.meta.url, { interopDefault: true }); +const { SmartExtractor } = jiti("../src/smart-extractor.ts"); + +// ============================================================================ +// Helpers +// ============================================================================ + +/** + * Mulberry32 seeded PRNG — fast, deterministic, good distribution. + * Returns a new RNG function seeded from an integer. + */ +function makeRng(seed) { + let s = seed >>> 0; + return () => { + s |= 0; + s = (s + 0x6d2b79f5) | 0; + let t = Math.imul(s ^ (s >>> 15), 1 | s); + t = (t + Math.imul(t ^ (t >>> 7), 61 | t)) ^ t; + return ((t ^ (t >>> 14)) >>> 0) / 4294967296; + }; +} + +/** + * Deterministic random embedder. + * + * Produces 256-dimensional vectors using a seeded RNG keyed on the text. + * Different texts → different seeds → vectors with cosine similarity < 0.85, + * ensuring candidates survive SmartExtractor's internal batchDedup filter. + * + * This lets us test with 2+ candidates without fighting the dedup logic. + */ +function makeDeterministicEmbedder() { + return { + async embed(text) { + const seed = [...text].reduce((acc, c) => acc + c.charCodeAt(0), 0) >>> 0; + const rng = makeRng(seed === 0 ? 1 : seed); + return Array.from({ length: 256 }, () => rng()); + }, + async embedBatch(texts) { + return texts.map((t) => { + const seed = [...t].reduce((acc, c) => acc + c.charCodeAt(0), 0) >>> 0; + const rng = makeRng(seed === 0 ? 1 : seed); + return Array.from({ length: 256 }, () => rng()); + }); + }, + }; +} + +/** + * Mock LLM — returns configurable candidates and "create" for all dedup decisions. + * The "create" decision ensures candidates progress through the dedup pipeline + * and reach bulkStore without special handling (no handleSupersede, handleMerge, etc.). + */ +function makeLlm(candidates = []) { + return { + async completeJson(_prompt, mode) { + if (mode === "extract-candidates") return { memories: candidates }; + if (mode === "dedup-decision") return { decision: "create", reason: "no match" }; + if (mode === "merge-memory") return candidates[0] ?? null; + return null; + }, + }; +} + +/** + * Mock store with configurable write behavior. + * + * Config: + * initialCount — starting row count (default 0) + * dropLastN — silently drop last N entries on bulkStore (partial write failure) + * bulkStoreThrows — throw on bulkStore (total write failure) + */ +function makeStore(config = {}) { + const { initialCount = 0, dropLastN = 0, bulkStoreThrows = false } = config; + let rowCount = initialCount; + const entries = []; + + const store = { + _config: config, + + async count() { return rowCount; }, + + async vectorSearch() { return []; }, + + async store(entry) { + rowCount++; + entries.push({ action: "store", id: entry.id ?? "?" }); + return { ...entry, id: "direct-id-" + entries.length }; + }, + + async bulkStore(batchEntries) { + if (bulkStoreThrows) throw new Error("bulkStore simulated failure"); + const stored = dropLastN > 0 + ? batchEntries.slice(0, batchEntries.length - dropLastN) + : batchEntries; + for (let i = 0; i < stored.length; i++) { + rowCount++; + entries.push({ action: "bulkStore", id: stored[i].id ?? "bulk-" + i }); + } + if (dropLastN > 0) entries.push({ action: "bulkStore_dropped", count: dropLastN }); + return stored.map((e, i) => ({ ...e, id: "bulk-id-" + i })); + }, + + async update(_id, _patch, _scopeFilter) { + entries.push({ action: "update", id: _id }); + }, + + async getById() { return null; }, + + async delete(_id) { + rowCount = Math.max(0, rowCount - 1); + entries.push({ action: "delete", id: _id }); + }, + + get entries() { return [...entries]; }, + get rowCount() { return rowCount; }, + reset() { rowCount = initialCount; entries.length = 0; }, + }; + return store; +} + +function makeExtractor(embedder, llm, store, config = {}) { + return new SmartExtractor(store, embedder, llm, { + user: "User", + extractMinMessages: 1, + extractMaxChars: 8000, + defaultScope: "global", + log() {}, + debugLog() {}, + ...config, + }); +} + +// ============================================================================ +// Tests +// ============================================================================ + +describe("Issue #693: Extraction write validation", () => { + + // -------------------------------------------------------------------------- + // T1: Normal extraction — expected === actual, mismatch = 0, no callback + // -------------------------------------------------------------------------- + it("T1: normal extraction passes validation with mismatch=0", async () => { + const embedder = makeDeterministicEmbedder(); + // Two semantically different abstracts → low cosine similarity → both survive batchDedup + const llm = makeLlm([ + { + category: "preferences", + abstract: "User prefers dark mode interface settings for eye comfort during coding", + overview: "Display preference", + content: "The user prefers dark mode interface settings on their workstation for reduced eye strain during extended coding sessions.", + }, + { + category: "entities", + abstract: "User works at Acme Corporation headquarters in the R&D division", + overview: "Employment information", + content: "The user is employed as a senior software engineer at Acme Corporation's research and development headquarters facility.", + }, + ]); + const store = makeStore({ initialCount: 0 }); + const extractor = makeExtractor(embedder, llm, store); + + let callbackInvoked = false; + let receivedValidation = null; + + const stats = await extractor.extractAndPersist( + "I use dark mode at work and work at Acme Corp", + "session-t1", + { + onExtractionValidationFailed(validation) { + callbackInvoked = true; + receivedValidation = validation; + }, + } + ); + + assert.strictEqual(stats.created, 2, "both candidates should be created"); + assert.strictEqual( + stats.validationMismatch, + undefined, + "validationMismatch should be undefined (not in public ExtractionStats)" + ); + assert.strictEqual(callbackInvoked, false, "callback should NOT be triggered"); + assert.strictEqual(store.rowCount, 2, "both entries should be written"); + assert.strictEqual(receivedValidation, null, "no validation object received"); + }); + + // -------------------------------------------------------------------------- + // T2: Empty extraction — no bulkStore, validation skipped + // -------------------------------------------------------------------------- + it("T2: empty extraction skips validation", async () => { + const embedder = makeDeterministicEmbedder(); + const llm = makeLlm([]); + const store = makeStore({ initialCount: 5 }); + const extractor = makeExtractor(embedder, llm, store); + + let callbackInvoked = false; + + const stats = await extractor.extractAndPersist( + "nothing to extract here", + "session-t2", + { onExtractionValidationFailed() { callbackInvoked = true; } } + ); + + assert.strictEqual(stats.created, 0, "no entries created"); + assert.strictEqual( + stats.validationMismatch, + undefined, + "validationMismatch should be undefined for empty extraction (validation skipped)" + ); + assert.strictEqual(callbackInvoked, false, "callback should NOT fire"); + assert.strictEqual(store.rowCount, 5, "pre-existing count unchanged"); + }); + + // -------------------------------------------------------------------------- + // T3: Partial bulkStore failure — actual < expected → mismatch > 0 + // -------------------------------------------------------------------------- + it("T3: partial bulkStore failure triggers mismatch > 0", async () => { + const embedder = makeDeterministicEmbedder(); + // 3 candidates → dropLastN=1 → actual=2, mismatch=1 + const llm = makeLlm([ + { + category: "preferences", + abstract: "User prefers dark mode interface settings for eye comfort during coding", + overview: "Display preference", + content: "The user prefers dark mode interface settings on their workstation for reduced eye strain during extended coding sessions.", + }, + { + category: "preferences", + abstract: "User prefers light theme when editing documents and writing emails", + overview: "Display preference", + content: "In contrast to dark mode, the user prefers light theme when editing documents and writing emails in their daily productivity workflow.", + }, + { + category: "entities", + abstract: "User works at Acme Corporation headquarters in the R&D division", + overview: "Employment information", + content: "The user is employed as a senior software engineer at Acme Corporation's research and development headquarters facility.", + }, + ]); + const store = makeStore({ initialCount: 0, dropLastN: 1 }); + const extractor = makeExtractor(embedder, llm, store); + + let callbackInvoked = false; + let receivedValidation = null; + + const stats = await extractor.extractAndPersist( + "I use dark mode for coding but light theme for writing emails at Acme Corp", + "session-t3", + { + onExtractionValidationFailed(validation) { + callbackInvoked = true; + receivedValidation = validation; + }, + } + ); + + // Expected = 3, Actual = 2 (dropLastN=1), Mismatch = 1 + // Note: validationMismatch is NOT written to stats (removed from public API) + // Only the callback receives the mismatch information + assert.strictEqual(callbackInvoked, true, "callback SHOULD be triggered"); + assert.ok(receivedValidation); + assert.strictEqual(receivedValidation.expected, 3); + assert.strictEqual(receivedValidation.actual, 2); + assert.strictEqual(receivedValidation.mismatch, 1); + assert.strictEqual(receivedValidation.sessionKey, "session-t3"); + assert.strictEqual(store.rowCount, 2, "only 2 rows written"); + }); + + // -------------------------------------------------------------------------- + // T4: Post-write deletion (compactor race) — actual < expected + // -------------------------------------------------------------------------- + it("T4: post-write deletion triggers mismatch > 0 (compactor race)", async () => { + const embedder = makeDeterministicEmbedder(); + // 2 candidates → compactor deletes 1 after bulkStore → actual=1, mismatch=1 + const llm = makeLlm([ + { + category: "cases", + abstract: "User completed initial setup wizard on first launch of the application", + overview: "Setup wizard completion", + content: "The user has successfully completed the initial setup wizard and application onboarding process during their first launch of the software application on their primary workstation.", + }, + { + category: "cases", + abstract: "User configured notification preferences including email and push alerts", + overview: "Notification settings configuration", + content: "Following the initial setup, the user proceeded to configure various notification preferences including email alerts, desktop push notifications, and mobile synchronization settings.", + }, + ]); + const store = makeStore({ initialCount: 0 }); + const extractor = makeExtractor(embedder, llm, store); + + // Simulate compactor deleting 1 entry after bulkStore succeeds + const originalBulkStore = store.bulkStore.bind(store); + store.bulkStore = async (entries) => { + const result = await originalBulkStore(entries); + await store.delete("bulk-id-0"); // compactor race: delete first entry + return result; + }; + + let callbackInvoked = false; + let receivedValidation = null; + + const stats = await extractor.extractAndPersist( + "I completed setup and configured notification preferences", + "session-t4", + { + onExtractionValidationFailed(validation) { + callbackInvoked = true; + receivedValidation = validation; + }, + } + ); + + // Expected = 2, Actual = 1 (compactor deleted 1), Mismatch = 1 + // Note: validationMismatch is NOT written to stats (removed from public API) + assert.strictEqual(callbackInvoked, true, "callback SHOULD be triggered"); + assert.ok(receivedValidation); + assert.strictEqual(receivedValidation.expected, 2); + assert.strictEqual(receivedValidation.actual, 1); + assert.strictEqual(receivedValidation.mismatch, 1); + assert.strictEqual(receivedValidation.sessionKey, "session-t4"); + assert.strictEqual(store.rowCount, 1, "1 row remaining after deletion"); + }); + + // -------------------------------------------------------------------------- + // T5: Callback is optional — no error if omitted + // -------------------------------------------------------------------------- + it("T5: callback is optional — no error if omitted even on mismatch", async () => { + const embedder = makeDeterministicEmbedder(); + // 2 candidates that survive batchDedup, with dropLastN=1 + const llm = makeLlm([ + { + category: "events", + abstract: "User attended quarterly business review meeting with the team lead", + overview: "Meeting attendance", + content: "The quarterly business review meeting was attended by the user along with their direct team members to discuss ongoing project status and future planning initiatives.", + }, + { + category: "events", + abstract: "User participated in a formal code review session with constructive feedback", + overview: "Code review participation", + content: "The user actively participated in a formal code review session where they provided constructive feedback on pull request implementations and discussed architectural decision implications.", + }, + ]); + const store = makeStore({ initialCount: 0, dropLastN: 1 }); + const extractor = makeExtractor(embedder, llm, store); + + // Should NOT throw even though mismatch occurs and callback is absent + // Note: validationMismatch is NOT written to stats (exposed only via callback) + await extractor.extractAndPersist( + "User said: I attended the quarterly business review and participated in a code review", + "session-t5", + {} // no callback + ); + assert.strictEqual(store.rowCount, 1, "1 row written despite mismatch (dropLastN=1)"); + }); + + // -------------------------------------------------------------------------- + // T6: Multiple extractions — independent validation state + // -------------------------------------------------------------------------- + it("T6: multiple extractions each get independent validation", async () => { + const embedder = makeDeterministicEmbedder(); + + // First extraction: normal (no mismatch) + const llm1 = makeLlm([{ + category: "events", + abstract: "User attended quarterly business review meeting with the team lead", + overview: "Meeting", + content: "The quarterly business review meeting was attended by the user along with their direct team members to discuss ongoing project status and future planning initiatives.", + }]); + const store1 = makeStore({ initialCount: 0 }); + const extractor1 = makeExtractor(embedder, llm1, store1); + store1.reset(); + + const validations = []; + + const stats1 = await extractor1.extractAndPersist( + "User said: I attended the quarterly business review", + "session-multi-1", + { onExtractionValidationFailed(v) { validations.push(v); } } + ); + + assert.strictEqual(stats1.validationMismatch, undefined, "first: validationMismatch undefined"); + assert.strictEqual(validations.length, 0, "first: no callback fired"); + + // Second extraction: partial write failure (dropLastN=1) → mismatch=1 + const llm2 = makeLlm([ + { + category: "events", + abstract: "User attended quarterly business review meeting with the team lead", + overview: "Meeting", + content: "The quarterly business review meeting was attended by the user along with their direct team members to discuss ongoing project status and future planning initiatives.", + }, + { + category: "events", + abstract: "User participated in a formal code review session with constructive feedback", + overview: "Code review", + content: "The user actively participated in a formal code review session where they provided constructive feedback on pull request implementations and discussed architectural decision implications.", + }, + ]); + const store2 = makeStore({ initialCount: 0, dropLastN: 1 }); + const extractor2 = makeExtractor(embedder, llm2, store2); + + const stats2 = await extractor2.extractAndPersist( + "User said: I attended a quarterly meeting and participated in a code review", + "session-multi-2", + { onExtractionValidationFailed(v) { validations.push(v); } } + ); + + // Second extraction: 2 candidates, dropLastN=1 → actual=1, expected=2, mismatch=1 + // Note: validationMismatch is NOT written to stats (removed from public API) + // Only the callback receives the mismatch + assert.strictEqual(validations.length, 1, "second: callback fired once"); + assert.strictEqual(validations[0].sessionKey, "session-multi-2"); + assert.strictEqual(validations[0].mismatch, 1); + }); + +}); diff --git a/test/preference-slots.test.mjs b/test/preference-slots.test.mjs index 1849ef31..4e20fabe 100644 --- a/test/preference-slots.test.mjs +++ b/test/preference-slots.test.mjs @@ -125,6 +125,7 @@ function makeGuardExtractor({ vectorSearchResults, onDedupCalled }) { stored.push(...entries); return entries; }, + async count() { return stored.length; }, }; const embedder = { async embed() { @@ -241,6 +242,7 @@ test("dedup guard: non-preference category -> skips guard, goes to LLM", async ( }, async store() {}, async bulkStore() { return []; }, + async count() { return 0; }, }; const embedder = { async embed() { return [0.1, 0.2, 0.3]; }, diff --git a/test/smart-extractor-batch-embed.test.mjs b/test/smart-extractor-batch-embed.test.mjs index ea0120b0..634f291e 100644 --- a/test/smart-extractor-batch-embed.test.mjs +++ b/test/smart-extractor-batch-embed.test.mjs @@ -99,6 +99,9 @@ function makeStore() { async getById(_id, _scopeFilter) { return null; }, + async count() { + return entries.length; + }, get entries() { return [...entries]; }, diff --git a/test/smart-extractor-bulk-store-edge-cases.test.mjs b/test/smart-extractor-bulk-store-edge-cases.test.mjs index 54153158..2eb5ac50 100644 --- a/test/smart-extractor-bulk-store-edge-cases.test.mjs +++ b/test/smart-extractor-bulk-store-edge-cases.test.mjs @@ -61,6 +61,7 @@ class MockStore { async vectorSearch() { return []; } async getById() { return null; } + async count() { return 0; } } // ============================================================ diff --git a/test/smart-extractor-bulk-store.test.mjs b/test/smart-extractor-bulk-store.test.mjs index cef140c5..b39c06f0 100644 --- a/test/smart-extractor-bulk-store.test.mjs +++ b/test/smart-extractor-bulk-store.test.mjs @@ -66,6 +66,7 @@ class MockStore { async vectorSearch() { return []; } async getById() { return null; } + async count() { return 0; } } // ============================================================ diff --git a/test/smart-extractor-scope-filter.test.mjs b/test/smart-extractor-scope-filter.test.mjs index adef26da..488fb12f 100644 --- a/test/smart-extractor-scope-filter.test.mjs +++ b/test/smart-extractor-scope-filter.test.mjs @@ -13,6 +13,7 @@ function makeExtractor(scopeFilters) { }, async store() {}, async bulkStore() {}, + async count() { return 0; }, }; const embedder = {