From 0be5d002865397ca8c8d62f20a8d7a2527904861 Mon Sep 17 00:00:00 2001 From: James Lin Date: Tue, 5 May 2026 00:27:07 +0800 Subject: [PATCH 1/6] feat(chunker): Phase 1 AST-based semantic chunking for JS/TS/Python (issue #692) Adds tree-sitter-based chunking to prevent splitting code mid-function. Changes: - detectCodeLanguage(): identify JS/TS/Python/Go/Rust from code content - astChunk(): split code at declaration boundaries (function/class/method) - smartChunk(): route through astChunk when file is detected as code - 20 unit tests covering all destructive split scenarios Test: node --test test/ast-code-chunking.test.mjs (20/20 pass) Config: astAwareCodeSplit defaults to true --- docs/issue-692-ast-chunking-design.md | 256 ++++++++++++++++++++++ package-lock.json | 74 ++++++- package.json | 5 +- scripts/ci-test-manifest.mjs | 2 + src/chunker.ts | 242 +++++++++++++++++++++ test/ast-code-chunking.test.mjs | 296 ++++++++++++++++++++++++++ 6 files changed, 873 insertions(+), 2 deletions(-) create mode 100644 docs/issue-692-ast-chunking-design.md create mode 100644 test/ast-code-chunking.test.mjs diff --git a/docs/issue-692-ast-chunking-design.md b/docs/issue-692-ast-chunking-design.md new file mode 100644 index 00000000..11653348 --- /dev/null +++ b/docs/issue-692-ast-chunking-design.md @@ -0,0 +1,256 @@ +# Issue #692 — AST-based Semantic Chunking for Code Blocks + +**Status:** Designed +**Repo:** `memory-lancedb-pro` +**Created:** 2026-05-04 +**Source:** https://github.com/CortexReach/memory-lancedb-pro/issues/692 + +--- + +## Problem Summary + +`chunker.ts` 的 `smartChunk()` 使用純 character-based split,split 邏輯在 `findSplitEnd()`: +- 先找 sentence ending(`.!!?`) +- 找不到 → 找 `\n` +- 找不到 → 找 whitespace + +這對自然語言有效,但對程式碼是災難。JS/TS 函式結尾是 `;` 和 `}`,兩者都不在 target set,導致 function declaration 在 `{` / `}` 之間被隨機切斷。 + +**真實破壞案例:** +``` +Chunk A(~3800字): +"async function handleUserLogin(userId: string, credentials: LoginCredentials): Promise {\n" +" const user = await 
this.userRepository.findById(userId);\n" +" if (!user) {\n" +" return { success: false, error: 'USER_NOT_FOUND' };" + +Chunk B(~900字): +" }\n" +" const passwordValid = await this.verifyPassword(...);" // verifyPassword 跨 Chunk A 和 B +``` + +**問題:** +- Chunk A 結尾在 `return { success: false, error: 'USER_NOT_FOUND' };` — 不完整的 if-block +- Chunk B 開頭是 `}` — 脫離語境的 closing brace +- `verifyPassword` 函式定義被切成兩段 + +--- + +## Verified Facts (gitnexus + source reading) + +### Call Graph (gitnexus verified) +``` +smartChunk (chunker.ts:263-281) + ├─ calls: getCjkRatio (174-183), chunkDocument (194-255) + │ + └─ called by: + ├─ embedSingle (embedder.ts) + ├─ embedMany (embedder.ts) + ├─ testCjkAwareChunkSizing (test/cjk-recursion-regression.test.mjs) + └─ testSmallContextChunking (test/cjk-recursion-regression.test.mjs) + +chunkDocument (chunker.ts:194-255) + ├─ calls: findSplitEnd (97-143), sliceTrimWithIndices (146-163) + └─ called by: smartChunk + +findSplitEnd (chunker.ts:97-143) ← 問題根因所在 +``` + +### Existing Coverage +- **測試:** 只有 `test/cjk-recursion-regression.test.mjs` 呼叫 `smartChunk`,**沒有任何專門測試 chunker 破壞案例的測試檔案** +- **依賴:** 無 tree-sitter +- **Config:** `maxChunkSize`, `overlapSize`, `minChunkSize`, `semanticSplit`, `maxLinesPerChunk` + +--- + +## Solution: astChunk() + +### Architecture + +``` +smartChunk(text) + ├─ detectCodeLanguage(text) === null → chunkDocument() [現有 character split] + └─ detectCodeLanguage(text) === 'js'/'ts' → astChunk(text, lang, config) + === 'py' → astChunk(text, 'python', config) + === 其他 → chunkDocument() [fallback] +``` + +### 1. `detectCodeLanguage(text) → CodeLanguage | null` + +取前 200 字做偵測: + +| 語言 | Pattern | +|------|---------| +| JS/TS | `/\b(function\|const\s\|let\s\|var\s\|=>\|import\s\|export\s\|interface\s\|type\s\|class\s)/` | +| Python | `/\bdef\s\|class\s\|import\s\|from\s\|print\(/` | +| Go | `/\bfunc\s\|package\s\|import\s"/` | +| Rust | `/\bfn\s\|impl\s\|pub\s\|let\s+mut\s/` | + +### 2. 
`astChunk(code, language, config) → ChunkResult` + +```typescript +import Parser from 'tree-sitter'; +import JavaScript from 'tree-sitter-javascript'; + +export function astChunk( + code: string, + language: CodeLanguage, + config: ChunkerConfig +): ChunkResult { + const parser = new Parser(); + switch (language) { + case 'javascript': + case 'typescript': + parser.setLanguage(JavaScript); + break; + case 'python': + parser.setLanguage(Python); + break; + default: + return chunkDocument(code, config); + } + + const tree = parser.parse(code); + const chunks: string[] = []; + const metadatas: ChunkMetadata[] = []; + + // Walk top-level nodes + const root = tree.rootNode; + for (const child of root.children) { + if (!isDeclarationNode(child)) continue; + const text = code.slice(child.startIndex, child.endIndex); + if (text.length <= config.maxChunkSize) { + chunks.push(text); + metadatas.push({ startIndex: child.startIndex, endIndex: child.endIndex, length: text.length }); + } else { + // Sub-split within this declaration at statement level + const subResult = subChunk(text, config); + chunks.push(...subResult.chunks); + metadatas.push(...subResult.metadatas); + } + } + + return { chunks, metadatas, totalOriginalLength: code.length, chunkCount: chunks.length }; +} +``` + +### 3. Supported Node Types (Phase 1) + +| 語言 | P0 節點 | +|------|---------| +| JS/TS | `function_declaration`, `arrow_function`, `class_declaration`, `method_definition`, `export_statement`, `interface_declaration`, `type_alias_declaration`, `lexical_declaration` | +| Python | `function_definition`, `class_definition`, `decorated_definition` | +| Go | `function_declaration`, `method_declaration` (P2) | +| Rust | `function_item`, `impl_item` (P2) | + +### 4. Config Extension + +```typescript +interface ChunkerConfig { + // ... 現有五個欄位 ... + astAwareCodeSplit?: boolean; // NEW: default true +} +``` + +### 5. 
Dependency Changes + +```json +{ + "dependencies": { + "tree-sitter": "^0.25.0", + "tree-sitter-javascript": "^0.25.0", + "tree-sitter-python": "^0.25.0" + } +} +``` + +--- + +## Files to Change + +| 檔案 | 變更 | +|------|------| +| `src/chunker.ts` | + `detectCodeLanguage()`, + `astChunk()`, + `subChunk()`, 修改 `smartChunk()` 路由, + `astAwareCodeSplit` config | +| `test/ast-code-chunking.test.mjs` | **全新建立**(從破壞案例反轉)| +| `package.json` | + tree-sitter, tree-sitter-javascript, tree-sitter-python | + +--- + +## Tests (New File) + +```typescript +describe('AST-aware code chunking', () => { + it('should keep { and } balanced in every chunk', () => { + const code = `async function handleUserLogin(userId: string) { + const user = await this.userRepository.findById(userId); + if (!user) { return { success: false }; } + const session = await this.createSession(user); + return { success: true, session }; +} +async function verifyPassword(input: string): Promise<boolean> { + return bcrypt.compare(input, this.hash); +}`; + const result = smartChunk(code, 'jina-embeddings-v5'); + for (const chunk of result.chunks) { + const opens = (chunk.match(/{/g) || []).length; + const closes = (chunk.match(/}/g) || []).length; + expect(opens).toBe(closes); + } + }); + + it('should not split function mid-body', () => { + const result = smartChunk(code, 'jina-embeddings-v5'); + const hasMiddleOfFunction = result.chunks.some(c => + c.startsWith('}') || c.endsWith('{') + ); + expect(hasMiddleOfFunction).toBe(false); + }); + + it('should keep complete function as one chunk', () => { + const result = smartChunk(code, 'jina-embeddings-v5'); + const verifyFn = result.chunks.find(c => c.includes('verifyPassword')); + expect(verifyFn).toBeDefined(); + expect(verifyFn).toContain('bcrypt.compare'); + expect(verifyFn).not.toContain('handleUserLogin'); + }); +}); +``` + +--- + +## Phase Plan + +``` +Phase 1(P0 — MVP): + ├─ detectCodeLanguage()(JS/TS/Python) + ├─ astChunk() — JS/TS only + ├─ astChunk() — Python + ├─ Unit 
tests(破壞案例 → 通過案例) + └─ Config: astAwareCodeSplit default = true + +Phase 2(P1): + ├─ Sub-split within oversized declarations(statement level) + ├─ Go、Rust support + └─ Benchmark: 向量品質 vs. character split + +Phase 3(P2): + └─ Embedding quality evaluation(問答對比) +``` + +--- + +## Q&A + +| Q | A | +|---|---| +| tree-sitter 值得嗎? | **值得**。~1MB runtime,sub-ms parse,能處理巢狀結構/decorator/subclass,比 regex 精準一個數量級。 | +| 預設開? | **預設開**。破壞案例太明確,等使用者手動開等於功能永遠不被用。`astAwareCodeSplit: false` 保留給需要復現舊行為的測試。 | +| 非主流語言? | **Phase 1 fallback**。現有 sentence-ending split 對自然語言有效;非主流語言佔比低,Phase 1 fallback 合理。 | + +--- + +## Reference + +- Issue: https://github.com/CortexReach/memory-lancedb-pro/issues/692 +- Reference impl: `zilliztech/claude-context` ast-splitter.ts +- Existing chunker: `src/chunker.ts` (284 lines) diff --git a/package-lock.json b/package-lock.json index ee4ecef2..c5d6f5d2 100644 --- a/package-lock.json +++ b/package-lock.json @@ -14,7 +14,10 @@ "apache-arrow": "18.1.0", "json5": "^2.2.3", "openai": "^6.21.0", - "proper-lockfile": "^4.1.2" + "proper-lockfile": "^4.1.2", + "tree-sitter": "^0.25.0", + "tree-sitter-javascript": "^0.25.0", + "tree-sitter-python": "^0.25.0" }, "devDependencies": { "commander": "^14.0.0", @@ -430,6 +433,26 @@ "integrity": "sha512-TwuEnCnxbc3rAvhf/LbG7tJUDzhqXyFnv3dtzLOPgCG/hODL7WFnsbwktkD7yUV0RrreP/l1PALq/YSg6VvjlA==", "license": "MIT" }, + "node_modules/node-addon-api": { + "version": "8.7.0", + "resolved": "https://registry.npmjs.org/node-addon-api/-/node-addon-api-8.7.0.tgz", + "integrity": "sha512-9MdFxmkKaOYVTV+XVRG8ArDwwQ77XIgIPyKASB1k3JPq3M8fGQQQE3YpMOrKm6g//Ktx8ivZr8xo1Qmtqub+GA==", + "license": "MIT", + "engines": { + "node": "^18 || ^20 || >= 21" + } + }, + "node_modules/node-gyp-build": { + "version": "4.8.4", + "resolved": "https://registry.npmjs.org/node-gyp-build/-/node-gyp-build-4.8.4.tgz", + "integrity": "sha512-LA4ZjwlnUblHVgq0oBF3Jl/6h/Nvs5fzBLwdEF4nuxnFdsfajde4WfxtJr3CaiH+F6ewcIB/q4jQ4UzPyid+CQ==", + "license": "MIT", + "bin": 
{ + "node-gyp-build": "bin.js", + "node-gyp-build-optional": "optional.js", + "node-gyp-build-test": "build-test.js" + } + }, "node_modules/openai": { "version": "6.22.0", "resolved": "https://registry.npmjs.org/openai/-/openai-6.22.0.tgz", @@ -517,6 +540,55 @@ "node": ">=12.17" } }, + "node_modules/tree-sitter": { + "version": "0.25.0", + "resolved": "https://registry.npmjs.org/tree-sitter/-/tree-sitter-0.25.0.tgz", + "integrity": "sha512-PGZZzFW63eElZJDe/b/R/LbsjDDYJa5UEjLZJB59RQsMX+fo0j54fqBPn1MGKav/QNa0JR0zBiVaikYDWCj5KQ==", + "hasInstallScript": true, + "license": "MIT", + "dependencies": { + "node-addon-api": "^8.3.0", + "node-gyp-build": "^4.8.4" + } + }, + "node_modules/tree-sitter-javascript": { + "version": "0.25.0", + "resolved": "https://registry.npmjs.org/tree-sitter-javascript/-/tree-sitter-javascript-0.25.0.tgz", + "integrity": "sha512-1fCbmzAskZkxcZzN41sFZ2br2iqTYP3tKls1b/HKGNPQUVOpsUxpmGxdN/wMqAk3jYZnYBR1dd/y/0avMeU7dw==", + "hasInstallScript": true, + "license": "MIT", + "dependencies": { + "node-addon-api": "^8.3.1", + "node-gyp-build": "^4.8.4" + }, + "peerDependencies": { + "tree-sitter": "^0.25.0" + }, + "peerDependenciesMeta": { + "tree-sitter": { + "optional": true + } + } + }, + "node_modules/tree-sitter-python": { + "version": "0.25.0", + "resolved": "https://registry.npmjs.org/tree-sitter-python/-/tree-sitter-python-0.25.0.tgz", + "integrity": "sha512-eCmJx6zQa35GxaCtQD+wXHOhYqBxEL+bp71W/s3fcDMu06MrtzkVXR437dRrCrbrDbyLuUDJpAgycs7ncngLXw==", + "hasInstallScript": true, + "license": "MIT", + "dependencies": { + "node-addon-api": "^8.5.0", + "node-gyp-build": "^4.8.4" + }, + "peerDependencies": { + "tree-sitter": "^0.25.0" + }, + "peerDependenciesMeta": { + "tree-sitter": { + "optional": true + } + } + }, "node_modules/tslib": { "version": "2.8.1", "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz", diff --git a/package.json b/package.json index fbcb9d98..c091d619 100644 --- a/package.json +++ b/package.json @@ -43,7 +43,10 @@ 
"apache-arrow": "18.1.0", "json5": "^2.2.3", "openai": "^6.21.0", - "proper-lockfile": "^4.1.2" + "proper-lockfile": "^4.1.2", + "tree-sitter": "^0.25.0", + "tree-sitter-javascript": "^0.25.0", + "tree-sitter-python": "^0.25.0" }, "openclaw": { "extensions": [ diff --git a/scripts/ci-test-manifest.mjs b/scripts/ci-test-manifest.mjs index fc6435dc..3c283735 100644 --- a/scripts/ci-test-manifest.mjs +++ b/scripts/ci-test-manifest.mjs @@ -48,6 +48,8 @@ export const CI_TEST_MANIFEST = [ { group: "core-regression", runner: "node", file: "test/temporal-awareness.test.mjs", args: ["--test"] }, // Issue #598 regression tests { group: "core-regression", runner: "node", file: "test/store-serialization.test.mjs" }, + // Issue #692: AST-based semantic chunking + { group: "core-regression", runner: "node", file: "test/ast-code-chunking.test.mjs" }, { group: "core-regression", runner: "node", file: "test/access-tracker-retry.test.mjs" }, { group: "core-regression", runner: "node", file: "test/embedder-cache.test.mjs" }, // Issue #629 batch embedding fix diff --git a/src/chunker.ts b/src/chunker.ts index 8bb4dee6..a6aca426 100644 --- a/src/chunker.ts +++ b/src/chunker.ts @@ -37,6 +37,8 @@ export interface ChunkerConfig { semanticSplit: boolean; /** Max lines per chunk before we try to split earlier on a line boundary. */ maxLinesPerChunk: number; + /** Use AST-aware splitting for code blocks (default: true). */ + astAwareCodeSplit?: boolean; } // Common embedding context limits (provider/model specific). 
These are typically @@ -188,6 +190,237 @@ function getCjkRatio(text: string): number { const CJK_CHAR_TOKEN_DIVISOR = 2.5; const CJK_RATIO_THRESHOLD = 0.3; +// ============================================================================ +// AST-aware Code Chunking +// ============================================================================ + +export type CodeLanguage = 'javascript' | 'typescript' | 'python' | 'go' | 'rust'; + +const CODE_LANGUAGE_PATTERNS: Array<{ pattern: RegExp; lang: CodeLanguage }> = [ + // Python: must check before JS (def/class are specific) + { + pattern: /\b(def\s|class\s|import\s|from\s|async\s+def\s|print\()/, + lang: 'python', + }, + // Go: func and package keywords + { + pattern: /\b(func\s|package\s|import\s")/, + lang: 'go', + }, + // Rust: fn/impl/pub are distinct + { + pattern: /\bfn\s|impl\s|pub\s|let\s+mut\s/, + lang: 'rust', + }, + // TypeScript: interface / type alias / : type annotations (check before JS 'function') + { + pattern: /\b(interface\s|type\s+|:\s*(?:string|number|boolean|unknown|never|any|void|object|Error|Promise|Record|Array|Map|Set)\b)/, + lang: 'typescript', + }, + // JavaScript / TypeScript: function, const/let/var, arrow, import/export, class + { + pattern: /\b(function|const\s|let\s|var\s|=>|import\s|export\s|class\s)/, + lang: 'javascript', + }, +]; + +/** + * Detect if text is code and return the language, or null if not code. + * Uses only the first 400 chars to avoid being misled by comments. 
+ */ +export function detectCodeLanguage(text: string): CodeLanguage | null { + const sample = text.slice(0, 400); + for (const { pattern, lang } of CODE_LANGUAGE_PATTERNS) { + if (pattern.test(sample)) return lang; + } + return null; +} + +// Supported top-level declaration node types per language +const JS_DECLARATION_TYPES = new Set([ + 'function_declaration', + 'class_declaration', + 'method_definition', + 'arrow_function', + 'export_statement', + 'export_default_declaration', + 'interface_declaration', + 'type_alias_declaration', + 'lexical_declaration', // const/let declarations + 'variable_declaration', +]); + +const PYTHON_DECLARATION_TYPES = new Set([ + 'function_definition', + 'class_definition', + 'decorated_definition', +]); + +function isDeclarationNode(node: { type: string }, lang: CodeLanguage): boolean { + if (lang === 'javascript' || lang === 'typescript') { + return JS_DECLARATION_TYPES.has(node.type); + } + if (lang === 'python') return PYTHON_DECLARATION_TYPES.has(node.type); + return false; +} + +/** + * Sub-split an oversized declaration at the statement level. + * Falls back to chunkDocument for the sub-split logic. + */ +function subChunk(text: string, config: ChunkerConfig): ChunkResult { + // For now, fall back to the character-based chunker within an oversized declaration. + // This preserves the existing behavior for sub-chunks while ensuring top-level + // declarations (functions/classes) are kept intact. + return chunkDocument(text, config); +} + +/** + * AST-aware chunker for code. Parses the code with tree-sitter and splits + * on top-level declaration boundaries (function, class, etc.) instead of + * arbitrary character positions. + * + * NOTE: This function is synchronous to match the sync signature of smartChunk. + * tree-sitter is loaded via require() with a try-catch fallback. 
+ */ +export function astChunk( + code: string, + language: CodeLanguage, + config: ChunkerConfig +): ChunkResult { + // Attempt to load tree-sitter and language grammars + let LanguageMap: Record; + // tree-sitter exports Parser as the default export (module.exports = Parser) + // eslint-disable-next-line @typescript-eslint/no-var-requires + let TreeSitterParser: any; + + try { + TreeSitterParser = require('tree-sitter'); + + if (language === 'javascript' || language === 'typescript') { + // eslint-disable-next-line @typescript-eslint/no-var-requires + const JavaScript = require('tree-sitter-javascript'); + LanguageMap = { javascript: JavaScript, typescript: JavaScript }; + } else if (language === 'python') { + // eslint-disable-next-line @typescript-eslint/no-var-requires + const Python = require('tree-sitter-python'); + LanguageMap = { python: Python }; + } else { + // Unsupported language — fall back + return chunkDocument(code, config); + } + } catch { + // tree-sitter not installed — fall back to character-based chunking + return chunkDocument(code, config); + } + + const parser = new TreeSitterParser(); + const chunks: string[] = []; + const metadatas: ChunkMetadata[] = []; + + // Set language on the parser + let languageSet = false; + for (const [, langModule] of Object.entries(LanguageMap)) { + try { + parser.setLanguage(langModule); + languageSet = true; + break; + } catch { + // try next language + } + } + + if (!languageSet) { + return chunkDocument(code, config); + } + + let tree: any; + try { + tree = parser.parse(code); + } catch { + return chunkDocument(code, config); + } + + const root = tree.rootNode; + + // If there are ERROR nodes at the top level, the language parser likely does not + // support this syntax (e.g., TypeScript interface parsed by tree-sitter-javascript). + // Fall back to chunkDocument to avoid producing broken/incomplete chunks. 
+ const hasErrorNodes = root.children.some(c => c.type === 'ERROR'); + if (hasErrorNodes) { + return chunkDocument(code, config); + } + + // Collect non-declaration content (comments, imports, etc.) that would otherwise be lost. + // These are prepended to the next declaration chunk to preserve no-content-left-behind semantics. + let pendingNonDecl = ''; + + // Walk top-level children + for (const child of root.children) { + // Skip non-named nodes and ERROR nodes + if (!child.type || child.type === 'ERROR') continue; + + if (!isDeclarationNode(child, language)) { + // Collect non-declaration content (comments, imports, exports, etc.) + const text = code.slice(child.startIndex, child.endIndex); + if (text.length > 0) { + pendingNonDecl += (pendingNonDecl.length > 0 ? '\n' : '') + text; + } + continue; + } + + const text = code.slice(child.startIndex, child.endIndex); + + if (text.length === 0) continue; + + // Prepend any pending non-declaration content to this declaration chunk + const fullText = pendingNonDecl.length > 0 ? pendingNonDecl + '\n' + text : text; + pendingNonDecl = ''; // reset + + if (fullText.length <= config.maxChunkSize) { + chunks.push(fullText); + metadatas.push({ + startIndex: child.startIndex, + endIndex: child.endIndex, + length: fullText.length, + }); + } else { + // Oversized declaration with prepended content. + // We accept that this chunk may exceed maxChunkSize — splitting + // mid-declaration would break { } balance (Issue #692). + // Sub-splitting at statement level is Phase 2 work. + chunks.push(fullText); + metadatas.push({ + startIndex: child.startIndex, + endIndex: child.endIndex, + length: fullText.length, + }); + } + } + + // If there is trailing non-declaration content (e.g., trailing comments with no following decl), + // emit it as its own chunk (fall back to chunkDocument to handle sizing). 
+ if (pendingNonDecl.length > 0) { + const trailing = chunkDocument(pendingNonDecl, config); + for (let i = 0; i < trailing.chunks.length; i++) { + chunks.push(trailing.chunks[i]); + metadatas.push(trailing.metadatas[i]); + } + } + + // If we got nothing (e.g. empty file, parse error), fall back + if (chunks.length === 0) { + return chunkDocument(code, config); + } + + return { + chunks, + metadatas, + totalOriginalLength: code.length, + chunkCount: chunks.length, + }; +} + // ============================================================================ // Chunking Core // ============================================================================ @@ -276,8 +509,17 @@ export function smartChunk(text: string, embedderModel?: string): ChunkResult { minChunkSize: Math.max(100, Math.floor(base * 0.1 / divisor)), semanticSplit: true, maxLinesPerChunk: 50, + astAwareCodeSplit: true, }; + // AST-aware code path: only activate when explicitly enabled + if (config.astAwareCodeSplit === true) { + const lang = detectCodeLanguage(text); + if (lang !== null) { + return astChunk(text, lang, config); + } + } + return chunkDocument(text, config); } diff --git a/test/ast-code-chunking.test.mjs b/test/ast-code-chunking.test.mjs new file mode 100644 index 00000000..6d0fa2ca --- /dev/null +++ b/test/ast-code-chunking.test.mjs @@ -0,0 +1,296 @@ +/** + * AST-aware Code Chunking Tests (Issue #692) + * + * Verifies that code declarations (functions, classes) are NOT split mid- + * declaration, which was breaking { } balance when the old character-based + * splitter cut through the middle of a function body. 
+ */ + +import { describe, it, mock, beforeEach } from 'node:test'; +import assert from 'node:assert/strict'; +import jitiFactory from 'jiti'; + +const jiti = jitiFactory(import.meta.url, { interopDefault: true }); +const { detectCodeLanguage, astChunk, smartChunk, chunkDocument, DEFAULT_CHUNKER_CONFIG } = jiti('../src/chunker.ts'); + +// ============================================================================ +// detectCodeLanguage +// ============================================================================ + +describe('detectCodeLanguage', () => { + it('detects JavaScript function', () => { + const code = 'async function handleUserLogin(userId, password) {'; + assert.equal(detectCodeLanguage(code), 'javascript'); + }); + + it('detects TypeScript interface', () => { + const code = 'interface UserProfile { name: string; age: number; }'; + assert.equal(detectCodeLanguage(code), 'typescript'); + }); + + it('detects Python function', () => { + const code = 'def verify_password(password: str, hashed: bytes) -> bool:'; + assert.equal(detectCodeLanguage(code), 'python'); + }); + + it('detects Go function', () => { + const code = 'func handleLogin(w http.ResponseWriter, r *http.Request) {'; + assert.equal(detectCodeLanguage(code), 'go'); + }); + + it('detects Rust function', () => { + const code = 'fn verify_password(password: &str, hash: &str) -> bool {'; + assert.equal(detectCodeLanguage(code), 'rust'); + }); + + it('returns null for plain text', () => { + const text = 'This is a plain English sentence with no code markers.'; + assert.equal(detectCodeLanguage(text), null); + }); + + it('returns null for Markdown prose', () => { + const md = '# Heading\n\nThis is a paragraph with **bold** text.'; + assert.equal(detectCodeLanguage(md), null); + }); + + it('uses only first 400 chars to avoid comment noise', () => { + // Short comment so 'function' appears within first 400 chars of the sample + const commentLine = '// This is a comment\n'; // 20 chars + const code = 
commentLine.repeat(15) + 'function foo() {}'; // ~300 + function + assert.equal(detectCodeLanguage(code), 'javascript'); + }); +}); + +// ============================================================================ +// Brace balance helper +// ============================================================================ + +/** Count net open braces inside a string. */ +function braceDelta(s) { + let d = 0; + for (const ch of s) { + if (ch === '{') d++; + else if (ch === '}') d--; + } + return d; +} + +/** Check all chunks are brace-balanced. */ +function assertBraceBalanced(chunks, label) { + const deltas = chunks.map(c => braceDelta(c)); + const total = deltas.reduce((a, b) => a + b, 0); + assert.equal(total, 0, `${label}: unbalanced braces across chunks (net=${total}, deltas=${JSON.stringify(deltas)})`); + for (let i = 0; i < deltas.length; i++) { + assert(deltas[i] >= 0, + `${label}: chunk[${i}] closes more braces than it opens (delta=${deltas[i]})`); + } +} + +// ============================================================================ +// Issue #692 — core destructive cases +// ============================================================================ + +describe('Issue #692: code functions must not be split mid-declaration', () => { + + it('verifies that a simple async function is kept whole', () => { + const code = `async function verifyPassword(password, hash) { + const match = await bcrypt.compare(password, hash); + return match; +}`; + + // Very small maxChunkSize to force splitting — old splitter would cut mid-function + const config = { ...DEFAULT_CHUNKER_CONFIG, maxChunkSize: 60, minChunkSize: 10, semanticSplit: false }; + const result = astChunk(code, 'javascript', config); + + // Function body should not be split mid-declaration + const splitInsideFunction = result.chunks.some(chunk => { + // Should not have "{" without corresponding "}" + const d = braceDelta(chunk); + return d > 0; // opens braces but never closes + }); + 
assert.ok(!splitInsideFunction, 'Should not split inside a function declaration'); + assertBraceBalanced(result.chunks, 'verifyPassword'); + }); + + it('verifies that a long function is NOT split mid-function (maxChunkSize < function length)', () => { + // This function is ~250 chars — set maxChunkSize=120 to force the issue. + // Oversized functions are kept as ONE atomic chunk (no mid-function split). + const code = `async function handleUserLogin(userId, password) { + const user = await db.users.findOne({ id: userId }); + if (!user) throw new Error('User not found'); + const match = await bcrypt.compare(password, user.hash); + return match; +}`; + + const config = { ...DEFAULT_CHUNKER_CONFIG, maxChunkSize: 120, minChunkSize: 40, semanticSplit: false }; + const result = astChunk(code, 'javascript', config); + + assertBraceBalanced(result.chunks, 'handleUserLogin'); + // Should be 1 chunk — entire function kept intact + assert.ok(result.chunks.length === 1, `Expected 1 chunk (entire function), got ${result.chunks.length}`); + }); + + it('verifies that multiple small functions are each kept whole', () => { + const code = `async function verifyPassword(password, hash) { + return await bcrypt.compare(password, hash); +} + +async function hashPassword(password) { + return await bcrypt.hash(password, 10); +} + +export async function createUser(name, email, password) { + const hash = await hashPassword(password); + return await db.users.create({ name, email, hash }); +}`; + + const config = { ...DEFAULT_CHUNKER_CONFIG, maxChunkSize: 150, minChunkSize: 40, semanticSplit: false }; + const result = astChunk(code, 'javascript', config); + + assertBraceBalanced(result.chunks, 'multiple functions'); + // All three functions should appear intact in some chunk + assert.ok(result.chunks.some(c => c.includes('function verifyPassword')), 'verifyPassword missing'); + assert.ok(result.chunks.some(c => c.includes('function hashPassword')), 'hashPassword missing'); + 
assert.ok(result.chunks.some(c => c.includes('function createUser')), 'createUser missing'); + }); + + it('smartChunk: entire JavaScript file with functions stays brace-balanced', () => { + const code = `const SPEC = { + name: 'auth', + version: '1.0.0', +}; + +async function login(email, password) { + const user = await db.findUser(email); + const ok = await bcrypt.compare(password, user.hash); + if (!ok) throw new Error('Invalid credentials'); + return { token: signToken(user.id) }; +} + +async function logout(token) { + invalidateToken(token); +}`; + + const result = smartChunk(code, 'text-embedding-3-small'); + assertBraceBalanced(result.chunks, 'smartChunk JS'); + }); + + it('smartChunk: Python function stays syntactically coherent', () => { + const code = `def verify_password(password: str, hashed: bytes) -> bool: + return pwd_context.verify(password, hashed) + +def hash_password(password: str) -> str: + return pwd_context.hash(password)`; + + const result = smartChunk(code, 'text-embedding-3-small'); + assert.ok(result.chunks.length >= 1, 'Should produce at least one chunk'); + // Python chunks should contain complete function definitions + assert.ok(result.chunks.every(c => c.trim().length > 0), 'No empty chunks'); + }); +}); + +// ============================================================================ +// astChunk — fallback & edge cases +// ============================================================================ + +describe('astChunk fallback behavior', () => { + + it('falls back to chunkDocument when tree-sitter throws', () => { + // Pass an empty string to force parse error + const code = ''; + const config = { ...DEFAULT_CHUNKER_CONFIG, maxChunkSize: 50 }; + const result = astChunk(code, 'javascript', config); + // Should return a valid ChunkResult (fallback path) + assert.ok('chunks' in result); + assert.ok('chunkCount' in result); + }); + + it('returns chunkDocument result when language is unsupported', () => { + const code = 'fn main() {}'; 
// not JS/TS/Python + const config = { ...DEFAULT_CHUNKER_CONFIG, maxChunkSize: 50 }; + const result = astChunk(code, 'rust', config); + // Rust is not yet supported in astChunk — falls back + assert.ok('chunks' in result); + }); + + it('handles an oversized single declaration as one atomic chunk (brace-balanced)', () => { + // A very long function that exceeds maxChunkSize — should stay as ONE chunk + const body = ' return x + y;\n'.repeat(200); + const code = `function processData(x, y) {\n${body}}`; + + const config = { ...DEFAULT_CHUNKER_CONFIG, maxChunkSize: 200, minChunkSize: 50, semanticSplit: false }; + const result = astChunk(code, 'javascript', config); + + // Should be 1 chunk — entire function kept as one + assert.ok(result.chunks.length === 1, `Expected 1 chunk, got ${result.chunks.length}`); + assertBraceBalanced(result.chunks, 'oversized function atomic chunk'); + }); +}); + +// ============================================================================ +// smartChunk — non-code text unchanged +// ============================================================================ + +describe('smartChunk preserves non-code behavior', () => { + + it('passes plain English text to chunkDocument (not astChunk)', () => { + const text = 'This is a plain English paragraph. It has sentences. They end with periods. 
'.repeat(30); + + const result = smartChunk(text, 'text-embedding-3-small'); + + assert.ok(result.chunks.length >= 1, 'Should produce chunks'); + // Plain text should be split on sentence boundaries (semanticSplit=true default) + }); + + it('passes Markdown prose to chunkDocument', () => { + const md = '# Title\n\nThis is a paragraph.\n\n## Section\n\nAnother paragraph here.\n'.repeat(20); + + const result = smartChunk(md, 'text-embedding-3-small'); + + assert.ok(result.chunks.length >= 1, 'Should produce chunks'); + assert.equal(detectCodeLanguage(md), null, 'Markdown should not be detected as code'); + }); +}); + +// ============================================================================ +// TypeScript interface chunking +// ============================================================================ + +describe('TypeScript interfaces and types', () => { + + it('smartChunk: TypeScript interface stays balanced (via smartChunk, not direct astChunk)', () => { + // Note: tree-sitter-javascript cannot fully parse TS interface declarations as one unit. + // When astChunk falls back to chunkDocument for an oversized TS interface, + // it may produce multiple chunks. smartChunk avoids this by using a large + // enough maxChunkSize that the whole interface fits in one chunk. 
+ const code = `interface UserProfile { + id: string; + name: string; + email: string; + createdAt: Date; + metadata?: Record; +}`; + + const result = smartChunk(code, 'text-embedding-3-small'); + // The interface declaration should produce at least one chunk + assert.ok(result.chunks.length >= 1, 'Should produce at least one chunk'); + assertBraceBalanced(result.chunks, 'smartChunk TS interface'); + assert.ok(result.chunks.some(c => c.includes('interface UserProfile')), 'interface should be present'); + }); + + it('smartChunk on TypeScript stays balanced', () => { + const code = `type UserID = string; + +interface Config { + apiKey: string; + timeout: number; +} + +function getConfig(): Config { + return { apiKey: process.env.KEY, timeout: 5000 }; +}`; + + const result = smartChunk(code, 'text-embedding-3-small'); + assertBraceBalanced(result.chunks, 'smartChunk TS'); + }); +}); From 6b40ba74b970ef1c964ebfddb0676ca28b7e8f16 Mon Sep 17 00:00:00 2001 From: James Lin Date: Tue, 5 May 2026 19:53:54 +0800 Subject: [PATCH 2/6] fix(ci): register ast-code-chunking.test.mjs in verify baseline (issue #692) --- scripts/verify-ci-test-manifest.mjs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/verify-ci-test-manifest.mjs b/scripts/verify-ci-test-manifest.mjs index fee475c3..1ba71a47 100644 --- a/scripts/verify-ci-test-manifest.mjs +++ b/scripts/verify-ci-test-manifest.mjs @@ -49,6 +49,8 @@ const EXPECTED_BASELINE = [ { group: "core-regression", runner: "node", file: "test/temporal-awareness.test.mjs", args: ["--test"] }, // Issue #598 regression tests { group: "core-regression", runner: "node", file: "test/store-serialization.test.mjs" }, + // Issue #692: AST-based semantic chunking + { group: "core-regression", runner: "node", file: "test/ast-code-chunking.test.mjs" }, { group: "core-regression", runner: "node", file: "test/access-tracker-retry.test.mjs" }, { group: "core-regression", runner: "node", file: "test/embedder-cache.test.mjs" }, // Issue #629 batch 
embedding fix From 96f7eee3e958f28361bf3adfe52ff6d10b813818 Mon Sep 17 00:00:00 2001 From: James Lin Date: Tue, 5 May 2026 20:25:48 +0800 Subject: [PATCH 3/6] fix(ci): add issue606 to verify baseline to match manifest (PR #713) --- scripts/verify-ci-test-manifest.mjs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/verify-ci-test-manifest.mjs b/scripts/verify-ci-test-manifest.mjs index 1ba71a47..b2f28fed 100644 --- a/scripts/verify-ci-test-manifest.mjs +++ b/scripts/verify-ci-test-manifest.mjs @@ -60,6 +60,8 @@ const EXPECTED_BASELINE = [ { group: "storage-and-schema", runner: "node", file: "test/bulk-store-edge-cases.test.mjs", args: ["--test"] }, { group: "storage-and-schema", runner: "node", file: "test/smart-extractor-bulk-store.test.mjs", args: ["--test"] }, { group: "storage-and-schema", runner: "node", file: "test/smart-extractor-bulk-store-edge-cases.test.mjs", args: ["--test"] }, + // Issue #606 SDK migration Bug 2 regression tests + { group: "core-regression", runner: "node", file: "test/issue606_sdk-migration.test.mjs" }, // Issue #680 regression tests { group: "core-regression", runner: "node", file: "test/memory-reflection-issue680-tdd.test.mjs", args: ["--test"] }, // Issue #492 agentId validation tests From b1b28bfdd0ab5f840ccb84a95fb3cb1df1047610 Mon Sep 17 00:00:00 2001 From: James Lin Date: Tue, 5 May 2026 20:34:57 +0800 Subject: [PATCH 4/6] ci: retrigger From ea8181839c80fd9bfc05450458759aebd5711b22 Mon Sep 17 00:00:00 2001 From: James Lin Date: Tue, 5 May 2026 20:52:59 +0800 Subject: [PATCH 5/6] fix(ci): remove ast-code-chunking from baseline, add issue606 to manifest (PR #713) --- scripts/ci-test-manifest.mjs | 2 -- scripts/verify-ci-test-manifest.mjs | 6 ++---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/scripts/ci-test-manifest.mjs b/scripts/ci-test-manifest.mjs index 44d48747..bdb31ce1 100644 --- a/scripts/ci-test-manifest.mjs +++ b/scripts/ci-test-manifest.mjs @@ -48,8 +48,6 @@ export const CI_TEST_MANIFEST = [ 
{ group: "core-regression", runner: "node", file: "test/temporal-awareness.test.mjs", args: ["--test"] }, // Issue #598 regression tests { group: "core-regression", runner: "node", file: "test/store-serialization.test.mjs" }, - // Issue #692: AST-based semantic chunking - { group: "core-regression", runner: "node", file: "test/ast-code-chunking.test.mjs" }, { group: "core-regression", runner: "node", file: "test/access-tracker-retry.test.mjs" }, { group: "core-regression", runner: "node", file: "test/embedder-cache.test.mjs" }, // Issue #629 batch embedding fix diff --git a/scripts/verify-ci-test-manifest.mjs b/scripts/verify-ci-test-manifest.mjs index 79ba6040..8577fa3f 100644 --- a/scripts/verify-ci-test-manifest.mjs +++ b/scripts/verify-ci-test-manifest.mjs @@ -49,8 +49,6 @@ const EXPECTED_BASELINE = [ { group: "core-regression", runner: "node", file: "test/temporal-awareness.test.mjs", args: ["--test"] }, // Issue #598 regression tests { group: "core-regression", runner: "node", file: "test/store-serialization.test.mjs" }, - // Issue #692: AST-based semantic chunking - { group: "core-regression", runner: "node", file: "test/ast-code-chunking.test.mjs" }, { group: "core-regression", runner: "node", file: "test/access-tracker-retry.test.mjs" }, { group: "core-regression", runner: "node", file: "test/embedder-cache.test.mjs" }, // Issue #629 batch embedding fix @@ -60,10 +58,10 @@ const EXPECTED_BASELINE = [ { group: "storage-and-schema", runner: "node", file: "test/bulk-store-edge-cases.test.mjs", args: ["--test"] }, { group: "storage-and-schema", runner: "node", file: "test/smart-extractor-bulk-store.test.mjs", args: ["--test"] }, { group: "storage-and-schema", runner: "node", file: "test/smart-extractor-bulk-store-edge-cases.test.mjs", args: ["--test"] }, - // Issue #606 SDK migration Bug 2 regression tests - { group: "core-regression", runner: "node", file: "test/issue606_sdk-migration.test.mjs" }, // Issue #680 regression tests { group: "core-regression", 
runner: "node", file: "test/memory-reflection-issue680-tdd.test.mjs", args: ["--test"] }, + // Issue #606 SDK migration Bug 2 regression tests + { group: "core-regression", runner: "node", file: "test/issue606_sdk-migration.test.mjs" }, // Issue #736 recall governance - isRecallUsed() unit tests { group: "core-regression", runner: "node", file: "test/is-recall-used.test.mjs", args: ["--test"] }, // Issue #492 agentId validation tests From 3c603d6c3c925fd2bf8b6dad5161e203cb9b32bf Mon Sep 17 00:00:00 2001 From: James Lin Date: Tue, 5 May 2026 20:54:01 +0800 Subject: [PATCH 6/6] fix(ci): add ast-code-chunking + issue606 to manifest, rebuild verify baseline (merge sync) --- docs/issue-692-ast-chunking-design.md | 256 -------------------------- scripts/ci-test-manifest.mjs | 2 + scripts/verify-ci-test-manifest.mjs | 8 +- 3 files changed, 3 insertions(+), 263 deletions(-) delete mode 100644 docs/issue-692-ast-chunking-design.md diff --git a/docs/issue-692-ast-chunking-design.md b/docs/issue-692-ast-chunking-design.md deleted file mode 100644 index 11653348..00000000 --- a/docs/issue-692-ast-chunking-design.md +++ /dev/null @@ -1,256 +0,0 @@ -# Issue #692 — AST-based Semantic Chunking for Code Blocks - -**Status:** Designed -**Repo:** `memory-lancedb-pro` -**Created:** 2026-05-04 -**Source:** https://github.com/CortexReach/memory-lancedb-pro/issues/692 - ---- - -## Problem Summary - -`chunker.ts` 的 `smartChunk()` 使用純 character-based split,split 邏輯在 `findSplitEnd()`: -- 先找 sentence ending(`.!!?`) -- 找不到 → 找 `\n` -- 找不到 → 找 whitespace - -這對自然語言有效,但對程式碼是災難。JS/TS 函式結尾是 `;` 和 `}`,兩者都不在 target set,導致 function declaration 在 `{` / `}` 之間被隨機切斷。 - -**真實破壞案例:** -``` -Chunk A(~3800字): -"async function handleUserLogin(userId: string, credentials: LoginCredentials): Promise {\n" -" const user = await this.userRepository.findById(userId);\n" -" if (!user) {\n" -" return { success: false, error: 'USER_NOT_FOUND' };" - -Chunk B(~900字): -" }\n" -" const passwordValid = await 
this.verifyPassword(...);" // verifyPassword 跨 Chunk A 和 B -``` - -**問題:** -- Chunk A 結尾在 `return { success: false, error: 'USER_NOT_FOUND' };` — 不完整的 if-block -- Chunk B 開頭是 `}` — 脫離語境的 closing brace -- `verifyPassword` 函式定義被切成兩段 - ---- - -## Verified Facts (gitnexus + source reading) - -### Call Graph (gitnexus verified) -``` -smartChunk (chunker.ts:263-281) - ├─ calls: getCjkRatio (174-183), chunkDocument (194-255) - │ - └─ called by: - ├─ embedSingle (embedder.ts) - ├─ embedMany (embedder.ts) - ├─ testCjkAwareChunkSizing (test/cjk-recursion-regression.test.mjs) - └─ testSmallContextChunking (test/cjk-recursion-regression.test.mjs) - -chunkDocument (chunker.ts:194-255) - ├─ calls: findSplitEnd (97-143), sliceTrimWithIndices (146-163) - └─ called by: smartChunk - -findSplitEnd (chunker.ts:97-143) ← 問題根因所在 -``` - -### Existing Coverage -- **測試:** 只有 `test/cjk-recursion-regression.test.mjs` 呼叫 `smartChunk`,**沒有任何專門測試 chunker 破壞案例的測試檔案** -- **依賴:** 無 tree-sitter -- **Config:** `maxChunkSize`, `overlapSize`, `minChunkSize`, `semanticSplit`, `maxLinesPerChunk` - ---- - -## Solution: astChunk() - -### Architecture - -``` -smartChunk(text) - ├─ detectCodeLanguage(text) === null → chunkDocument() [現有 character split] - └─ detectCodeLanguage(text) === 'js'/'ts' → astChunk(text, lang, config) - === 'py' → astChunk(text, 'python', config) - === 其他 → chunkDocument() [fallback] -``` - -### 1. `detectCodeLanguage(text) → CodeLanguage | null` - -取前 200 字做偵測: - -| 語言 | Pattern | -|------|---------| -| JS/TS | `/\b(function\|const\s\|let\s\|var\s\|=>\|import\s\|export\s\|interface\s\|type\s\|class\s)/` | -| Python | `/\bdef\s\|class\s\|import\s\|from\s\|print\(/` | -| Go | `/\bfunc\s\|package\s\|import\s"/` | -| Rust | `/\bfn\s\|impl\s\|pub\s\|let\s+mut\s/` | - -### 2. 
`astChunk(code, language, config) → ChunkResult` - -```typescript -import Parser from 'tree-sitter'; -import JavaScript from 'tree-sitter-javascript'; - -export function astChunk( - code: string, - language: CodeLanguage, - config: ChunkerConfig -): ChunkResult { - const parser = new Parser(); - switch (language) { - case 'javascript': - case 'typescript': - parser.setLanguage(JavaScript); - break; - case 'python': - parser.setLanguage(Python); - break; - default: - return chunkDocument(code, config); - } - - const tree = parser.parse(code); - const chunks: string[] = []; - const metadatas: ChunkMetadata[] = []; - - // Walk top-level nodes - const root = tree.rootNode; - for (const child of root.children) { - if (!isDeclarationNode(child)) continue; - const text = code.slice(child.startIndex, child.endIndex); - if (text.length <= config.maxChunkSize) { - chunks.push(text); - metadatas.push({ startIndex: child.startIndex, endIndex: child.endIndex, length: text.length }); - } else { - // Sub-split within this declaration at statement level - const subResult = subChunk(text, config); - chunks.push(...subResult.chunks); - metadatas.push(...subResult.metadatas); - } - } - - return { chunks, metadatas, totalOriginalLength: code.length, chunkCount: chunks.length }; -} -``` - -### 3. Supported Node Types (Phase 1) - -| 語言 | P0 節點 | -|------|---------| -| JS/TS | `function_declaration`, `arrow_function`, `class_declaration`, `method_definition`, `export_statement`, `interface_declaration`, `type_alias_declaration`, `lexical_declaration` | -| Python | `function_definition`, `class_definition`, `decorated_definition` | -| Go | `function_declaration`, `method_declaration` (P2) | -| Rust | `function_item`, `impl_item` (P2) | - -### 4. Config Extension - -```typescript -interface ChunkerConfig { - // ... 現有五個欄位 ... - astAwareCodeSplit?: boolean; // NEW: default true -} -``` - -### 5. 
Dependency Changes - -```json -{ - "dependencies": { - "tree-sitter": "^0.21.1", - "tree-sitter-javascript": "^0.21.0", - "tree-sitter-python": "^0.21.0" - } -} -``` - ---- - -## Files to Change - -| 檔案 | 變更 | -|------|------| -| `src/chunker.ts` | + `detectCodeLanguage()`, + `astChunk()`, + `subChunk()`, 修改 `smartChunk()` 路由, + `astAwareCodeSplit` config | -| `src/chunker.test.ts` | **全新建立**(從破壞案例反轉)| -| `package.json` | + tree-sitter, tree-sitter-javascript, tree-sitter-python | - ---- - -## Tests (New File) - -```typescript -describe('AST-aware code chunking', () => { - it('should keep { and } balanced in every chunk', () => { - const code = `async function handleUserLogin(userId: string) { - const user = await this.userRepository.findById(userId); - if (!user) { return { success: false }; } - const session = await this.createSession(user); - return { success: true, session }; -} -async function verifyPassword(input: string): Promise { - return bcrypt.compare(input, this.hash); -}`; - const result = smartChunk(code, 'jina-embeddings-v5'); - for (const chunk of result.chunks) { - const opens = (chunk.match(/{/g) || []).length; - const closes = (chunk.match(/}/g) || []).length; - expect(opens).toBe(closes); - } - }); - - it('should not split function mid-body', () => { - const result = smartChunk(code, 'jina-embeddings-v5'); - const hasMiddleOfFunction = result.chunks.some(c => - c.startsWith('}') || c.endsWith('{') - ); - expect(hasMiddleOfFunction).toBe(false); - }); - - it('should keep complete function as one chunk', () => { - const result = smartChunk(code, 'jina-embeddings-v5'); - const verifyFn = result.chunks.find(c => c.includes('verifyPassword')); - expect(verifyFn).toBeDefined(); - expect(verifyFn).toContain('bcrypt.compare'); - expect(verifyFn).not.toContain('handleUserLogin'); - }); -}); -``` - ---- - -## Phase Plan - -``` -Phase 1(P0 — MVP): - ├─ detectCodeLanguage()(JS/TS/Python) - ├─ astChunk() — JS/TS only - ├─ astChunk() — Python - ├─ Unit 
tests(破壞案例 → 通過案例) - └─ Config: astAwareCodeSplit default = true - -Phase 2(P1): - ├─ Sub-split within oversized declarations(statement level) - ├─ Go、Rust support - └─ Benchmark: 向量品質 vs. character split - -Phase 3(P2): - └─ Embedding quality evaluation(問答對比) -``` - ---- - -## Q&A - -| Q | A | -|---|---| -| tree-sitter 值得嗎? | **值得**。~1MB runtime,sub-ms parse,能處理巢狀結構/decorator/subclass,比 regex 精準一個數量級。 | -| 預設開? | **預設開**。破壞案例太明確,等使用者手動開等於功能永遠不被用。`astAwareCodeSplit: false` 保留給需要復現舊行為的測試。 | -| 非主流語言? | **Phase 1 fallback**。現有 sentence-ending split 對自然語言有效;非主流語言佔比低,Phase 1 fallback 合理。 | - ---- - -## Reference - -- Issue: https://github.com/CortexReach/memory-lancedb-pro/issues/692 -- Reference impl: `zilliztech/claude-context` ast-splitter.ts -- Existing chunker: `src/chunker.ts` (284 lines) diff --git a/scripts/ci-test-manifest.mjs b/scripts/ci-test-manifest.mjs index bdb31ce1..44d48747 100644 --- a/scripts/ci-test-manifest.mjs +++ b/scripts/ci-test-manifest.mjs @@ -48,6 +48,8 @@ export const CI_TEST_MANIFEST = [ { group: "core-regression", runner: "node", file: "test/temporal-awareness.test.mjs", args: ["--test"] }, // Issue #598 regression tests { group: "core-regression", runner: "node", file: "test/store-serialization.test.mjs" }, + // Issue #692: AST-based semantic chunking + { group: "core-regression", runner: "node", file: "test/ast-code-chunking.test.mjs" }, { group: "core-regression", runner: "node", file: "test/access-tracker-retry.test.mjs" }, { group: "core-regression", runner: "node", file: "test/embedder-cache.test.mjs" }, // Issue #629 batch embedding fix diff --git a/scripts/verify-ci-test-manifest.mjs b/scripts/verify-ci-test-manifest.mjs index 8577fa3f..689a8ef0 100644 --- a/scripts/verify-ci-test-manifest.mjs +++ b/scripts/verify-ci-test-manifest.mjs @@ -47,24 +47,18 @@ const EXPECTED_BASELINE = [ { group: "core-regression", runner: "node", file: "test/preference-slots.test.mjs", args: ["--test"] }, { group: "core-regression", runner: "node", 
file: "test/is-latest-auto-supersede.test.mjs" }, { group: "core-regression", runner: "node", file: "test/temporal-awareness.test.mjs", args: ["--test"] }, - // Issue #598 regression tests { group: "core-regression", runner: "node", file: "test/store-serialization.test.mjs" }, + { group: "core-regression", runner: "node", file: "test/ast-code-chunking.test.mjs" }, { group: "core-regression", runner: "node", file: "test/access-tracker-retry.test.mjs" }, { group: "core-regression", runner: "node", file: "test/embedder-cache.test.mjs" }, - // Issue #629 batch embedding fix { group: "llm-clients-and-auth", runner: "node", file: "test/embedder-ollama-batch-routing.test.mjs" }, - // Issue #665 bulkStore tests { group: "storage-and-schema", runner: "node", file: "test/bulk-store.test.mjs", args: ["--test"] }, { group: "storage-and-schema", runner: "node", file: "test/bulk-store-edge-cases.test.mjs", args: ["--test"] }, { group: "storage-and-schema", runner: "node", file: "test/smart-extractor-bulk-store.test.mjs", args: ["--test"] }, { group: "storage-and-schema", runner: "node", file: "test/smart-extractor-bulk-store-edge-cases.test.mjs", args: ["--test"] }, - // Issue #680 regression tests { group: "core-regression", runner: "node", file: "test/memory-reflection-issue680-tdd.test.mjs", args: ["--test"] }, - // Issue #606 SDK migration Bug 2 regression tests { group: "core-regression", runner: "node", file: "test/issue606_sdk-migration.test.mjs" }, - // Issue #736 recall governance - isRecallUsed() unit tests { group: "core-regression", runner: "node", file: "test/is-recall-used.test.mjs", args: ["--test"] }, - // Issue #492 agentId validation tests { group: "core-regression", runner: "node", file: "test/agentid-validation.test.mjs", args: ["--test"] }, { group: "core-regression", runner: "node", file: "test/command-reflection-guard.test.mjs", args: ["--test"] }, ];