diff --git a/package-lock.json b/package-lock.json index ee4ecef2..c5d6f5d2 100644 --- a/package-lock.json +++ b/package-lock.json @@ -14,7 +14,10 @@ "apache-arrow": "18.1.0", "json5": "^2.2.3", "openai": "^6.21.0", - "proper-lockfile": "^4.1.2" + "proper-lockfile": "^4.1.2", + "tree-sitter": "^0.25.0", + "tree-sitter-javascript": "^0.25.0", + "tree-sitter-python": "^0.25.0" }, "devDependencies": { "commander": "^14.0.0", @@ -430,6 +433,26 @@ "integrity": "sha512-TwuEnCnxbc3rAvhf/LbG7tJUDzhqXyFnv3dtzLOPgCG/hODL7WFnsbwktkD7yUV0RrreP/l1PALq/YSg6VvjlA==", "license": "MIT" }, + "node_modules/node-addon-api": { + "version": "8.7.0", + "resolved": "https://registry.npmjs.org/node-addon-api/-/node-addon-api-8.7.0.tgz", + "integrity": "sha512-9MdFxmkKaOYVTV+XVRG8ArDwwQ77XIgIPyKASB1k3JPq3M8fGQQQE3YpMOrKm6g//Ktx8ivZr8xo1Qmtqub+GA==", + "license": "MIT", + "engines": { + "node": "^18 || ^20 || >= 21" + } + }, + "node_modules/node-gyp-build": { + "version": "4.8.4", + "resolved": "https://registry.npmjs.org/node-gyp-build/-/node-gyp-build-4.8.4.tgz", + "integrity": "sha512-LA4ZjwlnUblHVgq0oBF3Jl/6h/Nvs5fzBLwdEF4nuxnFdsfajde4WfxtJr3CaiH+F6ewcIB/q4jQ4UzPyid+CQ==", + "license": "MIT", + "bin": { + "node-gyp-build": "bin.js", + "node-gyp-build-optional": "optional.js", + "node-gyp-build-test": "build-test.js" + } + }, "node_modules/openai": { "version": "6.22.0", "resolved": "https://registry.npmjs.org/openai/-/openai-6.22.0.tgz", @@ -517,6 +540,55 @@ "node": ">=12.17" } }, + "node_modules/tree-sitter": { + "version": "0.25.0", + "resolved": "https://registry.npmjs.org/tree-sitter/-/tree-sitter-0.25.0.tgz", + "integrity": "sha512-PGZZzFW63eElZJDe/b/R/LbsjDDYJa5UEjLZJB59RQsMX+fo0j54fqBPn1MGKav/QNa0JR0zBiVaikYDWCj5KQ==", + "hasInstallScript": true, + "license": "MIT", + "dependencies": { + "node-addon-api": "^8.3.0", + "node-gyp-build": "^4.8.4" + } + }, + "node_modules/tree-sitter-javascript": { + "version": "0.25.0", + "resolved": 
"https://registry.npmjs.org/tree-sitter-javascript/-/tree-sitter-javascript-0.25.0.tgz", + "integrity": "sha512-1fCbmzAskZkxcZzN41sFZ2br2iqTYP3tKls1b/HKGNPQUVOpsUxpmGxdN/wMqAk3jYZnYBR1dd/y/0avMeU7dw==", + "hasInstallScript": true, + "license": "MIT", + "dependencies": { + "node-addon-api": "^8.3.1", + "node-gyp-build": "^4.8.4" + }, + "peerDependencies": { + "tree-sitter": "^0.25.0" + }, + "peerDependenciesMeta": { + "tree-sitter": { + "optional": true + } + } + }, + "node_modules/tree-sitter-python": { + "version": "0.25.0", + "resolved": "https://registry.npmjs.org/tree-sitter-python/-/tree-sitter-python-0.25.0.tgz", + "integrity": "sha512-eCmJx6zQa35GxaCtQD+wXHOhYqBxEL+bp71W/s3fcDMu06MrtzkVXR437dRrCrbrDbyLuUDJpAgycs7ncngLXw==", + "hasInstallScript": true, + "license": "MIT", + "dependencies": { + "node-addon-api": "^8.5.0", + "node-gyp-build": "^4.8.4" + }, + "peerDependencies": { + "tree-sitter": "^0.25.0" + }, + "peerDependenciesMeta": { + "tree-sitter": { + "optional": true + } + } + }, "node_modules/tslib": { "version": "2.8.1", "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz", diff --git a/package.json b/package.json index fbcb9d98..c091d619 100644 --- a/package.json +++ b/package.json @@ -43,7 +43,10 @@ "apache-arrow": "18.1.0", "json5": "^2.2.3", "openai": "^6.21.0", - "proper-lockfile": "^4.1.2" + "proper-lockfile": "^4.1.2", + "tree-sitter": "^0.25.0", + "tree-sitter-javascript": "^0.25.0", + "tree-sitter-python": "^0.25.0" }, "openclaw": { "extensions": [ diff --git a/scripts/ci-test-manifest.mjs b/scripts/ci-test-manifest.mjs index bdb31ce1..44d48747 100644 --- a/scripts/ci-test-manifest.mjs +++ b/scripts/ci-test-manifest.mjs @@ -48,6 +48,8 @@ export const CI_TEST_MANIFEST = [ { group: "core-regression", runner: "node", file: "test/temporal-awareness.test.mjs", args: ["--test"] }, // Issue #598 regression tests { group: "core-regression", runner: "node", file: "test/store-serialization.test.mjs" }, + // Issue #692: AST-based semantic 
chunking + { group: "core-regression", runner: "node", file: "test/ast-code-chunking.test.mjs" }, { group: "core-regression", runner: "node", file: "test/access-tracker-retry.test.mjs" }, { group: "core-regression", runner: "node", file: "test/embedder-cache.test.mjs" }, // Issue #629 batch embedding fix diff --git a/scripts/verify-ci-test-manifest.mjs b/scripts/verify-ci-test-manifest.mjs index a5360a80..689a8ef0 100644 --- a/scripts/verify-ci-test-manifest.mjs +++ b/scripts/verify-ci-test-manifest.mjs @@ -47,22 +47,18 @@ const EXPECTED_BASELINE = [ { group: "core-regression", runner: "node", file: "test/preference-slots.test.mjs", args: ["--test"] }, { group: "core-regression", runner: "node", file: "test/is-latest-auto-supersede.test.mjs" }, { group: "core-regression", runner: "node", file: "test/temporal-awareness.test.mjs", args: ["--test"] }, - // Issue #598 regression tests { group: "core-regression", runner: "node", file: "test/store-serialization.test.mjs" }, + { group: "core-regression", runner: "node", file: "test/ast-code-chunking.test.mjs" }, { group: "core-regression", runner: "node", file: "test/access-tracker-retry.test.mjs" }, { group: "core-regression", runner: "node", file: "test/embedder-cache.test.mjs" }, - // Issue #629 batch embedding fix { group: "llm-clients-and-auth", runner: "node", file: "test/embedder-ollama-batch-routing.test.mjs" }, - // Issue #665 bulkStore tests { group: "storage-and-schema", runner: "node", file: "test/bulk-store.test.mjs", args: ["--test"] }, { group: "storage-and-schema", runner: "node", file: "test/bulk-store-edge-cases.test.mjs", args: ["--test"] }, { group: "storage-and-schema", runner: "node", file: "test/smart-extractor-bulk-store.test.mjs", args: ["--test"] }, { group: "storage-and-schema", runner: "node", file: "test/smart-extractor-bulk-store-edge-cases.test.mjs", args: ["--test"] }, - // Issue #680 regression tests { group: "core-regression", runner: "node", file: 
"test/memory-reflection-issue680-tdd.test.mjs", args: ["--test"] }, - // Issue #736 recall governance - isRecallUsed() unit tests + { group: "core-regression", runner: "node", file: "test/issue606_sdk-migration.test.mjs" }, { group: "core-regression", runner: "node", file: "test/is-recall-used.test.mjs", args: ["--test"] }, - // Issue #492 agentId validation tests { group: "core-regression", runner: "node", file: "test/agentid-validation.test.mjs", args: ["--test"] }, { group: "core-regression", runner: "node", file: "test/command-reflection-guard.test.mjs", args: ["--test"] }, ]; diff --git a/src/chunker.ts b/src/chunker.ts index 8bb4dee6..a6aca426 100644 --- a/src/chunker.ts +++ b/src/chunker.ts @@ -37,6 +37,8 @@ export interface ChunkerConfig { semanticSplit: boolean; /** Max lines per chunk before we try to split earlier on a line boundary. */ maxLinesPerChunk: number; + /** Use AST-aware splitting for code blocks (default: true). */ + astAwareCodeSplit?: boolean; } // Common embedding context limits (provider/model specific). 
These are typically
@@ -188,6 +190,224 @@ function getCjkRatio(text: string): number {
 const CJK_CHAR_TOKEN_DIVISOR = 2.5;
 const CJK_RATIO_THRESHOLD = 0.3;
 
+// ============================================================================
+// AST-aware Code Chunking
+// ============================================================================
+
+export type CodeLanguage = 'javascript' | 'typescript' | 'python' | 'go' | 'rust';
+
+/** Number of leading characters that detectCodeLanguage inspects. */
+const CODE_DETECTION_SAMPLE_CHARS = 400;
+
+// Checked in order; the first matching pattern decides the language.
+const CODE_LANGUAGE_PATTERNS: Array<{ pattern: RegExp; lang: CodeLanguage }> = [
+  // Python: line-anchored constructs only. Bare import/from/class keywords also
+  // appear in JS/TS and in English prose, so they must not match on their own.
+  {
+    pattern:
+      /^\s*(?:(?:async\s+)?def\s+\w+\s*\(|class\s+\w+\s*[(:]|from\s+[\w.]+\s+import\s|import\s+[\w.]+(?:\s*,\s*[\w.]+)*(?:\s+as\s+\w+)?\s*$|print\()/m,
+    lang: 'python',
+  },
+  // Go: func and package keywords
+  {
+    pattern: /\b(func\s|package\s|import\s")/,
+    lang: 'go',
+  },
+  // Rust: fn/impl/pub are distinct (grouped so \b applies to every alternative)
+  {
+    pattern: /\b(fn\s|impl\s|pub\s|let\s+mut\s)/,
+    lang: 'rust',
+  },
+  // TypeScript: interface / type alias / : type annotations (check before JS 'function')
+  {
+    pattern: /\b(interface\s|type\s+|:\s*(?:string|number|boolean|unknown|never|any|void|object|Error|Promise|Record|Array|Map|Set)\b)/,
+    lang: 'typescript',
+  },
+  // JavaScript / TypeScript: function, const/let/var, arrow, import/export, class
+  {
+    pattern: /\b(function|const\s|let\s|var\s|=>|import\s|export\s|class\s)/,
+    lang: 'javascript',
+  },
+];
+
+/**
+ * Detect if text is code and return the language, or null if not code.
+ * Uses only the first CODE_DETECTION_SAMPLE_CHARS (400) chars of the text.
+ */
+export function detectCodeLanguage(text: string): CodeLanguage | null {
+  const sample = text.slice(0, CODE_DETECTION_SAMPLE_CHARS);
+  for (const { pattern, lang } of CODE_LANGUAGE_PATTERNS) {
+    if (pattern.test(sample)) return lang;
+  }
+  return null;
+}
+
+// Supported top-level declaration node types per language
+const JS_DECLARATION_TYPES = new Set([
+  'function_declaration',
+  'class_declaration',
+  'method_definition',
+  'arrow_function',
+  'export_statement',
+  'export_default_declaration',
+  'interface_declaration',
+  'type_alias_declaration',
+  'lexical_declaration', // const/let declarations
+  'variable_declaration',
+]);
+
+const PYTHON_DECLARATION_TYPES = new Set([
+  'function_definition',
+  'class_definition',
+  'decorated_definition',
+]);
+
+/** True when node.type is listed as a declaration type for lang. */
+function isDeclarationNode(node: { type: string }, lang: CodeLanguage): boolean {
+  if (lang === 'javascript' || lang === 'typescript') {
+    return JS_DECLARATION_TYPES.has(node.type);
+  }
+  if (lang === 'python') return PYTHON_DECLARATION_TYPES.has(node.type);
+  return false;
+}
+
+/**
+ * AST-aware chunker for code. Parses the code with tree-sitter and splits
+ * on top-level declaration boundaries (function, class, etc.) instead of
+ * arbitrary character positions.
+ *
+ * Each chunk is an exact slice of code. Non-declaration nodes (comments,
+ * imports, ...) are attached to the declaration that follows them; trailing
+ * ones are chunked with chunkDocument. A declaration is never split, so a
+ * chunk may exceed config.maxChunkSize (Issue #692).
+ *
+ * Falls back to chunkDocument(code, config) when tree-sitter or the grammar
+ * cannot be loaded, the language has no grammar wired up here, parsing throws,
+ * the tree has a top-level ERROR node, or no chunk was produced.
+ *
+ * NOTE: This function is synchronous to match the sync signature of smartChunk.
+ * tree-sitter is loaded via require() with a try-catch fallback.
+ *
+ * @param code - Source text to chunk.
+ * @param language - Language of code, e.g. from detectCodeLanguage.
+ * @param config - Chunker settings, forwarded to chunkDocument on fallback.
+ * @returns Chunks with metadata whose startIndex/endIndex index into code.
+ */
+export function astChunk(
+  code: string,
+  language: CodeLanguage,
+  config: ChunkerConfig
+): ChunkResult {
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any
+  let TreeSitterParser: any;
+  let grammar: unknown;
+
+  try {
+    // tree-sitter exports Parser as the default export (module.exports = Parser)
+    // eslint-disable-next-line @typescript-eslint/no-var-requires
+    TreeSitterParser = require('tree-sitter');
+
+    if (language === 'javascript' || language === 'typescript') {
+      // TypeScript shares the JavaScript grammar; TS-only syntax is expected to
+      // surface as ERROR nodes, which the top-level ERROR check below handles.
+      // eslint-disable-next-line @typescript-eslint/no-var-requires
+      grammar = require('tree-sitter-javascript');
+    } else if (language === 'python') {
+      // eslint-disable-next-line @typescript-eslint/no-var-requires
+      grammar = require('tree-sitter-python');
+    } else {
+      // Unsupported language — fall back
+      return chunkDocument(code, config);
+    }
+  } catch {
+    // tree-sitter not installed — fall back to character-based chunking
+    return chunkDocument(code, config);
+  }
+
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any
+  let root: any;
+  try {
+    // Constructing the parser stays inside the try so a broken native binding
+    // degrades to the fallback instead of throwing out of astChunk.
+    const parser = new TreeSitterParser();
+    parser.setLanguage(grammar);
+    root = parser.parse(code).rootNode;
+  } catch {
+    return chunkDocument(code, config);
+  }
+
+  // If there are ERROR nodes at the top level, the language parser likely does not
+  // support this syntax (e.g., TypeScript interface parsed by tree-sitter-javascript).
+  // Fall back to chunkDocument to avoid producing broken/incomplete chunks.
+  if (root.children.some((c: { type: string }) => c.type === 'ERROR')) {
+    return chunkDocument(code, config);
+  }
+
+  const chunks: string[] = [];
+  const metadatas: ChunkMetadata[] = [];
+
+  // Offset where pending non-declaration content (comments, imports, etc.) begins,
+  // or null when nothing is pending; pendingEnd is where that content stops.
+  let pendingStart: number | null = null;
+  let pendingEnd = 0;
+
+  // Walk top-level children
+  for (const child of root.children) {
+    if (child.endIndex <= child.startIndex) continue; // zero-width node
+
+    if (!isDeclarationNode(child, language)) {
+      if (pendingStart === null) pendingStart = child.startIndex;
+      pendingEnd = child.endIndex;
+      continue;
+    }
+
+    // Start at the first pending node so the chunk keeps the original text between
+    // nodes and the metadata offsets describe exactly what was emitted.
+    const startIndex = pendingStart === null ? child.startIndex : pendingStart;
+    pendingStart = null;
+
+    // A declaration is kept whole even above config.maxChunkSize: splitting
+    // mid-declaration would break { } balance (Issue #692).
+    // Sub-splitting at statement level is Phase 2 work.
+    chunks.push(code.slice(startIndex, child.endIndex));
+    metadatas.push({
+      startIndex,
+      endIndex: child.endIndex,
+      length: child.endIndex - startIndex,
+    });
+  }
+
+  // Trailing non-declaration content (e.g., trailing comments with no following decl)
+  // goes through chunkDocument so it is still sized by the regular chunker.
+  if (pendingStart !== null) {
+    const offset = pendingStart;
+    const trailing = chunkDocument(code.slice(offset, pendingEnd), config);
+    for (let i = 0; i < trailing.chunks.length; i++) {
+      const meta = trailing.metadatas[i];
+      chunks.push(trailing.chunks[i]);
+      // NOTE(review): assumes chunkDocument offsets are relative to its input;
+      // shifted here so they index into code like the entries above. Confirm.
+      metadatas.push({
+        ...meta,
+        startIndex: meta.startIndex + offset,
+        endIndex: meta.endIndex + offset,
+      });
+    }
+  }
+
+  // If we got nothing (e.g. empty file), fall back
+  if (chunks.length === 0) {
+    return chunkDocument(code, config);
+  }
+
+  return {
+    chunks,
+    metadatas,
+    totalOriginalLength: code.length,
+    chunkCount: chunks.length,
+  };
+}
+
 // ============================================================================
 // Chunking Core
 // ============================================================================
@@ -276,8 +496,17 @@ export function smartChunk(text: string, embedderModel?: string): ChunkResult {
     minChunkSize: Math.max(100, Math.floor(base * 0.1 / divisor)),
     semanticSplit: true,
     maxLinesPerChunk: 50,
+    astAwareCodeSplit: true,
   };
 
+  // AST-aware code path: on unless astAwareCodeSplit is explicitly false (default: true)
+  if (config.astAwareCodeSplit !== false) {
+    const lang = detectCodeLanguage(text);
+    if (lang !== null) {
+      return astChunk(text, lang, config);
+    }
+  }
+
   return chunkDocument(text, config);
 }
 
diff --git a/test/ast-code-chunking.test.mjs b/test/ast-code-chunking.test.mjs
new file mode 100644
index 00000000..6d0fa2ca
--- /dev/null
+++ b/test/ast-code-chunking.test.mjs
@@ -0,0 +1,296 @@
+/**
+ * AST-aware Code Chunking Tests (Issue #692)
+ *
+ * Verifies that code declarations (functions, classes) are NOT split mid-
+ * declaration, which was breaking { } balance when the old character-based
+ * splitter cut through the middle of a function body.
+ */
+
+import { describe, it } from 'node:test';
+import assert from 'node:assert/strict';
+import jitiFactory from 'jiti';
+
+const jiti = jitiFactory(import.meta.url, { interopDefault: true });
+const { detectCodeLanguage, astChunk, smartChunk, DEFAULT_CHUNKER_CONFIG } = jiti('../src/chunker.ts');
+
+// ============================================================================
+// detectCodeLanguage
+// ============================================================================
+
+describe('detectCodeLanguage', () => {
+  it('detects JavaScript function', () => {
+    const code = 'async function handleUserLogin(userId, password) {';
+    assert.equal(detectCodeLanguage(code), 'javascript');
+  });
+
+  it('detects TypeScript interface', () => {
+    const code = 'interface UserProfile { name: string; age: number; }';
+    assert.equal(detectCodeLanguage(code), 'typescript');
+  });
+
+  it('detects Python function', () => {
+    const code = 'def verify_password(password: str, hashed: bytes) -> bool:';
+    assert.equal(detectCodeLanguage(code), 'python');
+  });
+
+  it('detects Go function', () => {
+    const code = 'func handleLogin(w http.ResponseWriter, r *http.Request) {';
+    assert.equal(detectCodeLanguage(code), 'go');
+  });
+
+  it('detects Rust function', () => {
+    const code = 'fn verify_password(password: &str, hash: &str) -> bool {';
+    assert.equal(detectCodeLanguage(code), 'rust');
+  });
+
+  it('returns null for plain text', () => {
+    const text = 'This is a plain English sentence with no code markers.';
+    assert.equal(detectCodeLanguage(text), null);
+  });
+
+  it('returns null for Markdown prose', () => {
+    const md = '# Heading\n\nThis is a paragraph with **bold** text.';
+    assert.equal(detectCodeLanguage(md), null);
+  });
+
+  it('uses only first 400 chars to avoid comment noise', () => {
+    // Short comment so 'function' appears within first 400 chars of the sample
+    const commentLine = '// This is a comment\n'; // 21 chars including the newline
+    const code = commentLine.repeat(15) + 'function foo() {}'; // 315 chars of comments + function
+    assert.equal(detectCodeLanguage(code), 'javascript');
+  });
+});
+
+// ============================================================================
+// Brace balance helper
+// ============================================================================
+
+/** Count net open braces inside a string. */
+function braceDelta(s) {
+  let d = 0;
+  for (const ch of s) {
+    if (ch === '{') d++;
+    else if (ch === '}') d--;
+  }
+  return d;
+}
+
+/** Check all chunks are brace-balanced. */
+function assertBraceBalanced(chunks, label) {
+  const deltas = chunks.map(c => braceDelta(c));
+  const total = deltas.reduce((a, b) => a + b, 0);
+  assert.equal(total, 0, `${label}: unbalanced braces across chunks (net=${total}, deltas=${JSON.stringify(deltas)})`);
+  for (let i = 0; i < deltas.length; i++) {
+    assert(deltas[i] >= 0,
+      `${label}: chunk[${i}] closes more braces than it opens (delta=${deltas[i]})`);
+  }
+}
+
+// ============================================================================
+// Issue #692 — core destructive cases
+// ============================================================================
+
+describe('Issue #692: code functions must not be split mid-declaration', () => {
+
+  it('verifies that a simple async function is kept whole', () => {
+    const code = `async function verifyPassword(password, hash) {
+  const match = await bcrypt.compare(password, hash);
+  return match;
+}`;
+
+    // Very small maxChunkSize to force splitting — old splitter would cut mid-function
+    const config = { ...DEFAULT_CHUNKER_CONFIG, maxChunkSize: 60, minChunkSize: 10, semanticSplit: false };
+    const result = astChunk(code, 'javascript', config);
+
+    // Function body should not be split mid-declaration
+    const splitInsideFunction = result.chunks.some(chunk => {
+      // Should not have "{" without corresponding "}"
+      const d = braceDelta(chunk);
+      return d > 0; // opens braces but never closes
+    });
+    assert.ok(!splitInsideFunction, 'Should not split inside a function declaration');
+    assertBraceBalanced(result.chunks, 'verifyPassword');
+  });
+
+  it('verifies that a long function is NOT split mid-function (maxChunkSize < function length)', () => {
+    // This function is ~250 chars — set maxChunkSize=120 to force the issue.
+    // Oversized functions are kept as ONE atomic chunk (no mid-function split).
+    const code = `async function handleUserLogin(userId, password) {
+  const user = await db.users.findOne({ id: userId });
+  if (!user) throw new Error('User not found');
+  const match = await bcrypt.compare(password, user.hash);
+  return match;
+}`;
+
+    const config = { ...DEFAULT_CHUNKER_CONFIG, maxChunkSize: 120, minChunkSize: 40, semanticSplit: false };
+    const result = astChunk(code, 'javascript', config);
+
+    assertBraceBalanced(result.chunks, 'handleUserLogin');
+    // Should be 1 chunk — entire function kept intact
+    assert.equal(result.chunks.length, 1, `Expected 1 chunk (entire function), got ${result.chunks.length}`);
+  });
+
+  it('verifies that multiple small functions are each kept whole', () => {
+    const code = `async function verifyPassword(password, hash) {
+  return await bcrypt.compare(password, hash);
+}
+
+async function hashPassword(password) {
+  return await bcrypt.hash(password, 10);
+}
+
+export async function createUser(name, email, password) {
+  const hash = await hashPassword(password);
+  return await db.users.create({ name, email, hash });
+}`;
+
+    const config = { ...DEFAULT_CHUNKER_CONFIG, maxChunkSize: 150, minChunkSize: 40, semanticSplit: false };
+    const result = astChunk(code, 'javascript', config);
+
+    assertBraceBalanced(result.chunks, 'multiple functions');
+    // All three functions should appear intact in some chunk
+    assert.ok(result.chunks.some(c => c.includes('function verifyPassword')), 'verifyPassword missing');
+    assert.ok(result.chunks.some(c => c.includes('function hashPassword')), 'hashPassword missing');
+    assert.ok(result.chunks.some(c => c.includes('function createUser')), 'createUser missing');
+  });
+
+  it('smartChunk: entire JavaScript file with functions stays brace-balanced', () => {
+    const code = `const SPEC = {
+  name: 'auth',
+  version: '1.0.0',
+};
+
+async function login(email, password) {
+  const user = await db.findUser(email);
+  const ok = await bcrypt.compare(password, user.hash);
+  if (!ok) throw new Error('Invalid credentials');
+  return { token: signToken(user.id) };
+}
+
+async function logout(token) {
+  invalidateToken(token);
+}`;
+
+    const result = smartChunk(code, 'text-embedding-3-small');
+    assertBraceBalanced(result.chunks, 'smartChunk JS');
+  });
+
+  it('smartChunk: Python function stays syntactically coherent', () => {
+    const code = `def verify_password(password: str, hashed: bytes) -> bool:
+    return pwd_context.verify(password, hashed)
+
+def hash_password(password: str) -> str:
+    return pwd_context.hash(password)`;
+
+    const result = smartChunk(code, 'text-embedding-3-small');
+    assert.ok(result.chunks.length >= 1, 'Should produce at least one chunk');
+    // Only non-emptiness is asserted here; function completeness is not checked yet
+    assert.ok(result.chunks.every(c => c.trim().length > 0), 'No empty chunks');
+  });
+});
+
+// ============================================================================
+// astChunk — fallback & edge cases
+// ============================================================================
+
+describe('astChunk fallback behavior', () => {
+
+  it('falls back to chunkDocument for empty input', () => {
+    // An empty string has no top-level declarations, so astChunk has nothing to emit
+    const code = '';
+    const config = { ...DEFAULT_CHUNKER_CONFIG, maxChunkSize: 50 };
+    const result = astChunk(code, 'javascript', config);
+    // Should return a valid ChunkResult (fallback path)
+    assert.ok('chunks' in result);
+    assert.ok('chunkCount' in result);
+  });
+
+  it('returns chunkDocument result when language is unsupported', () => {
+    const code = 'fn main() {}'; // not JS/TS/Python
+    const config = { ...DEFAULT_CHUNKER_CONFIG, maxChunkSize: 50 };
+    const result = astChunk(code, 'rust', config);
+    // Rust is not yet supported in astChunk — falls back
+    assert.ok('chunks' in result);
+  });
+
+  it('handles an oversized single declaration as one atomic chunk (brace-balanced)', () => {
+    // A very long function that exceeds maxChunkSize — should stay as ONE chunk
+    const body = ' return x + y;\n'.repeat(200);
+    const code = `function processData(x, y) {\n${body}}`;
+
+    const config = { ...DEFAULT_CHUNKER_CONFIG, maxChunkSize: 200, minChunkSize: 50, semanticSplit: false };
+    const result = astChunk(code, 'javascript', config);
+
+    // Should be 1 chunk — entire function kept as one
+    assert.equal(result.chunks.length, 1, `Expected 1 chunk, got ${result.chunks.length}`);
+    assertBraceBalanced(result.chunks, 'oversized function atomic chunk');
+  });
+});
+
+// ============================================================================
+// smartChunk — non-code text unchanged
+// ============================================================================
+
+describe('smartChunk preserves non-code behavior', () => {
+
+  it('passes plain English text to chunkDocument (not astChunk)', () => {
+    const text = 'This is a plain English paragraph. It has sentences. They end with periods. '.repeat(30);
+
+    const result = smartChunk(text, 'text-embedding-3-small');
+
+    assert.ok(result.chunks.length >= 1, 'Should produce chunks');
+    assert.equal(detectCodeLanguage(text), null, 'Plain text should not be detected as code');
+  });
+
+  it('passes Markdown prose to chunkDocument', () => {
+    const md = '# Title\n\nThis is a paragraph.\n\n## Section\n\nAnother paragraph here.\n'.repeat(20);
+
+    const result = smartChunk(md, 'text-embedding-3-small');
+
+    assert.ok(result.chunks.length >= 1, 'Should produce chunks');
+    assert.equal(detectCodeLanguage(md), null, 'Markdown should not be detected as code');
+  });
+});
+
+// ============================================================================
+// TypeScript interface chunking
+// ============================================================================
+
+describe('TypeScript interfaces and types', () => {
+
+  it('smartChunk: TypeScript interface stays balanced (via smartChunk, not direct astChunk)', () => {
+    // Note: tree-sitter-javascript cannot fully parse TS interface declarations as one unit.
+    // When astChunk falls back to chunkDocument for an oversized TS interface,
+    // it may produce multiple chunks. smartChunk avoids this by using a large
+    // enough maxChunkSize that the whole interface fits in one chunk.
+    const code = `interface UserProfile {
+  id: string;
+  name: string;
+  email: string;
+  createdAt: Date;
+  metadata?: Record<string, unknown>;
+}`;
+
+    const result = smartChunk(code, 'text-embedding-3-small');
+    // The interface declaration should produce at least one chunk
+    assert.ok(result.chunks.length >= 1, 'Should produce at least one chunk');
+    assertBraceBalanced(result.chunks, 'smartChunk TS interface');
+    assert.ok(result.chunks.some(c => c.includes('interface UserProfile')), 'interface should be present');
+  });
+
+  it('smartChunk on TypeScript stays balanced', () => {
+    const code = `type UserID = string;
+
+interface Config {
+  apiKey: string;
+  timeout: number;
+}
+
+function getConfig(): Config {
+  return { apiKey: process.env.KEY, timeout: 5000 };
+}`;
+
+    const result = smartChunk(code, 'text-embedding-3-small');
+    assertBraceBalanced(result.chunks, 'smartChunk TS');
+  });
+});