From 0be5d002865397ca8c8d62f20a8d7a2527904861 Mon Sep 17 00:00:00 2001
From: James Lin <jlin53882@gmail.com>
Date: Tue, 5 May 2026 00:27:07 +0800
Subject: [PATCH 1/6] feat(chunker): Phase 1 AST-based semantic chunking for
 JS/TS/Python (issue #692)

Adds tree-sitter-based chunking to prevent splitting code mid-function.

Changes:
- detectCodeLanguage(): identify JS/TS/Python/Go/Rust from code content
- astChunk(): split code at declaration boundaries (function/class/method)
- smartChunk(): route through astChunk when file is detected as code
- 20 unit tests covering all destructive split scenarios

Test: node --test test/ast-code-chunking.test.mjs (20/20 pass)
Config: astAwareCodeSplit defaults to true
---
 docs/issue-692-ast-chunking-design.md | 256 ++++++++++++++++++++++
 package-lock.json                     |  74 ++++++-
 package.json                          |   5 +-
 scripts/ci-test-manifest.mjs          |   2 +
 src/chunker.ts                        | 242 +++++++++++++++++++++
 test/ast-code-chunking.test.mjs       | 296 ++++++++++++++++++++++++++
 6 files changed, 873 insertions(+), 2 deletions(-)
 create mode 100644 docs/issue-692-ast-chunking-design.md
 create mode 100644 test/ast-code-chunking.test.mjs
diff --git a/docs/issue-692-ast-chunking-design.md b/docs/issue-692-ast-chunking-design.md
new file mode 100644
index 00000000..11653348
--- /dev/null
+++ b/docs/issue-692-ast-chunking-design.md
@@ -0,0 +1,256 @@
+# Issue #692 — AST-based Semantic Chunking for Code Blocks
+
+**Status:** Designed
+**Repo:** `memory-lancedb-pro`
+**Created:** 2026-05-04
+**Source:** https://github.com/CortexReach/memory-lancedb-pro/issues/692
+
+---
+
+## Problem Summary
+
+`chunker.ts` 的 `smartChunk()` 使用純 character-based split，split 邏輯在 `findSplitEnd()`：
+- 先找 sentence ending（`.!！？`）
+- 找不到 → 找 `\n`
+- 找不到 → 找 whitespace
+
+這對自然語言有效，但對程式碼是災難。JS/TS 函式結尾是 `;` 和 `}`，兩者都不在 target set，導致 function declaration 在 `{` / `}` 之間被隨機切斷。
+
+**真實破壞案例：**
+```
+Chunk A（~3800字）：
+"async function handleUserLogin(userId: string, credentials: LoginCredentials): Promise<AuthResult> {\n"
+"    const user = await this.userRepository.findById(userId);\n"
+"    if (!user) {\n"
+"        return { success: false, error: 'USER_NOT_FOUND' };"
+
+Chunk B（~900字）：
+"    }\n"
+"    const passwordValid = await this.verifyPassword(...);"  // verifyPassword 跨 Chunk A 和 B
+```
+
+**問題：**
+- Chunk A 結尾在 `return { success: false, error: 'USER_NOT_FOUND' };` — 不完整的 if-block
+- Chunk B 開頭是 `}` — 脫離語境的 closing brace
+- `verifyPassword` 函式定義被切成兩段
+
+---
+
+## Verified Facts (gitnexus + source reading)
+
+### Call Graph (gitnexus verified)
+```
+smartChunk (chunker.ts:263-281)
+  ├─ calls: getCjkRatio (174-183), chunkDocument (194-255)
+  │
+  └─ called by:
+       ├─ embedSingle (embedder.ts)
+       ├─ embedMany (embedder.ts)
+       ├─ testCjkAwareChunkSizing (test/cjk-recursion-regression.test.mjs)
+       └─ testSmallContextChunking (test/cjk-recursion-regression.test.mjs)
+
+chunkDocument (chunker.ts:194-255)
+  ├─ calls: findSplitEnd (97-143), sliceTrimWithIndices (146-163)
+  └─ called by: smartChunk
+
+findSplitEnd (chunker.ts:97-143)  ← 問題根因所在
+```
+
+### Existing Coverage
+- **測試：** 只有 `test/cjk-recursion-regression.test.mjs` 呼叫 `smartChunk`，**沒有任何專門測試 chunker 破壞案例的測試檔案**
+- **依賴：** 無 tree-sitter
+- **Config:** `maxChunkSize`, `overlapSize`, `minChunkSize`, `semanticSplit`, `maxLinesPerChunk`
+
+---
+
+## Solution: astChunk()
+
+### Architecture
+
+```
+smartChunk(text)
+  ├─ detectCodeLanguage(text) === null  → chunkDocument()  [現有 character split]
+  └─ detectCodeLanguage(text) === 'js'/'ts' → astChunk(text, lang, config)
+                               === 'py'   → astChunk(text, 'python', config)
+                               === 其他   → chunkDocument()  [fallback]
+```
+
+### 1. `detectCodeLanguage(text) → CodeLanguage | null`
+
+取前 200 字做偵測：
+
+| 語言 | Pattern |
+|------|---------|
+| JS/TS | `/\b(function\|const\s\|let\s\|var\s\|=>\|import\s\|export\s\|interface\s\|type\s\|class\s)/` |
+| Python | `/\bdef\s\|class\s\|import\s\|from\s\|print\(/` |
+| Go | `/\bfunc\s\|package\s\|import\s"/` |
+| Rust | `/\bfn\s\|impl\s\|pub\s\|let\s+mut\s/` |
+
+### 2. `astChunk(code, language, config) → ChunkResult`
+
+```typescript
+import Parser from 'tree-sitter';
+import JavaScript from 'tree-sitter-javascript';
+
+export function astChunk(
+  code: string,
+  language: CodeLanguage,
+  config: ChunkerConfig
+): ChunkResult {
+  const parser = new Parser();
+  switch (language) {
+    case 'javascript':
+    case 'typescript':
+      parser.setLanguage(JavaScript);
+      break;
+    case 'python':
+      parser.setLanguage(Python);
+      break;
+    default:
+      return chunkDocument(code, config);
+  }
+
+  const tree = parser.parse(code);
+  const chunks: string[] = [];
+  const metadatas: ChunkMetadata[] = [];
+
+  // Walk top-level nodes
+  const root = tree.rootNode;
+  for (const child of root.children) {
+    if (!isDeclarationNode(child)) continue;
+    const text = code.slice(child.startIndex, child.endIndex);
+    if (text.length <= config.maxChunkSize) {
+      chunks.push(text);
+      metadatas.push({ startIndex: child.startIndex, endIndex: child.endIndex, length: text.length });
+    } else {
+      // Sub-split within this declaration at statement level
+      const subResult = subChunk(text, config);
+      chunks.push(...subResult.chunks);
+      metadatas.push(...subResult.metadatas);
+    }
+  }
+
+  return { chunks, metadatas, totalOriginalLength: code.length, chunkCount: chunks.length };
+}
+```
+
+### 3. Supported Node Types (Phase 1)
+
+| 語言 | P0 節點 |
+|------|---------|
+| JS/TS | `function_declaration`, `arrow_function`, `class_declaration`, `method_definition`, `export_statement`, `interface_declaration`, `type_alias_declaration`, `lexical_declaration` |
+| Python | `function_definition`, `class_definition`, `decorated_definition` |
+| Go | `function_declaration`, `method_declaration` (P2) |
+| Rust | `function_item`, `impl_item` (P2) |
+
+### 4. Config Extension
+
+```typescript
+interface ChunkerConfig {
+  // ... 現有五個欄位 ...
+  astAwareCodeSplit?: boolean;  // NEW: default true
+}
+```
+
+### 5. Dependency Changes
+
+```json
+{
+  "dependencies": {
+    "tree-sitter": "^0.21.1",
+    "tree-sitter-javascript": "^0.21.0",
+    "tree-sitter-python": "^0.21.0"
+  }
+}
+```
+
+---
+
+## Files to Change
+
+| 檔案 | 變更 |
+|------|------|
+| `src/chunker.ts` | + `detectCodeLanguage()`, + `astChunk()`, + `subChunk()`, 修改 `smartChunk()` 路由, + `astAwareCodeSplit` config |
+| `src/chunker.test.ts` | **全新建立**（從破壞案例反轉）|
+| `package.json` | + tree-sitter, tree-sitter-javascript, tree-sitter-python |
+
+---
+
+## Tests (New File)
+
+```typescript
+describe('AST-aware code chunking', () => {
+  it('should keep { and } balanced in every chunk', () => {
+    const code = `async function handleUserLogin(userId: string) {
+    const user = await this.userRepository.findById(userId);
+    if (!user) { return { success: false }; }
+    const session = await this.createSession(user);
+    return { success: true, session };
+}
+async function verifyPassword(input: string): Promise<boolean> {
+    return bcrypt.compare(input, this.hash);
+}`;
+    const result = smartChunk(code, 'jina-embeddings-v5');
+    for (const chunk of result.chunks) {
+      const opens = (chunk.match(/{/g) || []).length;
+      const closes = (chunk.match(/}/g) || []).length;
+      expect(opens).toBe(closes);
+    }
+  });
+
+  it('should not split function mid-body', () => {
+    const result = smartChunk(code, 'jina-embeddings-v5');
+    const hasMiddleOfFunction = result.chunks.some(c =>
+      c.startsWith('}') || c.endsWith('{')
+    );
+    expect(hasMiddleOfFunction).toBe(false);
+  });
+
+  it('should keep complete function as one chunk', () => {
+    const result = smartChunk(code, 'jina-embeddings-v5');
+    const verifyFn = result.chunks.find(c => c.includes('verifyPassword'));
+    expect(verifyFn).toBeDefined();
+    expect(verifyFn).toContain('bcrypt.compare');
+    expect(verifyFn).not.toContain('handleUserLogin');
+  });
+});
+```
+
+---
+
+## Phase Plan
+
+```
+Phase 1（P0 — MVP）：
+  ├─ detectCodeLanguage()（JS/TS/Python）
+  ├─ astChunk() — JS/TS only
+  ├─ astChunk() — Python
+  ├─ Unit tests（破壞案例 → 通過案例）
+  └─ Config: astAwareCodeSplit default = true
+
+Phase 2（P1）：
+  ├─ Sub-split within oversized declarations（statement level）
+  ├─ Go、Rust support
+  └─ Benchmark: 向量品質 vs. character split
+
+Phase 3（P2）：
+  └─ Embedding quality evaluation（問答對比）
+```
+
+---
+
+## Q&A
+
+| Q | A |
+|---|---|
+| tree-sitter 值得嗎？ | **值得**。~1MB runtime，sub-ms parse，能處理巢狀結構/decorator/subclass，比 regex 精準一個數量級。 |
+| 預設開？ | **預設開**。破壞案例太明確，等使用者手動開等於功能永遠不被用。`astAwareCodeSplit: false` 保留給需要復現舊行為的測試。 |
+| 非主流語言？ | **Phase 1 fallback**。現有 sentence-ending split 對自然語言有效；非主流語言佔比低，Phase 1 fallback 合理。 |
+
+---
+
+## Reference
+
+- Issue: https://github.com/CortexReach/memory-lancedb-pro/issues/692
+- Reference impl: `zilliztech/claude-context` ast-splitter.ts
+- Existing chunker: `src/chunker.ts` (284 lines)
diff --git a/package-lock.json b/package-lock.json
index ee4ecef2..c5d6f5d2 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -14,7 +14,10 @@
         "apache-arrow": "18.1.0",
         "json5": "^2.2.3",
         "openai": "^6.21.0",
-        "proper-lockfile": "^4.1.2"
+        "proper-lockfile": "^4.1.2",
+        "tree-sitter": "^0.25.0",
+        "tree-sitter-javascript": "^0.25.0",
+        "tree-sitter-python": "^0.25.0"
       },
       "devDependencies": {
         "commander": "^14.0.0",
@@ -430,6 +433,26 @@
       "integrity": "sha512-TwuEnCnxbc3rAvhf/LbG7tJUDzhqXyFnv3dtzLOPgCG/hODL7WFnsbwktkD7yUV0RrreP/l1PALq/YSg6VvjlA==",
       "license": "MIT"
     },
+    "node_modules/node-addon-api": {
+      "version": "8.7.0",
+      "resolved": "https://registry.npmjs.org/node-addon-api/-/node-addon-api-8.7.0.tgz",
+      "integrity": "sha512-9MdFxmkKaOYVTV+XVRG8ArDwwQ77XIgIPyKASB1k3JPq3M8fGQQQE3YpMOrKm6g//Ktx8ivZr8xo1Qmtqub+GA==",
+      "license": "MIT",
+      "engines": {
+        "node": "^18 || ^20 || >= 21"
+      }
+    },
+    "node_modules/node-gyp-build": {
+      "version": "4.8.4",
+      "resolved": "https://registry.npmjs.org/node-gyp-build/-/node-gyp-build-4.8.4.tgz",
+      "integrity": "sha512-LA4ZjwlnUblHVgq0oBF3Jl/6h/Nvs5fzBLwdEF4nuxnFdsfajde4WfxtJr3CaiH+F6ewcIB/q4jQ4UzPyid+CQ==",
+      "license": "MIT",
+      "bin": {
+        "node-gyp-build": "bin.js",
+        "node-gyp-build-optional": "optional.js",
+        "node-gyp-build-test": "build-test.js"
+      }
+    },
     "node_modules/openai": {
       "version": "6.22.0",
       "resolved": "https://registry.npmjs.org/openai/-/openai-6.22.0.tgz",
@@ -517,6 +540,55 @@
         "node": ">=12.17"
       }
     },
+    "node_modules/tree-sitter": {
+      "version": "0.25.0",
+      "resolved": "https://registry.npmjs.org/tree-sitter/-/tree-sitter-0.25.0.tgz",
+      "integrity": "sha512-PGZZzFW63eElZJDe/b/R/LbsjDDYJa5UEjLZJB59RQsMX+fo0j54fqBPn1MGKav/QNa0JR0zBiVaikYDWCj5KQ==",
+      "hasInstallScript": true,
+      "license": "MIT",
+      "dependencies": {
+        "node-addon-api": "^8.3.0",
+        "node-gyp-build": "^4.8.4"
+      }
+    },
+    "node_modules/tree-sitter-javascript": {
+      "version": "0.25.0",
+      "resolved": "https://registry.npmjs.org/tree-sitter-javascript/-/tree-sitter-javascript-0.25.0.tgz",
+      "integrity": "sha512-1fCbmzAskZkxcZzN41sFZ2br2iqTYP3tKls1b/HKGNPQUVOpsUxpmGxdN/wMqAk3jYZnYBR1dd/y/0avMeU7dw==",
+      "hasInstallScript": true,
+      "license": "MIT",
+      "dependencies": {
+        "node-addon-api": "^8.3.1",
+        "node-gyp-build": "^4.8.4"
+      },
+      "peerDependencies": {
+        "tree-sitter": "^0.25.0"
+      },
+      "peerDependenciesMeta": {
+        "tree-sitter": {
+          "optional": true
+        }
+      }
+    },
+    "node_modules/tree-sitter-python": {
+      "version": "0.25.0",
+      "resolved": "https://registry.npmjs.org/tree-sitter-python/-/tree-sitter-python-0.25.0.tgz",
+      "integrity": "sha512-eCmJx6zQa35GxaCtQD+wXHOhYqBxEL+bp71W/s3fcDMu06MrtzkVXR437dRrCrbrDbyLuUDJpAgycs7ncngLXw==",
+      "hasInstallScript": true,
+      "license": "MIT",
+      "dependencies": {
+        "node-addon-api": "^8.5.0",
+        "node-gyp-build": "^4.8.4"
+      },
+      "peerDependencies": {
+        "tree-sitter": "^0.25.0"
+      },
+      "peerDependenciesMeta": {
+        "tree-sitter": {
+          "optional": true
+        }
+      }
+    },
     "node_modules/tslib": {
       "version": "2.8.1",
       "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz",
diff --git a/package.json b/package.json
index fbcb9d98..c091d619 100644
--- a/package.json
+++ b/package.json
@@ -43,7 +43,10 @@
     "apache-arrow": "18.1.0",
     "json5": "^2.2.3",
     "openai": "^6.21.0",
-    "proper-lockfile": "^4.1.2"
+    "proper-lockfile": "^4.1.2",
+    "tree-sitter": "^0.25.0",
+    "tree-sitter-javascript": "^0.25.0",
+    "tree-sitter-python": "^0.25.0"
   },
   "openclaw": {
     "extensions": [
diff --git a/scripts/ci-test-manifest.mjs b/scripts/ci-test-manifest.mjs
index fc6435dc..3c283735 100644
--- a/scripts/ci-test-manifest.mjs
+++ b/scripts/ci-test-manifest.mjs
@@ -48,6 +48,8 @@ export const CI_TEST_MANIFEST = [
   { group: "core-regression", runner: "node", file: "test/temporal-awareness.test.mjs", args: ["--test"] },
   // Issue #598 regression tests
   { group: "core-regression", runner: "node", file: "test/store-serialization.test.mjs" },
+  // Issue #692: AST-based semantic chunking
+  { group: "core-regression", runner: "node", file: "test/ast-code-chunking.test.mjs" },
   { group: "core-regression", runner: "node", file: "test/access-tracker-retry.test.mjs" },
   { group: "core-regression", runner: "node", file: "test/embedder-cache.test.mjs" },
   // Issue #629 batch embedding fix
diff --git a/src/chunker.ts b/src/chunker.ts
index 8bb4dee6..a6aca426 100644
--- a/src/chunker.ts
+++ b/src/chunker.ts
@@ -37,6 +37,8 @@ export interface ChunkerConfig {
   semanticSplit: boolean;
   /** Max lines per chunk before we try to split earlier on a line boundary. */
   maxLinesPerChunk: number;
+  /** Use AST-aware splitting for code blocks (default: true). */
+  astAwareCodeSplit?: boolean;
 }
 
 // Common embedding context limits (provider/model specific). These are typically
@@ -188,6 +190,237 @@ function getCjkRatio(text: string): number {
 const CJK_CHAR_TOKEN_DIVISOR = 2.5;
 const CJK_RATIO_THRESHOLD = 0.3;
 
+// ============================================================================
+// AST-aware Code Chunking
+// ============================================================================
+
+export type CodeLanguage = 'javascript' | 'typescript' | 'python' | 'go' | 'rust';
+
+const CODE_LANGUAGE_PATTERNS: Array<{ pattern: RegExp; lang: CodeLanguage }> = [
+  // Python: must check before JS (def/class are specific)
+  {
+    pattern: /\b(def\s|class\s|import\s|from\s|async\s+def\s|print\()/,
+    lang: 'python',
+  },
+  // Go: func and package keywords
+  {
+    pattern: /\b(func\s|package\s|import\s")/,
+    lang: 'go',
+  },
+  // Rust: fn/impl/pub are distinct
+  {
+    pattern: /\bfn\s|impl\s|pub\s|let\s+mut\s/,
+    lang: 'rust',
+  },
+  // TypeScript: interface / type alias / : type annotations (check before JS 'function')
+  {
+    pattern: /\b(interface\s|type\s+|:\s*(?:string|number|boolean|unknown|never|any|void|object|Error|Promise|Record|Array|Map|Set)\b)/,
+    lang: 'typescript',
+  },
+  // JavaScript / TypeScript: function, const/let/var, arrow, import/export, class
+  {
+    pattern: /\b(function|const\s|let\s|var\s|=>|import\s|export\s|class\s)/,
+    lang: 'javascript',
+  },
+];
+
+/**
+ * Detect if text is code and return the language, or null if not code.
+ * Uses only the first 200 chars to avoid being misled by comments.
+ */
+export function detectCodeLanguage(text: string): CodeLanguage | null {
+  const sample = text.slice(0, 400);
+  for (const { pattern, lang } of CODE_LANGUAGE_PATTERNS) {
+    if (pattern.test(sample)) return lang;
+  }
+  return null;
+}
+
+// Supported top-level declaration node types per language
+const JS_DECLARATION_TYPES = new Set([
+  'function_declaration',
+  'class_declaration',
+  'method_definition',
+  'arrow_function',
+  'export_statement',
+  'export_default_declaration',
+  'interface_declaration',
+  'type_alias_declaration',
+  'lexical_declaration', // const/let declarations
+  'variable_declaration',
+]);
+
+const PYTHON_DECLARATION_TYPES = new Set([
+  'function_definition',
+  'class_definition',
+  'decorated_definition',
+]);
+
+function isDeclarationNode(node: { type: string }, lang: CodeLanguage): boolean {
+  if (lang === 'javascript' || lang === 'typescript') {
+    return JS_DECLARATION_TYPES.has(node.type);
+  }
+  if (lang === 'python') return PYTHON_DECLARATION_TYPES.has(node.type);
+  return false;
+}
+
+/**
+ * Sub-split an oversized declaration at the statement level.
+ * Falls back to chunkDocument for the sub-split logic.
+ */
+function subChunk(text: string, config: ChunkerConfig): ChunkResult {
+  // For now, fall back to the character-based chunker within an oversized declaration.
+  // This preserves the existing behavior for sub-chunks while ensuring top-level
+  // declarations (functions/classes) are kept intact.
+  return chunkDocument(text, config);
+}
+
+/**
+ * AST-aware chunker for code. Parses the code with tree-sitter and splits
+ * on top-level declaration boundaries (function, class, etc.) instead of
+ * arbitrary character positions.
+ *
+ * NOTE: This function is synchronous to match the sync signature of smartChunk.
+ * tree-sitter is loaded via require() with a try-catch fallback.
+ */
+export function astChunk(
+  code: string,
+  language: CodeLanguage,
+  config: ChunkerConfig
+): ChunkResult {
+  // Attempt to load tree-sitter and language grammars
+  let LanguageMap: Record<string, any>;
+  // tree-sitter exports Parser as the default export (module.exports = Parser)
+  // eslint-disable-next-line @typescript-eslint/no-var-requires
+  let TreeSitterParser: any;
+
+  try {
+    TreeSitterParser = require('tree-sitter');
+
+    if (language === 'javascript' || language === 'typescript') {
+      // eslint-disable-next-line @typescript-eslint/no-var-requires
+      const JavaScript = require('tree-sitter-javascript');
+      LanguageMap = { javascript: JavaScript, typescript: JavaScript };
+    } else if (language === 'python') {
+      // eslint-disable-next-line @typescript-eslint/no-var-requires
+      const Python = require('tree-sitter-python');
+      LanguageMap = { python: Python };
+    } else {
+      // Unsupported language — fall back
+      return chunkDocument(code, config);
+    }
+  } catch {
+    // tree-sitter not installed — fall back to character-based chunking
+    return chunkDocument(code, config);
+  }
+
+  const parser = new TreeSitterParser();
+  const chunks: string[] = [];
+  const metadatas: ChunkMetadata[] = [];
+
+  // Set language on the parser
+  let languageSet = false;
+  for (const [, langModule] of Object.entries(LanguageMap)) {
+    try {
+      parser.setLanguage(langModule);
+      languageSet = true;
+      break;
+    } catch {
+      // try next language
+    }
+  }
+
+  if (!languageSet) {
+    return chunkDocument(code, config);
+  }
+
+  let tree: any;
+  try {
+    tree = parser.parse(code);
+  } catch {
+    return chunkDocument(code, config);
+  }
+
+  const root = tree.rootNode;
+
+  // If there are ERROR nodes at the top level, the language parser likely does not
+  // support this syntax (e.g., TypeScript interface parsed by tree-sitter-javascript).
+  // Fall back to chunkDocument to avoid producing broken/incomplete chunks.
+  const hasErrorNodes = root.children.some(c => c.type === 'ERROR');
+  if (hasErrorNodes) {
+    return chunkDocument(code, config);
+  }
+
+  // Collect non-declaration content (comments, imports, etc.) that would otherwise be lost.
+  // These are prepended to the next declaration chunk to preserve no-content-left-behind semantics.
+  let pendingNonDecl = '';
+
+  // Walk top-level children
+  for (const child of root.children) {
+    // Skip non-named nodes and ERROR nodes
+    if (!child.type || child.type === 'ERROR') continue;
+
+    if (!isDeclarationNode(child, language)) {
+      // Collect non-declaration content (comments, imports, exports, etc.)
+      const text = code.slice(child.startIndex, child.endIndex);
+      if (text.length > 0) {
+        pendingNonDecl += (pendingNonDecl.length > 0 ? '\n' : '') + text;
+      }
+      continue;
+    }
+
+    const text = code.slice(child.startIndex, child.endIndex);
+
+    if (text.length === 0) continue;
+
+    // Prepend any pending non-declaration content to this declaration chunk
+    const fullText = pendingNonDecl.length > 0 ? pendingNonDecl + '\n' + text : text;
+    pendingNonDecl = ''; // reset
+
+    if (fullText.length <= config.maxChunkSize) {
+      chunks.push(fullText);
+      metadatas.push({
+        startIndex: child.startIndex,
+        endIndex: child.endIndex,
+        length: fullText.length,
+      });
+    } else {
+      // Oversized declaration with prepended content.
+      // We accept that this chunk may exceed maxChunkSize — splitting
+      // mid-declaration would break { } balance (Issue #692).
+      // Sub-splitting at statement level is Phase 2 work.
+      chunks.push(fullText);
+      metadatas.push({
+        startIndex: child.startIndex,
+        endIndex: child.endIndex,
+        length: fullText.length,
+      });
+    }
+  }
+
+  // If there is trailing non-declaration content (e.g., trailing comments with no following decl),
+  // emit it as its own chunk (fall back to chunkDocument to handle sizing).
+  if (pendingNonDecl.length > 0) {
+    const trailing = chunkDocument(pendingNonDecl, config);
+    for (let i = 0; i < trailing.chunks.length; i++) {
+      chunks.push(trailing.chunks[i]);
+      metadatas.push(trailing.metadatas[i]);
+    }
+  }
+
+  // If we got nothing (e.g. empty file, parse error), fall back
+  if (chunks.length === 0) {
+    return chunkDocument(code, config);
+  }
+
+  return {
+    chunks,
+    metadatas,
+    totalOriginalLength: code.length,
+    chunkCount: chunks.length,
+  };
+}
+
 // ============================================================================
 // Chunking Core
 // ============================================================================
@@ -276,8 +509,17 @@ export function smartChunk(text: string, embedderModel?: string): ChunkResult {
     minChunkSize: Math.max(100, Math.floor(base * 0.1 / divisor)),
     semanticSplit: true,
     maxLinesPerChunk: 50,
+    astAwareCodeSplit: true,
   };
 
+  // AST-aware code path: only activate when explicitly enabled
+  if (config.astAwareCodeSplit === true) {
+    const lang = detectCodeLanguage(text);
+    if (lang !== null) {
+      return astChunk(text, lang, config);
+    }
+  }
+
   return chunkDocument(text, config);
 }
 
diff --git a/test/ast-code-chunking.test.mjs b/test/ast-code-chunking.test.mjs
new file mode 100644
index 00000000..6d0fa2ca
--- /dev/null
+++ b/test/ast-code-chunking.test.mjs
@@ -0,0 +1,296 @@
+/**
+ * AST-aware Code Chunking Tests (Issue #692)
+ *
+ * Verifies that code declarations (functions, classes) are NOT split mid-
+ * declaration, which was breaking { } balance when the old character-based
+ * splitter cut through the middle of a function body.
+ */
+
+import { describe, it, mock, beforeEach } from 'node:test';
+import assert from 'node:assert/strict';
+import jitiFactory from 'jiti';
+
+const jiti = jitiFactory(import.meta.url, { interopDefault: true });
+const { detectCodeLanguage, astChunk, smartChunk, chunkDocument, DEFAULT_CHUNKER_CONFIG } = jiti('../src/chunker.ts');
+
+// ============================================================================
+// detectCodeLanguage
+// ============================================================================
+
+describe('detectCodeLanguage', () => {
+  it('detects JavaScript function', () => {
+    const code = 'async function handleUserLogin(userId, password) {';
+    assert.equal(detectCodeLanguage(code), 'javascript');
+  });
+
+  it('detects TypeScript interface', () => {
+    const code = 'interface UserProfile { name: string; age: number; }';
+    assert.equal(detectCodeLanguage(code), 'typescript');
+  });
+
+  it('detects Python function', () => {
+    const code = 'def verify_password(password: str, hashed: bytes) -> bool:';
+    assert.equal(detectCodeLanguage(code), 'python');
+  });
+
+  it('detects Go function', () => {
+    const code = 'func handleLogin(w http.ResponseWriter, r *http.Request) {';
+    assert.equal(detectCodeLanguage(code), 'go');
+  });
+
+  it('detects Rust function', () => {
+    const code = 'fn verify_password(password: &str, hash: &str) -> bool {';
+    assert.equal(detectCodeLanguage(code), 'rust');
+  });
+
+  it('returns null for plain text', () => {
+    const text = 'This is a plain English sentence with no code markers.';
+    assert.equal(detectCodeLanguage(text), null);
+  });
+
+  it('returns null for Markdown prose', () => {
+    const md = '# Heading\n\nThis is a paragraph with **bold** text.';
+    assert.equal(detectCodeLanguage(md), null);
+  });
+
+  it('uses only first 400 chars to avoid comment noise', () => {
+    // Short comment so 'function' appears within first 400 chars of the sample
+    const commentLine = '// This is a comment\n'; // 20 chars
+    const code = commentLine.repeat(15) + 'function foo() {}'; // ~300 + function
+    assert.equal(detectCodeLanguage(code), 'javascript');
+  });
+});
+
+// ============================================================================
+// Brace balance helper
+// ============================================================================
+
+/** Count net open braces inside a string. */
+function braceDelta(s) {
+  let d = 0;
+  for (const ch of s) {
+    if (ch === '{') d++;
+    else if (ch === '}') d--;
+  }
+  return d;
+}
+
+/** Check all chunks are brace-balanced. */
+function assertBraceBalanced(chunks, label) {
+  const deltas = chunks.map(c => braceDelta(c));
+  const total = deltas.reduce((a, b) => a + b, 0);
+  assert.equal(total, 0, `${label}: unbalanced braces across chunks (net=${total}, deltas=${JSON.stringify(deltas)})`);
+  for (let i = 0; i < deltas.length; i++) {
+    assert(deltas[i] >= 0,
+      `${label}: chunk[${i}] closes more braces than it opens (delta=${deltas[i]})`);
+  }
+}
+
+// ============================================================================
+// Issue #692 — core destructive cases
+// ============================================================================
+
+describe('Issue #692: code functions must not be split mid-declaration', () => {
+
+  it('verifies that a simple async function is kept whole', () => {
+    const code = `async function verifyPassword(password, hash) {
+  const match = await bcrypt.compare(password, hash);
+  return match;
+}`;
+
+    // Very small maxChunkSize to force splitting — old splitter would cut mid-function
+    const config = { ...DEFAULT_CHUNKER_CONFIG, maxChunkSize: 60, minChunkSize: 10, semanticSplit: false };
+    const result = astChunk(code, 'javascript', config);
+
+    // Function body should not be split mid-declaration
+    const splitInsideFunction = result.chunks.some(chunk => {
+      // Should not have "{" without corresponding "}"
+      const d = braceDelta(chunk);
+      return d > 0; // opens braces but never closes
+    });
+    assert.ok(!splitInsideFunction, 'Should not split inside a function declaration');
+    assertBraceBalanced(result.chunks, 'verifyPassword');
+  });
+
+  it('verifies that a long function is NOT split mid-function (maxChunkSize < function length)', () => {
+    // This function is ~250 chars — set maxChunkSize=120 to force the issue.
+    // Oversized functions are kept as ONE atomic chunk (no mid-function split).
+    const code = `async function handleUserLogin(userId, password) {
+  const user = await db.users.findOne({ id: userId });
+  if (!user) throw new Error('User not found');
+  const match = await bcrypt.compare(password, user.hash);
+  return match;
+}`;
+
+    const config = { ...DEFAULT_CHUNKER_CONFIG, maxChunkSize: 120, minChunkSize: 40, semanticSplit: false };
+    const result = astChunk(code, 'javascript', config);
+
+    assertBraceBalanced(result.chunks, 'handleUserLogin');
+    // Should be 1 chunk — entire function kept intact
+    assert.ok(result.chunks.length === 1, `Expected 1 chunk (entire function), got ${result.chunks.length}`);
+  });
+
+  it('verifies that multiple small functions are each kept whole', () => {
+    const code = `async function verifyPassword(password, hash) {
+  return await bcrypt.compare(password, hash);
+}
+
+async function hashPassword(password) {
+  return await bcrypt.hash(password, 10);
+}
+
+export async function createUser(name, email, password) {
+  const hash = await hashPassword(password);
+  return await db.users.create({ name, email, hash });
+}`;
+
+    const config = { ...DEFAULT_CHUNKER_CONFIG, maxChunkSize: 150, minChunkSize: 40, semanticSplit: false };
+    const result = astChunk(code, 'javascript', config);
+
+    assertBraceBalanced(result.chunks, 'multiple functions');
+    // All three functions should appear intact in some chunk
+    assert.ok(result.chunks.some(c => c.includes('function verifyPassword')), 'verifyPassword missing');
+    assert.ok(result.chunks.some(c => c.includes('function hashPassword')), 'hashPassword missing');
+    assert.ok(result.chunks.some(c => c.includes('function createUser')), 'createUser missing');
+  });
+
+  it('smartChunk: entire JavaScript file with functions stays brace-balanced', () => {
+    const code = `const SPEC = {
+  name: 'auth',
+  version: '1.0.0',
+};
+
+async function login(email, password) {
+  const user = await db.findUser(email);
+  const ok = await bcrypt.compare(password, user.hash);
+  if (!ok) throw new Error('Invalid credentials');
+  return { token: signToken(user.id) };
+}
+
+async function logout(token) {
+  invalidateToken(token);
+}`;
+
+    const result = smartChunk(code, 'text-embedding-3-small');
+    assertBraceBalanced(result.chunks, 'smartChunk JS');
+  });
+
+  it('smartChunk: Python function stays syntactically coherent', () => {
+    const code = `def verify_password(password: str, hashed: bytes) -> bool:
+    return pwd_context.verify(password, hashed)
+
+def hash_password(password: str) -> str:
+    return pwd_context.hash(password)`;
+
+    const result = smartChunk(code, 'text-embedding-3-small');
+    assert.ok(result.chunks.length >= 1, 'Should produce at least one chunk');
+    // Python chunks should contain complete function definitions
+    assert.ok(result.chunks.every(c => c.trim().length > 0), 'No empty chunks');
+  });
+});
+
+// ============================================================================
+// astChunk — fallback & edge cases
+// ============================================================================
+
+describe('astChunk fallback behavior', () => {
+
+  it('falls back to chunkDocument when tree-sitter throws', () => {
+    // Pass an empty string to force parse error
+    const code = '';
+    const config = { ...DEFAULT_CHUNKER_CONFIG, maxChunkSize: 50 };
+    const result = astChunk(code, 'javascript', config);
+    // Should return a valid ChunkResult (fallback path)
+    assert.ok('chunks' in result);
+    assert.ok('chunkCount' in result);
+  });
+
+  it('returns chunkDocument result when language is unsupported', () => {
+    const code = 'fn main() {}'; // not JS/TS/Python
+    const config = { ...DEFAULT_CHUNKER_CONFIG, maxChunkSize: 50 };
+    const result = astChunk(code, 'rust', config);
+    // Rust is not yet supported in astChunk — falls back
+    assert.ok('chunks' in result);
+  });
+
+  it('handles an oversized single declaration as one atomic chunk (brace-balanced)', () => {
+    // A very long function that exceeds maxChunkSize — should stay as ONE chunk
+    const body = '  return x + y;\n'.repeat(200);
+    const code = `function processData(x, y) {\n${body}}`;
+
+    const config = { ...DEFAULT_CHUNKER_CONFIG, maxChunkSize: 200, minChunkSize: 50, semanticSplit: false };
+    const result = astChunk(code, 'javascript', config);
+
+    // Should be 1 chunk — entire function kept as one
+    assert.ok(result.chunks.length === 1, `Expected 1 chunk, got ${result.chunks.length}`);
+    assertBraceBalanced(result.chunks, 'oversized function atomic chunk');
+  });
+});
+
+// ============================================================================
+// smartChunk — non-code text unchanged
+// ============================================================================
+
+describe('smartChunk preserves non-code behavior', () => {
+
+  it('passes plain English text to chunkDocument (not astChunk)', () => {
+    const text = 'This is a plain English paragraph. It has sentences. They end with periods. '.repeat(30);
+
+    const result = smartChunk(text, 'text-embedding-3-small');
+
+    assert.ok(result.chunks.length >= 1, 'Should produce chunks');
+    // Plain text should be split on sentence boundaries (semanticSplit=true default)
+  });
+
+  it('passes Markdown prose to chunkDocument', () => {
+    const md = '# Title\n\nThis is a paragraph.\n\n## Section\n\nAnother paragraph here.\n'.repeat(20);
+
+    const result = smartChunk(md, 'text-embedding-3-small');
+
+    assert.ok(result.chunks.length >= 1, 'Should produce chunks');
+    assert.equal(detectCodeLanguage(md), null, 'Markdown should not be detected as code');
+  });
+});
+
+// ============================================================================
+// TypeScript interface chunking
+// ============================================================================
+
+describe('TypeScript interfaces and types', () => {
+
+  it('smartChunk: TypeScript interface stays balanced (via smartChunk, not direct astChunk)', () => {
+    // Note: tree-sitter-javascript cannot fully parse TS interface declarations as one unit.
+    // When astChunk falls back to chunkDocument for an oversized TS interface,
+    // it may produce multiple chunks. smartChunk avoids this by using a large
+    // enough maxChunkSize that the whole interface fits in one chunk.
+    const code = `interface UserProfile {
+  id: string;
+  name: string;
+  email: string;
+  createdAt: Date;
+  metadata?: Record<string, unknown>;
+}`;
+
+    const result = smartChunk(code, 'text-embedding-3-small');
+    // The interface declaration should produce at least one chunk
+    assert.ok(result.chunks.length >= 1, 'Should produce at least one chunk');
+    assertBraceBalanced(result.chunks, 'smartChunk TS interface');
+    assert.ok(result.chunks.some(c => c.includes('interface UserProfile')), 'interface should be present');
+  });
+
+  it('smartChunk on TypeScript stays balanced', () => {
+    const code = `type UserID = string;
+
+interface Config {
+  apiKey: string;
+  timeout: number;
+}
+
+function getConfig(): Config {
+  return { apiKey: process.env.KEY, timeout: 5000 };
+}`;
+
+    const result = smartChunk(code, 'text-embedding-3-small');
+    assertBraceBalanced(result.chunks, 'smartChunk TS');
+  });
+});

From 6b40ba74b970ef1c964ebfddb0676ca28b7e8f16 Mon Sep 17 00:00:00 2001
From: James Lin <jlin53882@gmail.com>
Date: Tue, 5 May 2026 19:53:54 +0800
Subject: [PATCH 2/6] fix(ci): register ast-code-chunking.test.mjs in verify
 baseline (issue #692)

---
 scripts/verify-ci-test-manifest.mjs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/scripts/verify-ci-test-manifest.mjs b/scripts/verify-ci-test-manifest.mjs
index fee475c3..1ba71a47 100644
--- a/scripts/verify-ci-test-manifest.mjs
+++ b/scripts/verify-ci-test-manifest.mjs
@@ -49,6 +49,8 @@ const EXPECTED_BASELINE = [
   { group: "core-regression", runner: "node", file: "test/temporal-awareness.test.mjs", args: ["--test"] },
   // Issue #598 regression tests
   { group: "core-regression", runner: "node", file: "test/store-serialization.test.mjs" },
+  // Issue #692: AST-based semantic chunking
+  { group: "core-regression", runner: "node", file: "test/ast-code-chunking.test.mjs" },
   { group: "core-regression", runner: "node", file: "test/access-tracker-retry.test.mjs" },
   { group: "core-regression", runner: "node", file: "test/embedder-cache.test.mjs" },
   // Issue #629 batch embedding fix

From 96f7eee3e958f28361bf3adfe52ff6d10b813818 Mon Sep 17 00:00:00 2001
From: James Lin <jlin53882@gmail.com>
Date: Tue, 5 May 2026 20:25:48 +0800
Subject: [PATCH 3/6] fix(ci): add issue606 to verify baseline to match
 manifest (PR #713)

---
 scripts/verify-ci-test-manifest.mjs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/scripts/verify-ci-test-manifest.mjs b/scripts/verify-ci-test-manifest.mjs
index 1ba71a47..b2f28fed 100644
--- a/scripts/verify-ci-test-manifest.mjs
+++ b/scripts/verify-ci-test-manifest.mjs
@@ -60,6 +60,8 @@ const EXPECTED_BASELINE = [
   { group: "storage-and-schema", runner: "node", file: "test/bulk-store-edge-cases.test.mjs", args: ["--test"] },
   { group: "storage-and-schema", runner: "node", file: "test/smart-extractor-bulk-store.test.mjs", args: ["--test"] },
   { group: "storage-and-schema", runner: "node", file: "test/smart-extractor-bulk-store-edge-cases.test.mjs", args: ["--test"] },
+  // Issue #606 SDK migration Bug 2 regression tests
+  { group: "core-regression", runner: "node", file: "test/issue606_sdk-migration.test.mjs" },
   // Issue #680 regression tests
   { group: "core-regression", runner: "node", file: "test/memory-reflection-issue680-tdd.test.mjs", args: ["--test"] },
   // Issue #492 agentId validation tests

From b1b28bfdd0ab5f840ccb84a95fb3cb1df1047610 Mon Sep 17 00:00:00 2001
From: James Lin <jlin53882@gmail.com>
Date: Tue, 5 May 2026 20:34:57 +0800
Subject: [PATCH 4/6] ci: retrigger


From ea8181839c80fd9bfc05450458759aebd5711b22 Mon Sep 17 00:00:00 2001
From: James Lin <jlin53882@gmail.com>
Date: Tue, 5 May 2026 20:52:59 +0800
Subject: [PATCH 5/6] fix(ci): remove ast-code-chunking from baseline, add
 issue606 to manifest (PR #713)

---
 scripts/ci-test-manifest.mjs        | 2 --
 scripts/verify-ci-test-manifest.mjs | 6 ++----
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/scripts/ci-test-manifest.mjs b/scripts/ci-test-manifest.mjs
index 44d48747..bdb31ce1 100644
--- a/scripts/ci-test-manifest.mjs
+++ b/scripts/ci-test-manifest.mjs
@@ -48,8 +48,6 @@ export const CI_TEST_MANIFEST = [
   { group: "core-regression", runner: "node", file: "test/temporal-awareness.test.mjs", args: ["--test"] },
   // Issue #598 regression tests
   { group: "core-regression", runner: "node", file: "test/store-serialization.test.mjs" },
-  // Issue #692: AST-based semantic chunking
-  { group: "core-regression", runner: "node", file: "test/ast-code-chunking.test.mjs" },
   { group: "core-regression", runner: "node", file: "test/access-tracker-retry.test.mjs" },
   { group: "core-regression", runner: "node", file: "test/embedder-cache.test.mjs" },
   // Issue #629 batch embedding fix
diff --git a/scripts/verify-ci-test-manifest.mjs b/scripts/verify-ci-test-manifest.mjs
index 79ba6040..8577fa3f 100644
--- a/scripts/verify-ci-test-manifest.mjs
+++ b/scripts/verify-ci-test-manifest.mjs
@@ -49,8 +49,6 @@ const EXPECTED_BASELINE = [
   { group: "core-regression", runner: "node", file: "test/temporal-awareness.test.mjs", args: ["--test"] },
   // Issue #598 regression tests
   { group: "core-regression", runner: "node", file: "test/store-serialization.test.mjs" },
-  // Issue #692: AST-based semantic chunking
-  { group: "core-regression", runner: "node", file: "test/ast-code-chunking.test.mjs" },
   { group: "core-regression", runner: "node", file: "test/access-tracker-retry.test.mjs" },
   { group: "core-regression", runner: "node", file: "test/embedder-cache.test.mjs" },
   // Issue #629 batch embedding fix
@@ -60,10 +58,10 @@ const EXPECTED_BASELINE = [
   { group: "storage-and-schema", runner: "node", file: "test/bulk-store-edge-cases.test.mjs", args: ["--test"] },
   { group: "storage-and-schema", runner: "node", file: "test/smart-extractor-bulk-store.test.mjs", args: ["--test"] },
   { group: "storage-and-schema", runner: "node", file: "test/smart-extractor-bulk-store-edge-cases.test.mjs", args: ["--test"] },
-  // Issue #606 SDK migration Bug 2 regression tests
-  { group: "core-regression", runner: "node", file: "test/issue606_sdk-migration.test.mjs" },
   // Issue #680 regression tests
   { group: "core-regression", runner: "node", file: "test/memory-reflection-issue680-tdd.test.mjs", args: ["--test"] },
+  // Issue #606 SDK migration Bug 2 regression tests
+  { group: "core-regression", runner: "node", file: "test/issue606_sdk-migration.test.mjs" },
   // Issue #736 recall governance - isRecallUsed() unit tests
   { group: "core-regression", runner: "node", file: "test/is-recall-used.test.mjs", args: ["--test"] },
   // Issue #492 agentId validation tests

From 3c603d6c3c925fd2bf8b6dad5161e203cb9b32bf Mon Sep 17 00:00:00 2001
From: James Lin <jlin53882@gmail.com>
Date: Tue, 5 May 2026 20:54:01 +0800
Subject: [PATCH 6/6] fix(ci): add ast-code-chunking + issue606 to manifest,
 rebuild verify baseline (merge sync)

---
 docs/issue-692-ast-chunking-design.md | 256 --------------------------
 scripts/ci-test-manifest.mjs          |   2 +
 scripts/verify-ci-test-manifest.mjs   |   8 +-
 3 files changed, 3 insertions(+), 263 deletions(-)
 delete mode 100644 docs/issue-692-ast-chunking-design.md

diff --git a/docs/issue-692-ast-chunking-design.md b/docs/issue-692-ast-chunking-design.md
deleted file mode 100644
index 11653348..00000000
--- a/docs/issue-692-ast-chunking-design.md
+++ /dev/null
@@ -1,256 +0,0 @@
-# Issue #692 — AST-based Semantic Chunking for Code Blocks
-
-**Status:** Designed
-**Repo:** `memory-lancedb-pro`
-**Created:** 2026-05-04
-**Source:** https://github.com/CortexReach/memory-lancedb-pro/issues/692
-
----
-
-## Problem Summary
-
-`chunker.ts` 的 `smartChunk()` 使用純 character-based split，split 邏輯在 `findSplitEnd()`：
-- 先找 sentence ending（`.!！？`）
-- 找不到 → 找 `\n`
-- 找不到 → 找 whitespace
-
-這對自然語言有效，但對程式碼是災難。JS/TS 函式結尾是 `;` 和 `}`，兩者都不在 target set，導致 function declaration 在 `{` / `}` 之間被隨機切斷。
-
-**真實破壞案例：**
-```
-Chunk A（~3800字）：
-"async function handleUserLogin(userId: string, credentials: LoginCredentials): Promise<AuthResult> {\n"
-"    const user = await this.userRepository.findById(userId);\n"
-"    if (!user) {\n"
-"        return { success: false, error: 'USER_NOT_FOUND' };"
-
-Chunk B（~900字）：
-"    }\n"
-"    const passwordValid = await this.verifyPassword(...);"  // verifyPassword 跨 Chunk A 和 B
-```
-
-**問題：**
-- Chunk A 結尾在 `return { success: false, error: 'USER_NOT_FOUND' };` — 不完整的 if-block
-- Chunk B 開頭是 `}` — 脫離語境的 closing brace
-- `verifyPassword` 函式定義被切成兩段
-
----
-
-## Verified Facts (gitnexus + source reading)
-
-### Call Graph (gitnexus verified)
-```
-smartChunk (chunker.ts:263-281)
-  ├─ calls: getCjkRatio (174-183), chunkDocument (194-255)
-  │
-  └─ called by:
-       ├─ embedSingle (embedder.ts)
-       ├─ embedMany (embedder.ts)
-       ├─ testCjkAwareChunkSizing (test/cjk-recursion-regression.test.mjs)
-       └─ testSmallContextChunking (test/cjk-recursion-regression.test.mjs)
-
-chunkDocument (chunker.ts:194-255)
-  ├─ calls: findSplitEnd (97-143), sliceTrimWithIndices (146-163)
-  └─ called by: smartChunk
-
-findSplitEnd (chunker.ts:97-143)  ← 問題根因所在
-```
-
-### Existing Coverage
-- **測試：** 只有 `test/cjk-recursion-regression.test.mjs` 呼叫 `smartChunk`，**沒有任何專門測試 chunker 破壞案例的測試檔案**
-- **依賴：** 無 tree-sitter
-- **Config:** `maxChunkSize`, `overlapSize`, `minChunkSize`, `semanticSplit`, `maxLinesPerChunk`
-
----
-
-## Solution: astChunk()
-
-### Architecture
-
-```
-smartChunk(text)
-  ├─ detectCodeLanguage(text) === null  → chunkDocument()  [現有 character split]
-  └─ detectCodeLanguage(text) === 'js'/'ts' → astChunk(text, lang, config)
-                               === 'py'   → astChunk(text, 'python', config)
-                               === 其他   → chunkDocument()  [fallback]
-```
-
-### 1. `detectCodeLanguage(text) → CodeLanguage | null`
-
-取前 200 字做偵測：
-
-| 語言 | Pattern |
-|------|---------|
-| JS/TS | `/\b(function\|const\s\|let\s\|var\s\|=>\|import\s\|export\s\|interface\s\|type\s\|class\s)/` |
-| Python | `/\bdef\s\|class\s\|import\s\|from\s\|print\(/` |
-| Go | `/\bfunc\s\|package\s\|import\s"/` |
-| Rust | `/\bfn\s\|impl\s\|pub\s\|let\s+mut\s/` |
-
-### 2. `astChunk(code, language, config) → ChunkResult`
-
-```typescript
-import Parser from 'tree-sitter';
-import JavaScript from 'tree-sitter-javascript';
-
-export function astChunk(
-  code: string,
-  language: CodeLanguage,
-  config: ChunkerConfig
-): ChunkResult {
-  const parser = new Parser();
-  switch (language) {
-    case 'javascript':
-    case 'typescript':
-      parser.setLanguage(JavaScript);
-      break;
-    case 'python':
-      parser.setLanguage(Python);
-      break;
-    default:
-      return chunkDocument(code, config);
-  }
-
-  const tree = parser.parse(code);
-  const chunks: string[] = [];
-  const metadatas: ChunkMetadata[] = [];
-
-  // Walk top-level nodes
-  const root = tree.rootNode;
-  for (const child of root.children) {
-    if (!isDeclarationNode(child)) continue;
-    const text = code.slice(child.startIndex, child.endIndex);
-    if (text.length <= config.maxChunkSize) {
-      chunks.push(text);
-      metadatas.push({ startIndex: child.startIndex, endIndex: child.endIndex, length: text.length });
-    } else {
-      // Sub-split within this declaration at statement level
-      const subResult = subChunk(text, config);
-      chunks.push(...subResult.chunks);
-      metadatas.push(...subResult.metadatas);
-    }
-  }
-
-  return { chunks, metadatas, totalOriginalLength: code.length, chunkCount: chunks.length };
-}
-```
-
-### 3. Supported Node Types (Phase 1)
-
-| 語言 | P0 節點 |
-|------|---------|
-| JS/TS | `function_declaration`, `arrow_function`, `class_declaration`, `method_definition`, `export_statement`, `interface_declaration`, `type_alias_declaration`, `lexical_declaration` |
-| Python | `function_definition`, `class_definition`, `decorated_definition` |
-| Go | `function_declaration`, `method_declaration` (P2) |
-| Rust | `function_item`, `impl_item` (P2) |
-
-### 4. Config Extension
-
-```typescript
-interface ChunkerConfig {
-  // ... 現有五個欄位 ...
-  astAwareCodeSplit?: boolean;  // NEW: default true
-}
-```
-
-### 5. Dependency Changes
-
-```json
-{
-  "dependencies": {
-    "tree-sitter": "^0.21.1",
-    "tree-sitter-javascript": "^0.21.0",
-    "tree-sitter-python": "^0.21.0"
-  }
-}
-```
-
----
-
-## Files to Change
-
-| 檔案 | 變更 |
-|------|------|
-| `src/chunker.ts` | + `detectCodeLanguage()`, + `astChunk()`, + `subChunk()`, 修改 `smartChunk()` 路由, + `astAwareCodeSplit` config |
-| `src/chunker.test.ts` | **全新建立**（從破壞案例反轉）|
-| `package.json` | + tree-sitter, tree-sitter-javascript, tree-sitter-python |
-
----
-
-## Tests (New File)
-
-```typescript
-describe('AST-aware code chunking', () => {
-  it('should keep { and } balanced in every chunk', () => {
-    const code = `async function handleUserLogin(userId: string) {
-    const user = await this.userRepository.findById(userId);
-    if (!user) { return { success: false }; }
-    const session = await this.createSession(user);
-    return { success: true, session };
-}
-async function verifyPassword(input: string): Promise<boolean> {
-    return bcrypt.compare(input, this.hash);
-}`;
-    const result = smartChunk(code, 'jina-embeddings-v5');
-    for (const chunk of result.chunks) {
-      const opens = (chunk.match(/{/g) || []).length;
-      const closes = (chunk.match(/}/g) || []).length;
-      expect(opens).toBe(closes);
-    }
-  });
-
-  it('should not split function mid-body', () => {
-    const result = smartChunk(code, 'jina-embeddings-v5');
-    const hasMiddleOfFunction = result.chunks.some(c =>
-      c.startsWith('}') || c.endsWith('{')
-    );
-    expect(hasMiddleOfFunction).toBe(false);
-  });
-
-  it('should keep complete function as one chunk', () => {
-    const result = smartChunk(code, 'jina-embeddings-v5');
-    const verifyFn = result.chunks.find(c => c.includes('verifyPassword'));
-    expect(verifyFn).toBeDefined();
-    expect(verifyFn).toContain('bcrypt.compare');
-    expect(verifyFn).not.toContain('handleUserLogin');
-  });
-});
-```
-
----
-
-## Phase Plan
-
-```
-Phase 1（P0 — MVP）：
-  ├─ detectCodeLanguage()（JS/TS/Python）
-  ├─ astChunk() — JS/TS only
-  ├─ astChunk() — Python
-  ├─ Unit tests（破壞案例 → 通過案例）
-  └─ Config: astAwareCodeSplit default = true
-
-Phase 2（P1）：
-  ├─ Sub-split within oversized declarations（statement level）
-  ├─ Go、Rust support
-  └─ Benchmark: 向量品質 vs. character split
-
-Phase 3（P2）：
-  └─ Embedding quality evaluation（問答對比）
-```
-
----
-
-## Q&A
-
-| Q | A |
-|---|---|
-| tree-sitter 值得嗎？ | **值得**。~1MB runtime，sub-ms parse，能處理巢狀結構/decorator/subclass，比 regex 精準一個數量級。 |
-| 預設開？ | **預設開**。破壞案例太明確，等使用者手動開等於功能永遠不被用。`astAwareCodeSplit: false` 保留給需要復現舊行為的測試。 |
-| 非主流語言？ | **Phase 1 fallback**。現有 sentence-ending split 對自然語言有效；非主流語言佔比低，Phase 1 fallback 合理。 |
-
----
-
-## Reference
-
-- Issue: https://github.com/CortexReach/memory-lancedb-pro/issues/692
-- Reference impl: `zilliztech/claude-context` ast-splitter.ts
-- Existing chunker: `src/chunker.ts` (284 lines)
diff --git a/scripts/ci-test-manifest.mjs b/scripts/ci-test-manifest.mjs
index bdb31ce1..44d48747 100644
--- a/scripts/ci-test-manifest.mjs
+++ b/scripts/ci-test-manifest.mjs
@@ -48,6 +48,8 @@ export const CI_TEST_MANIFEST = [
   { group: "core-regression", runner: "node", file: "test/temporal-awareness.test.mjs", args: ["--test"] },
   // Issue #598 regression tests
   { group: "core-regression", runner: "node", file: "test/store-serialization.test.mjs" },
+  // Issue #692: AST-based semantic chunking
+  { group: "core-regression", runner: "node", file: "test/ast-code-chunking.test.mjs" },
   { group: "core-regression", runner: "node", file: "test/access-tracker-retry.test.mjs" },
   { group: "core-regression", runner: "node", file: "test/embedder-cache.test.mjs" },
   // Issue #629 batch embedding fix
diff --git a/scripts/verify-ci-test-manifest.mjs b/scripts/verify-ci-test-manifest.mjs
index 8577fa3f..689a8ef0 100644
--- a/scripts/verify-ci-test-manifest.mjs
+++ b/scripts/verify-ci-test-manifest.mjs
@@ -47,24 +47,18 @@ const EXPECTED_BASELINE = [
   { group: "core-regression", runner: "node", file: "test/preference-slots.test.mjs", args: ["--test"] },
   { group: "core-regression", runner: "node", file: "test/is-latest-auto-supersede.test.mjs" },
   { group: "core-regression", runner: "node", file: "test/temporal-awareness.test.mjs", args: ["--test"] },
-  // Issue #598 regression tests
   { group: "core-regression", runner: "node", file: "test/store-serialization.test.mjs" },
+  { group: "core-regression", runner: "node", file: "test/ast-code-chunking.test.mjs" },
   { group: "core-regression", runner: "node", file: "test/access-tracker-retry.test.mjs" },
   { group: "core-regression", runner: "node", file: "test/embedder-cache.test.mjs" },
-  // Issue #629 batch embedding fix
   { group: "llm-clients-and-auth", runner: "node", file: "test/embedder-ollama-batch-routing.test.mjs" },
-  // Issue #665 bulkStore tests
   { group: "storage-and-schema", runner: "node", file: "test/bulk-store.test.mjs", args: ["--test"] },
   { group: "storage-and-schema", runner: "node", file: "test/bulk-store-edge-cases.test.mjs", args: ["--test"] },
   { group: "storage-and-schema", runner: "node", file: "test/smart-extractor-bulk-store.test.mjs", args: ["--test"] },
   { group: "storage-and-schema", runner: "node", file: "test/smart-extractor-bulk-store-edge-cases.test.mjs", args: ["--test"] },
-  // Issue #680 regression tests
   { group: "core-regression", runner: "node", file: "test/memory-reflection-issue680-tdd.test.mjs", args: ["--test"] },
-  // Issue #606 SDK migration Bug 2 regression tests
   { group: "core-regression", runner: "node", file: "test/issue606_sdk-migration.test.mjs" },
-  // Issue #736 recall governance - isRecallUsed() unit tests
   { group: "core-regression", runner: "node", file: "test/is-recall-used.test.mjs", args: ["--test"] },
-  // Issue #492 agentId validation tests
   { group: "core-regression", runner: "node", file: "test/agentid-validation.test.mjs", args: ["--test"] },
   { group: "core-regression", runner: "node", file: "test/command-reflection-guard.test.mjs", args: ["--test"] },
 ];