From 2aa08315103fa1b87b20d4f212ab271caeee670c Mon Sep 17 00:00:00 2001 From: PatrickSys Date: Sat, 28 Feb 2026 16:39:35 +0100 Subject: [PATCH 1/2] feat(refs): tree-sitter identifier-aware symbol references --- src/core/symbol-references.ts | 130 +++++++++++++++++++++++---- src/utils/tree-sitter.ts | 135 ++++++++++++++++++++++++++++ tests/get-symbol-references.test.ts | 94 +++++++++++++++++++ 3 files changed, 341 insertions(+), 18 deletions(-) diff --git a/src/core/symbol-references.ts b/src/core/symbol-references.ts index 73c7a0f..1cc4f70 100644 --- a/src/core/symbol-references.ts +++ b/src/core/symbol-references.ts @@ -3,6 +3,8 @@ import path from 'path'; import { CODEBASE_CONTEXT_DIRNAME, KEYWORD_INDEX_FILENAME } from '../constants/codebase-context.js'; import { IndexCorruptedError } from '../errors/index.js'; import type { UsageLocation } from '../types/index.js'; +import { detectLanguage } from '../utils/language-detection.js'; +import { findIdentifierOccurrences } from '../utils/tree-sitter.js'; interface IndexedChunk { content?: unknown; @@ -59,6 +61,37 @@ function buildPreview(content: string, lineOffset: number): string { return previewLines.join('\n').trim(); } +function buildPreviewFromFileLines(lines: string[], line: number): string { + const start = Math.max(0, line - 2); + const end = Math.min(lines.length, line + 1); + return lines.slice(start, end).join('\n').trim(); +} + +function resolveAbsoluteChunkPath(rootPath: string, chunk: IndexedChunk): string | null { + if (typeof chunk.filePath === 'string' && chunk.filePath.trim()) { + const raw = chunk.filePath.trim(); + if (path.isAbsolute(raw)) { + return raw; + } + return path.resolve(rootPath, raw); + } + + if (typeof chunk.relativePath === 'string' && chunk.relativePath.trim()) { + return path.resolve(rootPath, chunk.relativePath.trim()); + } + + return null; +} + +async function fileExists(targetPath: string): Promise<boolean> { + try { + const stat = await fs.stat(targetPath); + return stat.isFile(); + }
catch { + return false; + } +} + export async function findSymbolReferences( rootPath: string, symbol: string, @@ -110,34 +143,95 @@ export async function findSymbolReferences( let usageCount = 0; const escapedSymbol = escapeRegex(normalizedSymbol); + const prefilter = new RegExp(`\\b${escapedSymbol}\\b`); const matcher = new RegExp(`\\b${escapedSymbol}\\b`, 'g'); + // Prefilter candidate files from the keyword index. We do not trust chunk contents for + // exact reference counting when Tree-sitter is available; chunks only guide which files to scan. + const chunksByFile = new Map< + string, + { relPath: string; absPath: string | null; chunks: IndexedChunk[] } + >(); + for (const chunkRaw of chunks) { const chunk = chunkRaw as IndexedChunk; - if (typeof chunk.content !== 'string') { - continue; + if (typeof chunk.content !== 'string') continue; + if (!prefilter.test(chunk.content)) continue; + + const relPath = getUsageFile(rootPath, chunk); + const absPath = resolveAbsoluteChunkPath(rootPath, chunk); + + const entry = chunksByFile.get(relPath); + if (entry) { + entry.chunks.push(chunk); + // Prefer a real absolute path when available + if (!entry.absPath && absPath) { + entry.absPath = absPath; + } + } else { + chunksByFile.set(relPath, { relPath, absPath, chunks: [chunk] }); + } + } + + for (const entry of chunksByFile.values()) { + const relPath = entry.relPath; + const absPath = entry.absPath; + + // Preferred: Tree-sitter identifier walk on the real file content. 
+ if (absPath && (await fileExists(absPath))) { + try { + const raw = await fs.readFile(absPath, 'utf-8'); + const content = raw.replace(/\r\n/g, '\n'); + const language = detectLanguage(absPath); + const occurrences = await findIdentifierOccurrences(content, language, normalizedSymbol); + + if (occurrences) { + usageCount += occurrences.length; + + if (usages.length < normalizedLimit && occurrences.length > 0) { + const lines = content.split('\n'); + for (const occ of occurrences) { + if (usages.length >= normalizedLimit) break; + usages.push({ + file: relPath, + line: occ.line, + preview: buildPreviewFromFileLines(lines, occ.line) + }); + } + } + + continue; + } + } catch { + // Fall through to chunk-regex fallback (missing grammar, parse failure, etc.) + } } - const chunkContent = chunk.content; - const startLine = typeof chunk.startLine === 'number' ? chunk.startLine : 1; - matcher.lastIndex = 0; + // Fallback: regex scan inside the matched chunks (legacy behavior). + for (const chunk of entry.chunks) { + if (typeof chunk.content !== 'string') continue; - let match: RegExpExecArray | null; - while ((match = matcher.exec(chunkContent)) !== null) { - usageCount += 1; + const chunkContent = chunk.content; + const startLine = typeof chunk.startLine === 'number' ? 
chunk.startLine : 1; + matcher.lastIndex = 0; - if (usages.length >= normalizedLimit) { - continue; - } + let match: RegExpExecArray | null; + while ((match = matcher.exec(chunkContent)) !== null) { + usageCount += 1; + + if (usages.length >= normalizedLimit) { + continue; + } - const prefix = chunkContent.slice(0, match.index); - const lineOffset = prefix.split('\n').length - 1; + const prefix = chunkContent.slice(0, match.index); + const lineOffset = prefix.split('\n').length - 1; - usages.push({ - file: getUsageFile(rootPath, chunk), - line: startLine + lineOffset, - preview: buildPreview(chunkContent, lineOffset) - }); + usages.push({ + file: relPath, + line: startLine + lineOffset, + preview: buildPreview(chunkContent, lineOffset) + }); + } } } diff --git a/src/utils/tree-sitter.ts b/src/utils/tree-sitter.ts index d84d611..f24d69c 100644 --- a/src/utils/tree-sitter.ts +++ b/src/utils/tree-sitter.ts @@ -381,3 +381,138 @@ export async function extractTreeSitterSymbols( return null; } } + +export interface IdentifierOccurrence { + line: number; + startIndex: number; + endIndex: number; + nodeType: string; +} + +const IDENTIFIER_NODE_TYPES = [ + 'identifier', + 'type_identifier', + 'property_identifier', + 'field_identifier', + 'shorthand_property_identifier_pattern', + 'shorthand_property_identifier', + 'jsx_identifier', + 'scoped_identifier' +] as const; + +const NON_CODE_ANCESTOR_TYPE_FRAGMENTS = [ + 'comment', + 'string', + 'template_string', + 'regex', + 'jsx_text' +] as const; + +function isInsideNonCodeContext(node: Node): boolean { + let cursor: Node | null = node; + let depth = 0; + while (cursor && depth < 40) { + const cursorType = cursor.type; + for (const fragment of NON_CODE_ANCESTOR_TYPE_FRAGMENTS) { + if (cursorType.includes(fragment)) { + return true; + } + } + cursor = cursor.parent; + depth += 1; + } + return false; +} + +/** + * Find identifier occurrences of `symbol` in `content` using Tree-sitter. 
+ * Returns null when Tree-sitter isn't available/supported, so callers can fall back safely. + */ +export async function findIdentifierOccurrences( + content: string, + language: string, + symbol: string +): Promise<IdentifierOccurrence[] | null> { + const normalizedSymbol = symbol.trim(); + if (!normalizedSymbol) { + return []; + } + + if (!supportsTreeSitter(language) || !content.trim()) { + return null; + } + + if (Buffer.byteLength(content, 'utf8') > MAX_TREE_SITTER_PARSE_BYTES) { + return null; + } + + try { + const parser = await getParserForLanguage(language); + setParseTimeout(parser); + + let tree: ReturnType<typeof parser.parse>; + try { + tree = parser.parse(content); + } catch (error) { + evictParser(language, parser); + throw error; + } + + if (!tree) { + evictParser(language, parser); + return null; + } + + try { + const hasErrorValue = tree.rootNode.hasError as unknown; + const rootHasError = + typeof hasErrorValue === 'function' + ? Boolean((hasErrorValue as () => unknown)()) + : Boolean(hasErrorValue); + + if (rootHasError) { + return null; + } + + const nodes = tree.rootNode.descendantsOfType([...IDENTIFIER_NODE_TYPES]); + const occurrences: IdentifierOccurrence[] = []; + const seen = new Set<string>(); + + for (const node of nodes) { + if (!node || !node.isNamed) continue; + if (node.text !== normalizedSymbol) continue; + if (isInsideNonCodeContext(node)) continue; + + const occ: IdentifierOccurrence = { + line: node.startPosition.row + 1, + startIndex: node.startIndex, + endIndex: node.endIndex, + nodeType: node.type + }; + const key = `${occ.line}:${occ.startIndex}:${occ.endIndex}:${occ.nodeType}`; + if (seen.has(key)) continue; + seen.add(key); + occurrences.push(occ); + } + + occurrences.sort((a, b) => { + if (a.line !== b.line) return a.line - b.line; + return a.startIndex - b.startIndex; + }); + + return occurrences; + } finally { + tree.delete(); + } + } catch (error) { + evictParser(language); + + if (isTreeSitterDebugEnabled()) { + console.error( + `[DEBUG] Tree-sitter identifier occurrence scan
failed for '${language}':`, + error instanceof Error ? error.message : String(error) + ); + } + return null; + } +} diff --git a/tests/get-symbol-references.test.ts b/tests/get-symbol-references.test.ts index af33f94..819950e 100644 --- a/tests/get-symbol-references.test.ts +++ b/tests/get-symbol-references.test.ts @@ -136,6 +136,100 @@ describe('get_symbol_references MCP tool', () => { } }); + it('excludes comment and string matches when file exists and Tree-sitter is supported', async () => { + if (!tempRoot) { + throw new Error('tempRoot not initialized'); + } + + const contextDir = path.join(tempRoot, CODEBASE_CONTEXT_DIRNAME); + await fs.rm(contextDir, { recursive: true, force: true }); + await fs.mkdir(contextDir, { recursive: true }); + + const buildId = 'test-build-symbol-refs-treesitter'; + const generatedAt = new Date().toISOString(); + + await fs.mkdir(path.join(contextDir, 'index'), { recursive: true }); + await fs.writeFile( + path.join(contextDir, 'index', 'index-build.json'), + JSON.stringify({ buildId, formatVersion: INDEX_FORMAT_VERSION }), + 'utf-8' + ); + + const srcDir = path.join(tempRoot, 'src'); + await fs.mkdir(srcDir, { recursive: true }); + const filePath = path.join(srcDir, 'example.ts'); + const fileContent = [ + 'const alpha = 1;', + '// alpha only in comment', + 'console.log(alpha);', + 'const s = \"alpha\";' + ].join('\n'); + await fs.writeFile(filePath, fileContent, 'utf-8'); + + const chunks = [ + { + content: fileContent, + startLine: 1, + relativePath: 'src/example.ts', + filePath + } + ]; + + await fs.writeFile( + path.join(contextDir, KEYWORD_INDEX_FILENAME), + JSON.stringify({ header: { buildId, formatVersion: INDEX_FORMAT_VERSION }, chunks }), + 'utf-8' + ); + + await fs.writeFile( + path.join(contextDir, INDEX_META_FILENAME), + JSON.stringify( + { + metaVersion: INDEX_META_VERSION, + formatVersion: INDEX_FORMAT_VERSION, + buildId, + generatedAt, + toolVersion: 'test', + artifacts: { + keywordIndex: { path: 
KEYWORD_INDEX_FILENAME }, + vectorDb: { path: 'index', provider: 'lancedb' } + } + }, + null, + 2 + ), + 'utf-8' + ); + + const { server } = await import('../src/index.js'); + const handler = (server as any)._requestHandlers.get('tools/call'); + + const response = await handler({ + jsonrpc: '2.0', + id: 1, + method: 'tools/call', + params: { + name: 'get_symbol_references', + arguments: { + symbol: 'alpha', + limit: 10 + } + } + }); + + const payload = JSON.parse(response.content[0].text) as unknown as { + status: string; + usageCount: number; + usages: Array<{ file: string; line: number }>; + }; + expect(payload.status).toBe('success'); + expect(payload.usageCount).toBe(2); + expect(payload.usages.length).toBe(2); + expect(payload.usages.every((usage) => usage.file === 'src/example.ts')).toBe(true); + const lines = payload.usages.map((usage) => usage.line).sort((a, b) => a - b); + expect(lines).toEqual([1, 3]); + }); + it('isComplete is true when results are less than limit', async () => { if (!tempRoot) { throw new Error('tempRoot not initialized'); From 1735e3cb51f808c3bd1c9afed4f1139bad851e8f Mon Sep 17 00:00:00 2001 From: PatrickSys Date: Sat, 28 Feb 2026 16:58:03 +0100 Subject: [PATCH 2/2] fix(refs): prevent out-of-root file reads from index --- src/core/symbol-references.ts | 15 +++++++++++--- tests/get-symbol-references.test.ts | 32 +++++++++++++++++++++++++---- 2 files changed, 40 insertions(+), 7 deletions(-) diff --git a/src/core/symbol-references.ts b/src/core/symbol-references.ts index 1cc4f70..f4c1986 100644 --- a/src/core/symbol-references.ts +++ b/src/core/symbol-references.ts @@ -68,16 +68,25 @@ function buildPreviewFromFileLines(lines: string[], line: number): string { } function resolveAbsoluteChunkPath(rootPath: string, chunk: IndexedChunk): string | null { + const resolvedRoot = path.resolve(rootPath); + const isWithinRoot = (candidate: string): boolean => { + const resolvedCandidate = path.resolve(candidate); + const relative = 
path.relative(resolvedRoot, resolvedCandidate); + return Boolean(relative) && !relative.startsWith('..') && !path.isAbsolute(relative); + }; + if (typeof chunk.filePath === 'string' && chunk.filePath.trim()) { const raw = chunk.filePath.trim(); if (path.isAbsolute(raw)) { - return raw; + return isWithinRoot(raw) ? raw : null; } - return path.resolve(rootPath, raw); + const resolved = path.resolve(resolvedRoot, raw); + return isWithinRoot(resolved) ? resolved : null; } if (typeof chunk.relativePath === 'string' && chunk.relativePath.trim()) { - return path.resolve(rootPath, chunk.relativePath.trim()); + const resolved = path.resolve(resolvedRoot, chunk.relativePath.trim()); + return isWithinRoot(resolved) ? resolved : null; } return null; diff --git a/tests/get-symbol-references.test.ts b/tests/get-symbol-references.test.ts index 819950e..6481b46 100644 --- a/tests/get-symbol-references.test.ts +++ b/tests/get-symbol-references.test.ts @@ -10,6 +10,30 @@ import { KEYWORD_INDEX_FILENAME } from '../src/constants/codebase-context.js'; +type ToolCallRequest = { + jsonrpc: '2.0'; + id: number; + method: 'tools/call'; + params: { name: string; arguments: Record<string, unknown> }; +}; + +type ToolCallResponse = { + content: Array<{ type: 'text'; text: string }>; + isError?: boolean; +}; + +function getToolCallHandler(server: unknown): (request: ToolCallRequest) => Promise<ToolCallResponse> { + const handlers = (server as { _requestHandlers?: unknown })._requestHandlers; + if (!(handlers instanceof Map)) { + throw new Error('Expected server._requestHandlers to be a Map'); + } + const handler = handlers.get('tools/call'); + if (typeof handler !== 'function') { + throw new Error('Expected tools/call handler to be registered'); + } + return handler as (request: ToolCallRequest) => Promise<ToolCallResponse>; +} + describe('get_symbol_references MCP tool', () => { let tempRoot: string | null = null; let originalArgv: string[] | null = null; @@ -106,7 +130,7 @@ describe('get_symbol_references MCP tool', () => { ); const { server }
= await import('../src/index.js'); - const handler = (server as any)._requestHandlers.get('tools/call'); + const handler = getToolCallHandler(server); const response = await handler({ jsonrpc: '2.0', @@ -202,7 +226,7 @@ describe('get_symbol_references MCP tool', () => { ); const { server } = await import('../src/index.js'); - const handler = (server as any)._requestHandlers.get('tools/call'); + const handler = getToolCallHandler(server); const response = await handler({ jsonrpc: '2.0', @@ -283,7 +307,7 @@ describe('get_symbol_references MCP tool', () => { ); const { server } = await import('../src/index.js'); - const handler = (server as any)._requestHandlers.get('tools/call'); + const handler = getToolCallHandler(server); const response = await handler({ jsonrpc: '2.0', @@ -362,7 +386,7 @@ describe('get_symbol_references MCP tool', () => { ); const { server } = await import('../src/index.js'); - const handler = (server as any)._requestHandlers.get('tools/call'); + const handler = getToolCallHandler(server); const response = await handler({ jsonrpc: '2.0',