Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
131 changes: 118 additions & 13 deletions src/codeweaver/core/discovery.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@
from codeweaver.core import BasedModel, ResolvedProjectPathDep
from codeweaver.core.chunks import CodeChunk
from codeweaver.core.di import INJECTED
from codeweaver.core.language import is_semantic_config_ext
from codeweaver.core.language import SemanticSearchLanguage, is_semantic_config_ext
from codeweaver.core.metadata import ExtCategory
from codeweaver.core.types import MISSING, BlakeHashKey, BlakeKey, Missing
from codeweaver.core.types import MISSING, BlakeHashKey, BlakeKey, FileExt, LiteralStringT, Missing
from codeweaver.core.utils import (
get_blake_hash,
get_git_branch,
Expand All @@ -56,6 +56,101 @@
logger = logging.getLogger(__name__)


def _walk_ast_nodes(node: Any, parts: list[str]) -> None:
"""Walk AST tree and collect semantic tokens, excluding comments.

Traverses the full AST including both named and unnamed nodes to capture
all semantically meaningful content (identifiers, operators, keywords, etc.)
while skipping comment nodes.
"""
kind: str = node.kind()
if "comment" in kind.lower():
return
children = node.children()
if not children:
parts.append(f"{kind}:{node.text()}")
else:
parts.append(kind)
for child in children:
_walk_ast_nodes(child, parts)


def _compute_ast_hash(content: str, language_name: str) -> BlakeHashKey | None:
    """Hash the comment-free AST of ``content`` with blake3.

    The content is parsed with ast-grep and flattened by ``_walk_ast_nodes``
    into a list of tokens (node kinds plus leaf text). The tokens are joined
    with newlines and hashed, so comment-only edits leave the result unchanged.

    Args:
        content: Source text to parse.
        language_name: Language identifier handed to ``SgRoot``.

    Returns:
        The hash of the canonical token string, or ``None`` when the walk
        yields no tokens, when any ``Exception`` is raised (including a failed
        ``ast_grep_py`` import), or when the parser panics.
    """
    try:
        # Imported inside the try so a missing ast_grep_py degrades to None.
        from ast_grep_py import SgRoot

        tokens: list[str] = []
        _walk_ast_nodes(SgRoot(content, language_name).root(), tokens)
        return get_blake_hash("\n".join(tokens)) if tokens else None
    except Exception:
        logger.debug("AST parsing failed for language %s", language_name, exc_info=True)
        return None
    except BaseException as panic:
        # pyo3_runtime.PanicException (raised by ast-grep's Rust backend)
        # derives from BaseException and cannot be imported directly, so it is
        # recognised by the module name of its type. Anything else that is not
        # an Exception (GeneratorExit, etc.) must propagate untouched.
        if getattr(type(panic), "__module__", None) != "pyo3_runtime":
            raise
        logger.debug("AST parsing panicked for language %s", language_name, exc_info=True)
        return None


def _get_semantic_language(
    file_path: Path, ext_category: ExtCategory | None = None
) -> SemanticSearchLanguage | None:
    """Resolve the SemanticSearchLanguage for a file.

    A language carried by ``ext_category`` is used when it is a
    ``SemanticSearchLanguage``. Otherwise the lookup is done by extension,
    using the path's suffix, or its full name when it has no suffix.

    Returns:
        Whatever ``SemanticSearchLanguage.from_extension`` yields for the
        file, or the language already attached to ``ext_category``.
    """
    if ext_category:
        declared = ext_category.language
        if isinstance(declared, SemanticSearchLanguage):
            return declared
    # Suffix-less files fall back to their whole name as the lookup key.
    ext_text = file_path.suffix or file_path.name
    return SemanticSearchLanguage.from_extension(FileExt(cast(LiteralStringT, ext_text)))


def compute_semantic_file_hash(
    content_bytes: bytes,
    file_path: Path,
    *,
    ext_category: ExtCategory | None = None,
) -> BlakeHashKey:
    """Hash a file semantically when possible, otherwise by raw bytes.

    When ``_get_semantic_language`` resolves a language for the file, the bytes
    are decoded as UTF-8 (undecodable bytes are replaced) and hashed through
    ``_compute_ast_hash``, which ignores comments. If no language is resolved,
    if AST hashing yields nothing, or if it raises, the blake3 hash of the raw
    ``content_bytes`` is returned instead.

    Args:
        content_bytes: Raw file contents.
        file_path: Path used to determine the file's language.
        ext_category: Optional pre-resolved category passed through to the
            language lookup.

    Returns:
        The AST-based hash when one is produced, else the raw content hash.
    """
    language = _get_semantic_language(file_path, ext_category)
    if not language:
        return get_blake_hash(content_bytes)

    try:
        source_text = content_bytes.decode("utf-8", errors="replace")
        semantic_hash = _compute_ast_hash(source_text, language.variable)
    except Exception:
        logger.debug(
            "AST hashing failed for %s, falling back to content hash",
            file_path,
            exc_info=True,
        )
        semantic_hash = None

    # A missing semantic hash falls through to the raw content hash.
    return semantic_hash or get_blake_hash(content_bytes)


def _get_git_branch(path: Path) -> str | None:
"""Get the git branch for the given path, if available."""
try:
Expand Down Expand Up @@ -84,7 +179,7 @@ class DiscoveredFile(BasedModel):
_file_hash: Annotated[
BlakeHashKey | None,
Field(
description="blake3 hash of the file contents. File hashes are from non-normalized content, so two files with different line endings, white spaces, unicode characters, etc. will have different hashes."
description="blake3 hash of the file. For files with a supported AST language, the hash is computed from the canonical AST representation, ignoring comments, whitespace, and formatting. For other files, the hash is computed from the raw content bytes."
),
] = None
_git_branch: Annotated[
Expand Down Expand Up @@ -125,14 +220,17 @@ def __init__(
"""Initialize DiscoveredFile with optional file_hash and git_branch."""
object.__setattr__(self, "path", path)
object.__setattr__(self, "project_path", project_path)
if ext_category:
object.__setattr__(self, "ext_category", ext_category)
else:
object.__setattr__(self, "ext_category", ExtCategory.from_file(path))
resolved_ext = ext_category or ExtCategory.from_file(path)
object.__setattr__(self, "ext_category", resolved_ext)
if file_hash:
object.__setattr__(self, "_file_hash", file_hash)
elif path.is_file():
object.__setattr__(self, "_file_hash", get_blake_hash(path.read_bytes()))
content_bytes = path.read_bytes()
object.__setattr__(
self,
"_file_hash",
compute_semantic_file_hash(content_bytes, path, ext_category=resolved_ext),
)
else:
object.__setattr__(self, "_file_hash", None)
if git_branch and git_branch is not MISSING:
Expand Down Expand Up @@ -179,7 +277,10 @@ def from_path(
"""Create a DiscoveredFile from a file path."""
branch = get_git_branch(path if path.is_dir() else path.parent) or "main"
if ext_category := ExtCategory.from_file(path):
new_hash = get_blake_hash(path.read_bytes())
content_bytes = path.read_bytes()
new_hash = compute_semantic_file_hash(
content_bytes, path, ext_category=ext_category
)
if file_hash and new_hash != file_hash:
logger.warning(
"Provided file_hash does not match computed hash for %s. Using computed hash.",
Expand Down Expand Up @@ -240,11 +341,15 @@ def size(self) -> NonNegativeInt:
@computed_field
@property
def file_hash(self) -> BlakeHashKey:
"""Return the blake3 hash of the file contents, if available."""
"""Return the blake3 hash of the file, using AST-based hashing when supported."""
if self._file_hash is not None:
return self._file_hash
if self.path.exists() and self.path.is_file():
content_hash = get_blake_hash(self.path.read_bytes())
abs_path = self.absolute_path
if abs_path.exists() and abs_path.is_file():
content_bytes = abs_path.read_bytes()
content_hash = compute_semantic_file_hash(
content_bytes, abs_path, ext_category=self.ext_category
)
Comment on lines +344 to +352
Copy link

Copilot AI Mar 19, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

DiscoveredFile.file_hash reads via self.path.exists()/read_bytes(), but path is often stored relative to the project root (see set_relative_path usage in from_path and absolute_path). If the current working directory isn’t the project root, this branch can incorrectly treat an existing file as missing and fall back to get_blake_hash(b""). Prefer using self.absolute_path for the existence/read checks here (and when passing the path into compute_semantic_file_hash) to ensure consistent behavior regardless of CWD.

Copilot uses AI. Check for mistakes.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@copilot

Please correct the following two issues:

DiscoveredFile.file_hash reads via self.path.exists()/read_bytes(), but path is often stored relative to the project root (see set_relative_path usage in from_path and absolute_path). If the current working directory isn’t the project root, this branch can incorrectly treat an existing file as missing and fall back to get_blake_hash(b""). Prefer using self.absolute_path for the existence/read checks here (and when passing the path into compute_semantic_file_hash) to ensure consistent behavior regardless of CWD.

Let's also catch a narrower error than BaseException, like the pyO3 error specifically. We need to be careful because when Codeweaver moves to a monorepo there will be situations where pyO3 isn't a transitive dependency.

Offhand, I know watchfiles and rignore use it, but those are engine dependencies. Some providers use it too (DuckDuckGo via primp, for example). We may need to make the catch, or the exception being caught, conditional on whether PyO3 is present in the environment.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed both issues in ed23aa2:

  1. file_hash now uses self.absolute_path for existence checks and reads, matching the pattern already used in size, contents, and raw_contents.

  2. Narrowed exception handling in _compute_ast_hash:

    • except Exception catches all standard Python errors (including ImportError if ast_grep_py is missing)
    • except BaseException only fires for non-Exception subclasses; checks getattr(type(exc), "__module__", None) == "pyo3_runtime" to identify PanicException by module name, re-raising everything else (GeneratorExit, etc.)

    This works in environments without pyo3 since the ImportError from ast_grep_py falls into the first except Exception handler. The pyo3 check only activates when a Rust-side panic actually occurs — and PanicException can't be imported directly (it's a Rust-embedded type not registered in sys.modules), hence the module name check.

with contextlib.suppress(Exception):
object.__setattr__(self, "_file_hash", content_hash)
return content_hash
Expand Down Expand Up @@ -325,4 +430,4 @@ def normalize_content(content: str | bytes | bytearray) -> str:
return sanitize_unicode(content)


__all__ = ("DiscoveredFile",)
__all__ = ("DiscoveredFile", "compute_semantic_file_hash")
3 changes: 2 additions & 1 deletion src/codeweaver/engine/services/indexing_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
PRIMARY_SPARSE_VECTOR_NAME,
ZERO,
)
from codeweaver.core.discovery import compute_semantic_file_hash
from codeweaver.providers import EmbeddingRegistryDep


Expand Down Expand Up @@ -312,7 +313,7 @@ def _hash_batch(paths: list[Path]) -> list[tuple[Path, bytes]]:
continue

seen_files.add(relative_path)
current_hash = get_blake_hash(content_bytes)
current_hash = compute_semantic_file_hash(content_bytes, path)
if not self._file_manifest:
self._file_manifest = self._manifest_manager.create_new()
needs_reindex, _ = self._file_manifest.file_needs_reindexing(
Expand Down
Loading
Loading