From c835fc817bc40e9434e30bfdef43cee17300e7ea Mon Sep 17 00:00:00 2001 From: wei9072 Date: Thu, 7 May 2026 08:30:30 +0000 Subject: [PATCH] =?UTF-8?q?refactor(aegis-core):=20public=5Fsymbols=20/=20?= =?UTF-8?q?imported=5Fsymbols=20=E2=86=92=20Layer=201=20cache?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Continues PR #10's pattern: any per-file fact whose extraction was duplicated across consumers belongs in `ast::*` with a `OnceCell` cache on `ParsedFile`. Both extractors used to live as private helpers inside `workspace.rs::summarize_file`. They moved to a new `ast::symbols` module: - `extract_public_symbols(parsed) -> HashSet` — top-level function / class / type / trait / variable names this file exposes. Skips function bodies (nested local helpers are not the file's public API). Rust requires `pub` modifier. - `extract_imported_symbols(parsed) -> HashSet` — names pulled in via `from X import Y` (Python) or `import { a, b } from 'x'` (TS / JS). - `walk_export()` — preserved verbatim for `export default` / `export { a, b as c }` shapes. - `is_public_name()` / `is_likely_public()` — moved alongside. `ParsedFile` grows two more lazy caches: - `public_symbols() -> &HashSet` - `imported_symbols() -> &HashSet` `workspace.rs::summarize_file` now reads both caches instead of walking the tree itself. Net: -158 / +257 (the +257 includes 100 lines of moved code + 50 lines of new tests inside ast::symbols + 21 lines of new ParsedFile API). 3 new unit tests in `ast::symbols`: Python public-fn / Python private-skip, Rust pub-only filter, Python `from … import …` including `aliased_import`. `workspace.rs` unchanged behaviour-wise; same 11 existing workspace tests still pass. Total: 164 → 167 tests pass. Architectural rationale: this is the third per-file fact that moved into Layer 1 (after `Import` in PR #10 and Layer 1 was already provider for `ParsedFile`). The pattern is now stable enough that any future "per-file derived fact" (call_sites, function definitions, etc.) just adds another `OnceCell` field + `extract_X` function. Co-Authored-By: Claude Opus 4.7 --- crates/aegis-core/src/ast/mod.rs | 2 + crates/aegis-core/src/ast/parsed_file.rs | 21 +++ crates/aegis-core/src/ast/symbols.rs | 228 +++++++++++++++++++++++ crates/aegis-core/src/workspace.rs | 164 +--------------- 4 files changed, 257 insertions(+), 158 deletions(-) create mode 100644 crates/aegis-core/src/ast/symbols.rs diff --git a/crates/aegis-core/src/ast/mod.rs b/crates/aegis-core/src/ast/mod.rs index 929f150..21eb029 100644 --- a/crates/aegis-core/src/ast/mod.rs +++ b/crates/aegis-core/src/ast/mod.rs @@ -3,8 +3,10 @@ pub mod imports; pub mod languages; pub mod parsed_file; pub mod registry; +pub mod symbols; pub use adapter::{default_max_chain_depth, LanguageAdapter}; pub use imports::{extract_imports, Import}; pub use parsed_file::{parse, ParsedFile}; pub use registry::LanguageRegistry; +pub use symbols::{extract_imported_symbols, extract_public_symbols}; diff --git a/crates/aegis-core/src/ast/parsed_file.rs b/crates/aegis-core/src/ast/parsed_file.rs index e57b000..7728e44 100644 --- a/crates/aegis-core/src/ast/parsed_file.rs +++ b/crates/aegis-core/src/ast/parsed_file.rs @@ -12,10 +12,12 @@ //! not a hard short-circuit baked into the parse layer. use std::cell::OnceCell; +use std::collections::HashSet; use crate::ast::adapter::LanguageAdapter; use crate::ast::imports::{extract_imports, Import}; use crate::ast::registry::LanguageRegistry; +use crate::ast::symbols::{extract_imported_symbols, extract_public_symbols}; /// Output of a successful parse — tree + source + the adapter that /// produced it. Cheap to pass by reference; nothing here is cloned. @@ -30,6 +32,8 @@ pub struct ParsedFile<'src> { source: &'src str, language_name: &'static str, imports_cache: OnceCell>, + public_symbols_cache: OnceCell>, + imported_symbols_cache: OnceCell>, } impl<'src> ParsedFile<'src> { @@ -87,6 +91,21 @@ impl<'src> ParsedFile<'src> { self.imports_cache.get_or_init(|| extract_imports(self)) } + /// Top-level public symbols this file exposes (functions, classes, + /// types, traits, exports). Lazily extracted; the same `HashSet` + /// is returned on subsequent calls. + pub fn public_symbols(&self) -> &HashSet { + self.public_symbols_cache + .get_or_init(|| extract_public_symbols(self)) + } + + /// Names this file pulls in via its own imports (Python + /// `from X import Y`, TS `import { a, b } from 'x'`). Lazy. + pub fn imported_symbols(&self) -> &HashSet { + self.imported_symbols_cache + .get_or_init(|| extract_imported_symbols(self)) + } + /// Best-effort receiver-to-import lookup: given a call receiver /// like `rand` or `myrand`, return the matching `Import` if one /// of the file's imports plausibly produced that name. @@ -140,6 +159,8 @@ pub fn parse<'src>(path: &str, source: &'src str) -> Option> { source, language_name: adapter.name(), imports_cache: OnceCell::new(), + public_symbols_cache: OnceCell::new(), + imported_symbols_cache: OnceCell::new(), }) } diff --git a/crates/aegis-core/src/ast/symbols.rs b/crates/aegis-core/src/ast/symbols.rs new file mode 100644 index 0000000..74f9649 --- /dev/null +++ b/crates/aegis-core/src/ast/symbols.rs @@ -0,0 +1,228 @@ +//! Per-file symbol extraction — Layer 1 fact derivation. +//! +//! Mirrors the architecture of `ast::imports`: pulls the +//! cross-file public-symbol / imported-symbol extraction logic out +//! of `workspace.rs::summarize_file` into Layer 1 so any consumer +//! (security checks, future signal rules, the existing R2 +//! workspace finding) can read the same canonical list without +//! re-walking the tree. +//! +//! Two collections per file: +//! +//! - **`public_symbols`** — top-level function / class / type / +//! trait / variable names this file exposes to its callers. +//! Powers the `public_symbol_removed` workspace finding. +//! - **`imported_symbols`** — names this file pulls in from its +//! own imports (Python `from X import Y, Z`, TS +//! `import { a, b } from 'x'`). Used by R2 to know what +//! downstream files depend on. +//! +//! Caller is responsible for caching; the supported entry point is +//! `ParsedFile::public_symbols()` / `ParsedFile::imported_symbols()`. + +use std::collections::HashSet; + +use crate::ast::parsed_file::ParsedFile; + +pub fn extract_public_symbols(parsed: &ParsedFile<'_>) -> HashSet { + let mut out = HashSet::new(); + walk_public(parsed.root_node(), parsed.source_bytes(), &mut out); + out +} + +pub fn extract_imported_symbols(parsed: &ParsedFile<'_>) -> HashSet { + let mut out = HashSet::new(); + walk_imported(parsed.root_node(), parsed.source_bytes(), &mut out); + out +} + +fn walk_public(node: tree_sitter::Node<'_>, src: &[u8], out: &mut HashSet) { + let kind = node.kind(); + let is_decl = matches!( + kind, + "function_definition" | "function_declaration" | "function_item" + | "class_definition" | "class_declaration" + | "method_definition" | "method_declaration" + | "interface_declaration" | "enum_declaration" + | "struct_item" | "trait_item" | "type_alias" + | "lexical_declaration" | "variable_declaration" + | "export_statement" + ); + if is_decl { + if let Some(name_node) = node.child_by_field_name("name") { + if let Ok(name) = name_node.utf8_text(src) { + if is_public_name(name) && is_likely_public(node, src) { + out.insert(name.to_string()); + } + } + } else if kind == "export_statement" { + walk_export(node, src, out); + } + } + // Recurse into children, but skip function bodies — nested + // local helpers are not part of the file's public API. + let mut cursor = node.walk(); + for child in node.children(&mut cursor) { + if matches!( + kind, + "function_definition" | "function_item" | "method_definition" + ) && matches!( + child.kind(), + "block" | "function_body" | "compound_statement" + ) { + continue; + } + walk_public(child, src, out); + } +} + +fn is_public_name(name: &str) -> bool { + // Python convention: _-prefixed = private. + !name.starts_with('_') +} + +fn is_likely_public(node: tree_sitter::Node<'_>, src: &[u8]) -> bool { + // Rust: must have `pub` modifier. + if node.kind() == "function_item" + || node.kind() == "struct_item" + || node.kind() == "trait_item" + { + let mut cursor = node.walk(); + for child in node.children(&mut cursor) { + if child.kind() == "visibility_modifier" { + if let Ok(text) = child.utf8_text(src) { + if text.starts_with("pub") { + return true; + } + } + } + } + return false; + } + true +} + +fn walk_export(node: tree_sitter::Node<'_>, src: &[u8], out: &mut HashSet) { + // Mark `export default` so callers can detect its loss. + // The synthetic name "default" is what TS module consumers + // import via `import Foo from './x'` — the local name on the + // right is the consumer's choice, but the export slot's + // identity is "default". + let raw = node.utf8_text(src).unwrap_or(""); + if raw.trim_start().starts_with("export default") { + out.insert("default".to_string()); + } + let mut cursor = node.walk(); + for child in node.children(&mut cursor) { + if let Some(name_node) = child.child_by_field_name("name") { + if let Ok(name) = name_node.utf8_text(src) { + if is_public_name(name) { + out.insert(name.to_string()); + } + } + } + // For `export { a, b as c } from 'x'` — collect identifiers + // inside export_clause / export_specifier nodes. + if matches!(child.kind(), "export_specifier") { + if let Some(name_node) = child.child_by_field_name("name") { + if let Ok(name) = name_node.utf8_text(src) { + out.insert(name.to_string()); + } + } else if let Some(first) = child.named_child(0) { + if let Ok(name) = first.utf8_text(src) { + out.insert(name.to_string()); + } + } + } + walk_export(child, src, out); + } +} + +fn walk_imported(node: tree_sitter::Node<'_>, src: &[u8], out: &mut HashSet) { + let kind = node.kind(); + // Python: from X import Y, Z [as W] + if kind == "import_from_statement" { + let mut cursor = node.walk(); + for child in node.children(&mut cursor) { + match child.kind() { + "dotted_name" => { + // Skip the module_name field — only collect the + // imported names that follow. + if node.child_by_field_name("module_name") == Some(child) { + continue; + } + if let Ok(text) = child.utf8_text(src) { + out.insert(text.to_string()); + } + } + "aliased_import" => { + if let Some(name_node) = child.named_child(0) { + if let Ok(text) = name_node.utf8_text(src) { + out.insert(text.to_string()); + } + } + } + _ => {} + } + } + } + // TS/JS: import { x, y as z } from 'X' + if kind == "import_specifier" { + if let Some(name) = node.child_by_field_name("name") { + if let Ok(text) = name.utf8_text(src) { + out.insert(text.to_string()); + } + } else if let Some(first) = node.named_child(0) { + if let Ok(text) = first.utf8_text(src) { + out.insert(text.to_string()); + } + } + } + let mut cursor = node.walk(); + for child in node.children(&mut cursor) { + walk_imported(child, src, out); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::ast::parsed_file::parse; + + #[test] + fn python_public_symbols() { + let pf = parse( + "lib.py", + "def public_fn():\n pass\n\nclass Public:\n pass\n\ndef _private():\n pass\n", + ).unwrap(); + let syms = extract_public_symbols(&pf); + assert!(syms.contains("public_fn")); + assert!(syms.contains("Public")); + assert!(!syms.contains("_private")); + } + + #[test] + fn rust_pub_only() { + let pf = parse( + "lib.rs", + "pub fn exposed() {}\nfn hidden() {}\npub struct Pubst;\nstruct Privst;\n", + ).unwrap(); + let syms = extract_public_symbols(&pf); + assert!(syms.contains("exposed")); + assert!(!syms.contains("hidden")); + assert!(syms.contains("Pubst")); + assert!(!syms.contains("Privst")); + } + + #[test] + fn python_imported_names() { + let pf = parse( + "a.py", + "from collections import OrderedDict, defaultdict\nfrom os.path import join as joinpath\n", + ).unwrap(); + let syms = extract_imported_symbols(&pf); + assert!(syms.contains("OrderedDict")); + assert!(syms.contains("defaultdict")); + assert!(syms.contains("join")); + } +} diff --git a/crates/aegis-core/src/workspace.rs b/crates/aegis-core/src/workspace.rs index 69ca9d2..55dc879 100644 --- a/crates/aegis-core/src/workspace.rs +++ b/crates/aegis-core/src/workspace.rs @@ -357,14 +357,12 @@ pub fn summarize_file(path: &Path, parsed: &ParsedFile<'_>) -> FileSummary { .map(|i| i.module.clone()) .collect(); - // Public symbols: top-level function / class / const declarations. - // Heuristic per language; missing the long tail is fine — Ring R2 - // is a delta check, so consistency matters more than completeness. - let mut public_symbols = HashSet::new(); - extract_public_symbols(root, src, &mut public_symbols); - - let mut imported_symbols = HashSet::new(); - extract_imported_symbols(root, src, &mut imported_symbols); + // Public symbols + imported symbols: read from Layer 1 caches. + // workspace.rs used to walk the tree itself with local copies of + // the extraction logic; that work moved into ast::symbols so it + // runs at most once per file across all consumers. + let public_symbols = parsed.public_symbols().clone(); + let imported_symbols = parsed.imported_symbols().clone(); // S7.7 — populate per-file numeric signals so project z-scores // can be computed without re-parsing each file when stats are @@ -401,156 +399,6 @@ pub fn summarize_file(path: &Path, parsed: &ParsedFile<'_>) -> FileSummary { } } -/// Walk the AST and collect names brought in via `from X import Y` -/// or `import { y, z } from 'X'` patterns. Best-effort. -fn extract_imported_symbols( - node: tree_sitter::Node, - src: &[u8], - out: &mut HashSet, -) { - let kind = node.kind(); - // Python: from X import Y, Z [as W] - if kind == "import_from_statement" { - let mut cursor = node.walk(); - for child in node.children(&mut cursor) { - match child.kind() { - "dotted_name" => { - // Skip the module_name field — only collect the - // imported names that follow. - if node.child_by_field_name("module_name") == Some(child) { - continue; - } - if let Ok(text) = child.utf8_text(src) { - out.insert(text.to_string()); - } - } - "aliased_import" => { - if let Some(name_node) = child.named_child(0) { - if let Ok(text) = name_node.utf8_text(src) { - out.insert(text.to_string()); - } - } - } - _ => {} - } - } - } - // TS/JS: import { x, y as z } from 'X' - if kind == "import_specifier" { - if let Some(name) = node.child_by_field_name("name") { - if let Ok(text) = name.utf8_text(src) { - out.insert(text.to_string()); - } - } else if let Some(first) = node.named_child(0) { - if let Ok(text) = first.utf8_text(src) { - out.insert(text.to_string()); - } - } - } - let mut cursor = node.walk(); - for child in node.children(&mut cursor) { - extract_imported_symbols(child, src, out); - } -} - -fn extract_public_symbols( - node: tree_sitter::Node, - src: &[u8], - out: &mut HashSet, -) { - let kind = node.kind(); - let is_decl = matches!( - kind, - "function_definition" | "function_declaration" | "function_item" - | "class_definition" | "class_declaration" - | "method_definition" | "method_declaration" - | "interface_declaration" | "enum_declaration" - | "struct_item" | "trait_item" | "type_alias" - | "lexical_declaration" | "variable_declaration" - | "export_statement" - ); - if is_decl { - if let Some(name_node) = node.child_by_field_name("name") { - if let Ok(name) = name_node.utf8_text(src) { - if is_public_name(name) && is_likely_public(node, src) { - out.insert(name.to_string()); - } - } - } else if kind == "export_statement" { - walk_export(node, src, out); - } - } - // Recurse into children, but skip function bodies — nested - // local helpers are not part of the file's public API. - let mut cursor = node.walk(); - for child in node.children(&mut cursor) { - if matches!(kind, "function_definition" | "function_item" | "method_definition") - && matches!(child.kind(), "block" | "function_body" | "compound_statement") - { - continue; - } - extract_public_symbols(child, src, out); - } -} - -fn is_public_name(name: &str) -> bool { - // Python convention: _-prefixed = private. - !name.starts_with('_') -} - -fn is_likely_public(node: tree_sitter::Node, src: &[u8]) -> bool { - // Rust: must have `pub` modifier. - if node.kind() == "function_item" || node.kind() == "struct_item" || node.kind() == "trait_item" { - let mut cursor = node.walk(); - for child in node.children(&mut cursor) { - if child.kind() == "visibility_modifier" { - if let Ok(text) = child.utf8_text(src) { - if text.starts_with("pub") { - return true; - } - } - } - } - return false; - } - true -} - -fn walk_export(node: tree_sitter::Node, src: &[u8], out: &mut HashSet) { - // S1.8: explicitly mark `export default` exports so callers can - // detect their loss. The synthetic name "default" is what TS - // module consumers import via `import Foo from './x'` — the - // local name on the right is the consumer's choice, but the - // export slot's identity is "default". - let raw = node.utf8_text(src).unwrap_or(""); - if raw.trim_start().starts_with("export default") { - out.insert("default".to_string()); - } - let mut cursor = node.walk(); - for child in node.children(&mut cursor) { - if let Some(name_node) = child.child_by_field_name("name") { - if let Ok(name) = name_node.utf8_text(src) { - if is_public_name(name) { - out.insert(name.to_string()); - } - } - } - // For `export { a, b as c } from 'x'` — collect identifiers - // inside export_clause / export_specifier nodes. - if matches!(child.kind(), "export_specifier") { - if let Some(name_node) = child.child_by_field_name("name") { - if let Ok(name) = name_node.utf8_text(src) { - out.insert(name.to_string()); - } - } else if let Some(first) = child.named_child(0) { - if let Ok(name) = first.utf8_text(src) { - out.insert(name.to_string()); - } - } - } - walk_export(child, src, out); - } -} /// Lost public symbols: names that were public in `before` and gone /// from `after`. Returns the names so the agent can see *what* it