From c45bdd30b003d95ac903344e6638473cab2b43ad Mon Sep 17 00:00:00 2001 From: Roberto Huertas Date: Sun, 17 May 2026 11:43:17 +0200 Subject: [PATCH 01/12] fix(kernel): emit 1-based UTF-16 code-unit columns from tree-sitter nodes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Introduced `LineColumnIndex` in `crates/common/src/utils/position_utils.rs`, a pre-built per-source index that converts tree-sitter's 0-based UTF-8 byte columns to 1-based UTF-16 code-unit columns, with an ASCII fast path and full unit-test coverage (BMP, supplementary-plane emoji, CJK, combining marks, CRLF, empty lines). - Documented `Position.col` as a 1-based UTF-16 code-unit column in `crates/common/src/model/position.rs` to lock in the contract. - Changed `map_node` and `get_query_nodes` (`tree_sitter.rs`) to accept and use `&LineColumnIndex`; updated `process_tree_sitter_tree_request` (`tree_sitter_tree.rs`) accordingly. - Replaced `TreeSitterNode::from_ts_node` with `from_ts_node_with_index`, threading `LineColumnIndex` through the full ddsa call chain: `JsRuntime::execute_rule_internal` β†’ `QueryMatchBridge::set_data` β†’ `TsNodeBridge::insert`/`insert_capture`/`build_v8_node`. - Updated the two taint-flow region builders (`LocatedNode::new_cst` in `flow/graph.rs`, `position_eq` helper in `flow/java.rs`) and the deno ops (`op_ts_node_named_children`, `op_ts_node_parent`, `op_digraph_adjacency_list_to_dot`) to use the same helper. - Moved the `ctx_bridge` borrow in `op_ts_node_named_children` into the `else` branch where it is actually needed. - Added multibyte regression tests across all layers: `LineColumnIndex` unit matrix, `map_node`/`get_query_nodes` emoji+CJK tests, `TsNodeBridge` bridge test, `LocatedNode` regression test, `stella_compat::getCode` JS execution test, server integration test on base64-encoded multibyte Python, and SARIF/CSV serialization assertions. IDE-6037 --- crates/cli/src/csv.rs | 35 ++++ crates/cli/src/sarif/sarif_utils.rs | 84 +++++++++ crates/common/src/model/position.rs | 12 ++ crates/common/src/utils/position_utils.rs | 176 ++++++++++++++++++ .../analysis/ddsa_lib/bridge/query_match.rs | 23 ++- .../src/analysis/ddsa_lib/bridge/ts_node.rs | 98 +++++++--- .../src/analysis/ddsa_lib/js/flow/graph.rs | 71 ++++++- .../ddsa_lib/js/flow/graph_test_utils.rs | 2 +- .../src/analysis/ddsa_lib/js/flow/java.rs | 29 ++- .../src/analysis/ddsa_lib/js/ts_node.rs | 105 +++++++++-- .../src/analysis/ddsa_lib/ops.rs | 19 +- .../src/analysis/ddsa_lib/runtime.rs | 44 ++++- .../src/analysis/ddsa_lib/test_utils.rs | 5 + .../src/analysis/tree_sitter.rs | 146 +++++++++++++-- .../src/tree_sitter_tree.rs | 51 ++++- 15 files changed, 819 insertions(+), 81 deletions(-) diff --git a/crates/cli/src/csv.rs b/crates/cli/src/csv.rs index b3da79d4f..8b60c5a6a 100644 --- a/crates/cli/src/csv.rs +++ b/crates/cli/src/csv.rs @@ -97,4 +97,39 @@ mod tests { ); assert_eq!(res_with_result, "filename,rule,category,severity,message,start_line,start_col,end_line,end_col\nfilename,myrule,performance,error,message,10,12,12,10\n"); } + + /// The CSV output faithfully serializes whatever col value is stored in the violation. + /// When the kernel produces UTF-16 columns (e.g. col 3 for a node after "πŸš€"), the CSV row + /// contains 3, not the raw byte col (5). + #[test] + fn test_export_csv_multibyte_col() { + let res = generate_csv_results( + &vec![RuleResult { + rule_name: "myrule".to_string(), + filename: "file.py".to_string(), + violations: vec![Violation { + // Simulates a violation whose `col` was computed as UTF-16 (e.g. after πŸš€). + start: common::model::position::Position { line: 1, col: 3 }, + end: common::model::position::Position { line: 1, col: 16 }, + message: "multibyte".to_string(), + severity: RuleSeverity::Error, + category: RuleCategory::Security, + fixes: vec![], + taint_flow: None, + is_suppressed: false, + }], + errors: vec![], + execution_error: None, + output: None, + execution_time_ms: 1, + query_node_time_ms: 0, + parsing_time_ms: 0, + }], + &vec![], + ); + assert!( + res.contains(",1,3,1,16\n"), + "CSV should contain UTF-16 col 3; got: {res}" + ); + } } diff --git a/crates/cli/src/sarif/sarif_utils.rs b/crates/cli/src/sarif/sarif_utils.rs index 22cf71f19..1bc844511 100644 --- a/crates/cli/src/sarif/sarif_utils.rs +++ b/crates/cli/src/sarif/sarif_utils.rs @@ -2231,4 +2231,88 @@ mod tests { let sarif_json = serde_json::to_value(sarif_report).unwrap(); assert!(validate_data(&sarif_json)); } + + /// The SARIF `region.startColumn` / `region.endColumn` fields carry whatever value is in + /// `Position.col`. When the kernel emits UTF-16 columns (as it now does), the SARIF output + /// automatically conforms to the SARIF v2.1 default encoding (UTF-16). + /// + /// This test constructs a violation whose col was computed as UTF-16 (e.g. after an emoji on + /// the same line, col 3) and asserts the SARIF JSON contains that value. + #[test] + fn sarif_region_carries_utf16_col() { + let rule = RuleBuilder::default() + .name("utf16-col-rule".to_string()) + .description_base64(None) + .language(Language::Python) + .checksum("x".to_string()) + .pattern(None) + .tree_sitter_query_base64(None) + .category(RuleCategory::Security) + .code_base64(String::new()) + .short_description_base64(None) + .entity_checked(None) + .rule_type(RuleType::TreeSitterQuery) + .severity(RuleSeverity::Error) + .cwe(None) + .arguments(vec![]) + .tests(vec![]) + .is_testing(false) + .documentation_url(None) + .build() + .unwrap(); + let sarif_rule = SarifRule::StaticAnalysis(Box::new(rule)); + + // UTF-16 col 3 = position of an identifier after πŸš€ on the same line. + let violation = Violation { + start: Position::new(1, 3), + end: Position::new(1, 16), + message: "utf16 col test".to_string(), + severity: RuleSeverity::Error, + category: RuleCategory::Security, + fixes: vec![], + taint_flow: None, + is_suppressed: false, + }; + let rule_result = RuleResultBuilder::default() + .rule_name("utf16-col-rule".to_string()) + .filename("file.py".to_string()) + .violations(vec![violation]) + .errors(vec![]) + .execution_error(None) + .output(None) + .execution_time_ms(0u128) + .parsing_time_ms(0u128) + .query_node_time_ms(0u128) + .build() + .unwrap(); + let sarif_result = SarifRuleResult::try_from(rule_result).unwrap(); + + let directory = ".".to_string(); + let sarif = generate_sarif_report( + &[sarif_rule], + &[sarif_result], + &directory, + SarifReportMetadata { + add_git_info: false, + debug: false, + config_digest: String::new(), + diff_aware_parameters: None, + execution_time_secs: 0, + }, + &Default::default(), + ) + .expect("generate sarif"); + + let sarif_json = serde_json::to_value(&sarif).unwrap(); + let location = + &sarif_json["runs"][0]["results"][0]["locations"][0]["physicalLocation"]["region"]; + assert_eq!( + location["startColumn"], 3, + "SARIF startColumn should be UTF-16 col 3" + ); + assert_eq!( + location["endColumn"], 16, + "SARIF endColumn should be UTF-16 col 16" + ); + } } diff --git a/crates/common/src/model/position.rs b/crates/common/src/model/position.rs index d7aa3d351..0d85c26db 100644 --- a/crates/common/src/model/position.rs +++ b/crates/common/src/model/position.rs @@ -6,6 +6,18 @@ use derive_builder::Builder; use serde::{Deserialize, Serialize}; use std::fmt; +/// A source-code position. +/// +/// * `line` β€” 1-based line number. +/// * `col` β€” 1-based **UTF-16 code-unit** column number for every position produced by the +/// static-analysis kernel (tree-sitter path). On ASCII-only lines, one UTF-16 code unit equals +/// one byte, so the value is identical to what a byte-column would be. For non-ASCII characters +/// (CJK ideographs, emoji surrogate pairs, combining marks, etc.) the value reflects UTF-16 +/// semantics, which matches LSP / VS Code and the SARIF v2.1 default encoding. +/// +/// The `get_position_in_string` helper (used by the secrets scanner) emits grapheme-based +/// columns under a different contract; that path does **not** flow through `Position.col` in the +/// kernel output. #[derive(Deserialize, Debug, Serialize, Clone, Copy, Builder, PartialEq, Eq, Hash)] pub struct Position { pub line: u32, diff --git a/crates/common/src/utils/position_utils.rs b/crates/common/src/utils/position_utils.rs index 4458abfca..6e85dfff0 100644 --- a/crates/common/src/utils/position_utils.rs +++ b/crates/common/src/utils/position_utils.rs @@ -2,6 +2,70 @@ use crate::model::position::Position; use bstr::BStr; use bstr::ByteSlice; +/// Precomputed per-line index for fast repeated UTF-8 byte-column β†’ UTF-16 code-unit column +/// conversion. +/// +/// Build once per source string with [`LineColumnIndex::new`], then call +/// [`byte_col_to_utf16_col`] for every tree-sitter node on that source. +pub struct LineColumnIndex<'a> { + source: &'a str, + /// Byte offset of the start of each line (0-indexed line number β†’ byte offset of first byte + /// on that line). + line_starts: Vec, +} + +impl<'a> LineColumnIndex<'a> { + /// Builds the index by scanning `source` for newline characters. + pub fn new(source: &'a str) -> Self { + let mut line_starts = vec![0usize]; + for (i, b) in source.bytes().enumerate() { + // Intentionally split on `\n` only β€” this mirrors tree-sitter's line model exactly. + // Tree-sitter does not treat bare `\r` (old Mac OS 9) as a line terminator; for + // Windows `\r\n` files the `\r` is part of the column on the same line, matching + // tree-sitter's `Point.column` values. Using a broader splitter (e.g. `str::lines`) + // would diverge from tree-sitter and produce wrong UTF-16 columns. + if b == b'\n' { + line_starts.push(i + 1); + } + } + Self { + source, + line_starts, + } + } + + /// Converts a tree-sitter 0-based `(row, byte_col)` point to a 1-based UTF-16 code-unit + /// column. + /// + /// **ASCII fast path**: when the line prefix up to `byte_col` is pure ASCII (all bytes + /// < 128), each byte is one UTF-16 code unit, so the result is simply `byte_col + 1`. + /// This path costs one slice-level `is_ascii()` check and is taken for the vast majority of + /// real-world source lines. + pub fn byte_col_to_utf16_col(&self, row: usize, byte_col: usize) -> u32 { + let line_start = self + .line_starts + .get(row) + .copied() + .unwrap_or(self.source.len()); + // Slice from the line start to end-of-source (unbounded); the subsequent `prefix` slice + // clamps this to exactly `byte_col` bytes, so the trailing lines are never examined. + let source_from_line_start = &self.source.as_bytes()[line_start..]; + // `byte_col` is the number of bytes from the line start to the position. + let prefix_len = byte_col.min(source_from_line_start.len()); + let prefix = &source_from_line_start[..prefix_len]; + + // ASCII fast path: one byte == one UTF-16 code unit. + if prefix.is_ascii() { + return (byte_col + 1) as u32; + } + + // Slow path: count UTF-16 code units emitted by each Unicode scalar. + let prefix_str = std::str::from_utf8(prefix).unwrap_or(""); + let utf16_col: usize = prefix_str.chars().map(|c| c.len_utf16()).sum(); + (utf16_col + 1) as u32 + } +} + /// Get position of an offset in a code and return a [Position]. pub fn get_position_in_string(content: &str, offset: usize) -> anyhow::Result { if offset >= content.len() { @@ -137,4 +201,116 @@ mod tests { Position::new(3, 13) ); } + + // ── LineColumnIndex tests ───────────────────────────────────────────────── + + #[test] + fn lci_ascii_fast_path() { + // Pure ASCII: UTF-16 col == byte col + 1 for every position. + let idx = LineColumnIndex::new("hello world\nfoo bar"); + assert_eq!(idx.byte_col_to_utf16_col(0, 0), 1); // 'h' β†’ col 1 + assert_eq!(idx.byte_col_to_utf16_col(0, 5), 6); // ' ' β†’ col 6 + assert_eq!(idx.byte_col_to_utf16_col(1, 3), 4); // ' ' on line 2 β†’ col 4 + } + + #[test] + fn lci_bmp_non_ascii() { + // 'Γ©' is U+00E9: 2 UTF-8 bytes, 1 UTF-16 code unit. + // Source: "cafΓ©\nend" β€” bytes: c(0) a(1) f(2) Γ©(3,4) \n(5) + let src = "caf\u{00E9}\nend"; + let idx = LineColumnIndex::new(src); + // byte_col 3 (start of Γ©) β†’ prefix "caf" = 3 UTF-16 units β†’ col 4 + assert_eq!(idx.byte_col_to_utf16_col(0, 3), 4); + // byte_col 5 (past Γ©, 3 + 2 bytes) β†’ prefix "cafΓ©" = 4 UTF-16 units β†’ col 5 + assert_eq!(idx.byte_col_to_utf16_col(0, 5), 5); + // line 1 is ASCII + assert_eq!(idx.byte_col_to_utf16_col(1, 3), 4); + } + + #[test] + fn lci_supplementary_plane_emoji() { + // 'πŸš€' is U+1F680: 4 UTF-8 bytes, 2 UTF-16 code units (surrogate pair). + // Source: "xπŸš€y" + // byte positions: x(0) πŸš€(1,2,3,4) y(5) + let src = "x\u{1F680}y"; + let idx = LineColumnIndex::new(src); + // byte_col 0 β†’ col 1 (before 'x') + assert_eq!(idx.byte_col_to_utf16_col(0, 0), 1); + // byte_col 1 β†’ prefix = "x" β†’ 1 UTF-16 unit β†’ col 2 + assert_eq!(idx.byte_col_to_utf16_col(0, 1), 2); + // byte_col 5 β†’ prefix = "xπŸš€" β†’ x(1) + πŸš€(2) = 3 UTF-16 units β†’ col 4 + assert_eq!(idx.byte_col_to_utf16_col(0, 5), 4); + // byte_col 6 β†’ prefix = "xπŸš€y" β†’ 3+1 = 4 UTF-16 units β†’ col 5 + assert_eq!(idx.byte_col_to_utf16_col(0, 6), 5); + } + + #[test] + fn lci_cjk() { + // 'ζ—₯' is U+65E5: 3 UTF-8 bytes, 1 UTF-16 code unit. + // Source: "ζ—₯本" + // bytes: ζ—₯(0,1,2) 本(3,4,5) + let src = "\u{65E5}\u{672C}"; // ζ—₯本 + let idx = LineColumnIndex::new(src); + // byte_col 0 β†’ col 1 + assert_eq!(idx.byte_col_to_utf16_col(0, 0), 1); + // byte_col 3 β†’ prefix = "ζ—₯" β†’ 1 UTF-16 unit β†’ col 2 + assert_eq!(idx.byte_col_to_utf16_col(0, 3), 2); + // byte_col 6 β†’ prefix = "ζ—₯本" β†’ 2 UTF-16 units β†’ col 3 + assert_eq!(idx.byte_col_to_utf16_col(0, 6), 3); + } + + #[test] + fn lci_combining_mark() { + // 'e' + U+0301 (combining acute accent): 2 codepoints, 2 UTF-16 code units, 1 grapheme. + // Source: "e\u{0301}x" + // bytes: e(0) \u{0301}(1,2) x(3) + let src = "e\u{0301}x"; + let idx = LineColumnIndex::new(src); + // byte_col 0 β†’ col 1 + assert_eq!(idx.byte_col_to_utf16_col(0, 0), 1); + // byte_col 1 β†’ prefix = "e" β†’ 1 UTF-16 unit β†’ col 2 + assert_eq!(idx.byte_col_to_utf16_col(0, 1), 2); + // byte_col 3 β†’ prefix = "e\u{0301}" β†’ 2 UTF-16 units β†’ col 3 + assert_eq!(idx.byte_col_to_utf16_col(0, 3), 3); + // byte_col 4 β†’ prefix = "e\u{0301}x" β†’ 3 UTF-16 units β†’ col 4 + assert_eq!(idx.byte_col_to_utf16_col(0, 4), 4); + } + + #[test] + fn lci_crlf_line_endings() { + // CRLF: tree-sitter counts \n as the line terminator and the col includes \r. + // Source: "ab\r\ncd" + // Line 0 bytes: a(0) b(1) \r(2) \n(3) β†’ line_start[1] = 4 + // Line 1 bytes: c(0 rel) d(1 rel) + let src = "ab\r\ncd"; + let idx = LineColumnIndex::new(src); + // Line 0 + assert_eq!(idx.byte_col_to_utf16_col(0, 0), 1); + assert_eq!(idx.byte_col_to_utf16_col(0, 2), 3); // past "ab" β†’ 2+1=3 + // Line 1 + assert_eq!(idx.byte_col_to_utf16_col(1, 0), 1); + assert_eq!(idx.byte_col_to_utf16_col(1, 2), 3); + } + + #[test] + fn lci_eol_boundary() { + // byte_col == 0 on the start of any line β†’ col 1. + let src = "abc\ndef\nghi"; + let idx = LineColumnIndex::new(src); + assert_eq!(idx.byte_col_to_utf16_col(0, 0), 1); + assert_eq!(idx.byte_col_to_utf16_col(1, 0), 1); + assert_eq!(idx.byte_col_to_utf16_col(2, 0), 1); + // byte_col at end of "abc" (3 bytes) β†’ col 4 + assert_eq!(idx.byte_col_to_utf16_col(0, 3), 4); + } + + #[test] + fn lci_empty_line() { + // An empty line: byte_col 0 β†’ col 1. + let src = "\n\nfoo"; + let idx = LineColumnIndex::new(src); + assert_eq!(idx.byte_col_to_utf16_col(0, 0), 1); + assert_eq!(idx.byte_col_to_utf16_col(1, 0), 1); + assert_eq!(idx.byte_col_to_utf16_col(2, 0), 1); + } } diff --git a/crates/static-analysis-kernel/src/analysis/ddsa_lib/bridge/query_match.rs b/crates/static-analysis-kernel/src/analysis/ddsa_lib/bridge/query_match.rs index c41e1e236..24054f3e7 100644 --- a/crates/static-analysis-kernel/src/analysis/ddsa_lib/bridge/query_match.rs +++ b/crates/static-analysis-kernel/src/analysis/ddsa_lib/bridge/query_match.rs @@ -7,6 +7,7 @@ use crate::analysis::ddsa_lib::common::{Class, DDSAJsRuntimeError, NodeId, Stell use crate::analysis::ddsa_lib::js; use crate::analysis::ddsa_lib::v8_ds::MirroredVec; use crate::analysis::tree_sitter::QueryMatch; +use common::utils::position_utils::LineColumnIndex; use deno_core::v8; use deno_core::v8::HandleScope; @@ -33,6 +34,10 @@ impl QueryMatchBridge { /// Sets the bridge's data to the list of [`QueryMatch`]es, inserting tree-sitter nodes /// into the provided `TsNodeBridge`. /// + /// `idx` must be built from the same source string that produced the tree-sitter nodes in + /// `matches`; it is forwarded to [`TsNodeBridge::insert_capture`] so that `col` values + /// stored in v8 are 1-based UTF-16 code units. + /// /// NOTE: if the bridge had existing `QueryMatch`es, the tree-sitter nodes associated with them /// will not be removed from the `TsNodeBridge`. pub fn set_data<'tree>( @@ -40,6 +45,7 @@ impl QueryMatchBridge { scope: &mut HandleScope, matches: impl Into>>>, node_bridge: &mut TsNodeBridge, + idx: &LineColumnIndex<'_>, ) { let matches = matches.into(); // Pass each node in via the bridge (assigning it an id), and use this id to transform @@ -49,7 +55,7 @@ impl QueryMatchBridge { .map(|q_match| { q_match .into_iter() - .map(|capture| node_bridge.insert_capture(scope, capture)) + .map(|capture| node_bridge.insert_capture(scope, capture, idx)) .collect::>() .into() }) @@ -89,6 +95,7 @@ mod tests { use crate::analysis::ddsa_lib::v8_ds::MirroredVec; use crate::analysis::tree_sitter::{get_tree, QueryMatch, TSCaptureContent, TSQuery}; use crate::model::common::Language; + use common::utils::position_utils::LineColumnIndex; use deno_core::JsRuntime; fn setup_bridge() -> (JsRuntime, QueryMatchBridge, TsNodeBridge) { @@ -125,9 +132,10 @@ const ghi = 'hello' + ' world'; "; let query = TSQuery::try_new(&tree.language(), query).unwrap(); let matches = query.cursor().matches(tree.root_node(), text, None); + let idx = LineColumnIndex::new(text); assert!(query_match_bridge.is_empty()); assert!(ts_node_bridge.is_empty()); - query_match_bridge.set_data(scope, matches.clone(), &mut ts_node_bridge); + query_match_bridge.set_data(scope, matches.clone(), &mut ts_node_bridge, &idx); assert_eq!(query_match_bridge.len(), 3); assert_eq!(ts_node_bridge.len(), 3); @@ -139,16 +147,17 @@ const ghi = 'hello' + ' world'; } // The `QueryMatchBridge` doesn't clear nodes from `TsNodeBridge` when values change. - query_match_bridge.set_data(scope, &matches[0..2], &mut ts_node_bridge); + query_match_bridge.set_data(scope, &matches[0..2], &mut ts_node_bridge, &idx); assert_eq!(query_match_bridge.len(), 2); assert_eq!(ts_node_bridge.len(), 3); - let text = "\ + let text2 = "\ // Arbitrary JavaScript that contains `identifier` CST nodes const alpha = 'bravo'; "; - let tree = get_tree(text, &Language::JavaScript).unwrap(); - let matches = query.cursor().matches(tree.root_node(), text, None); - query_match_bridge.set_data(scope, matches, &mut ts_node_bridge); + let tree2 = get_tree(text2, &Language::JavaScript).unwrap(); + let matches2 = query.cursor().matches(tree2.root_node(), text2, None); + let idx2 = LineColumnIndex::new(text2); + query_match_bridge.set_data(scope, matches2, &mut ts_node_bridge, &idx2); assert_eq!(get_node_id_at_idx(&query_match_bridge, 0), 3); assert_eq!(ts_node_bridge.len(), 4); } diff --git a/crates/static-analysis-kernel/src/analysis/ddsa_lib/bridge/ts_node.rs b/crates/static-analysis-kernel/src/analysis/ddsa_lib/bridge/ts_node.rs index d2740856d..2f17fa07b 100644 --- a/crates/static-analysis-kernel/src/analysis/ddsa_lib/bridge/ts_node.rs +++ b/crates/static-analysis-kernel/src/analysis/ddsa_lib/bridge/ts_node.rs @@ -7,6 +7,7 @@ use crate::analysis::ddsa_lib::js::TreeSitterNode; use crate::analysis::ddsa_lib::v8_ds::MirroredIndexMap; use crate::analysis::ddsa_lib::{js, RawTSNode}; use crate::analysis::tree_sitter::{TSCaptureContent, TSQueryCapture}; +use common::utils::position_utils::LineColumnIndex; use deno_core::v8; use deno_core::v8::HandleScope; @@ -39,7 +40,16 @@ impl TsNodeBridge { /// Inserts a tree-sitter node into the bridge, returning the `NodeId` it was assigned. /// /// If the node already existed in the bridge, the existing `NodeId` will be returned. - pub fn insert(&mut self, scope: &mut HandleScope, node: tree_sitter::Node) -> NodeId { + /// + /// `idx` must be built from the same source string that produced `node`; it is forwarded to + /// [`TreeSitterNode::from_ts_node_with_index`] so that `col` values are 1-based UTF-16 + /// code units. + pub fn insert( + &mut self, + scope: &mut HandleScope, + node: tree_sitter::Node, + idx: &LineColumnIndex<'_>, + ) -> NodeId { let raw_ts_node = RawTSNode::new(node); if let Some((_, _, node_id)) = self.mirrored_im.get_full(&raw_ts_node) { *node_id @@ -48,7 +58,7 @@ impl TsNodeBridge { // to the map's current length. Because we use this as the `NodeId`, we can pre-assign // this, knowing this invariant (index == NodeId) will hold. let node_id = self.mirrored_im.len() as NodeId; - let v8_node = self.build_v8_node(scope, node, node_id); + let v8_node = self.build_v8_node(scope, node, node_id, idx); self.mirrored_im .insert_with(scope, raw_ts_node, node_id, |scope, _key, value| { // In Rust, we map from `tree_sitter::Node` to `NodeId`. @@ -76,14 +86,15 @@ impl TsNodeBridge { .map(|(raw_node, _)| raw_node) } - /// Serializes a [`tree_sitter::Node`] to v8. + /// Serializes a [`tree_sitter::Node`] to v8 using UTF-16 column values from `idx`. fn build_v8_node<'s>( &self, scope: &mut HandleScope<'s>, node: tree_sitter::Node, id: NodeId, + idx: &LineColumnIndex<'_>, ) -> v8::Local<'s, v8::Object> { - let ts_node = TreeSitterNode::::from_ts_node(id, node); + let ts_node = TreeSitterNode::::from_ts_node_with_index(id, node, idx); self.js_class.new_instance(scope, ts_node) } @@ -108,24 +119,28 @@ impl TsNodeBridge { self.mirrored_im.as_local(scope) } - /// Inserts the nodes from a [`TSQueryCapture`] into a v8 scope, consuming the `QueryCapture`. - /// Returns a transformed `QueryCapture` containing the ids of the inserted nodes. + /// Inserts the nodes from a [`TSQueryCapture`] into a v8 scope, consuming + /// the `QueryCapture`. Returns a transformed `QueryCapture` containing the ids of the inserted + /// nodes. + /// + /// `idx` is forwarded to [`insert`](Self::insert) for UTF-16 column conversion. pub fn insert_capture( &mut self, scope: &mut HandleScope, capture: TSQueryCapture, + idx: &LineColumnIndex<'_>, ) -> TSQueryCapture { TSQueryCapture:: { name: capture.name, contents: match capture.contents { TSCaptureContent::Single(node) => { - let nid = self.insert(scope, node); + let nid = self.insert(scope, node, idx); TSCaptureContent::Single(nid) } TSCaptureContent::Multi(nodes) => { let nids = nodes .into_iter() - .map(|node| self.insert(scope, node)) + .map(|node| self.insert(scope, node, idx)) .collect::>(); TSCaptureContent::Multi(nids) } @@ -158,6 +173,7 @@ mod tests { }; use crate::analysis::ddsa_lib::RawTSNode; use crate::model::common::Language; + use common::utils::position_utils::LineColumnIndex; use deno_core::v8; use deno_core::v8::HandleScope; use std::cell::RefCell; @@ -205,22 +221,24 @@ mod tests { let scope = &mut runtime.handle_scope(); let mut bridge = bridge.borrow_mut(); - let tree = TsTree::new(r#"const val = foo(bar, baz);"#, Language::JavaScript); + let source = r#"const val = foo(bar, baz);"#; + let tree = TsTree::new(source, Language::JavaScript); + let idx = LineColumnIndex::new(source); let foo = tree.find_named_nodes(Some("foo"), None)[0]; let bar = tree.find_named_nodes(Some("bar"), None)[0]; let baz = tree.find_named_nodes(Some("baz"), None)[0]; assert!(bridge.v8_get(scope, 0).is_none()); - assert_eq!(bridge.insert(scope, foo), 0); + assert_eq!(bridge.insert(scope, foo, &idx), 0); let v8_tsn = bridge.v8_get(scope, 0).unwrap(); assert!(ts_node_eq(scope, v8_tsn, foo)); assert!(bridge.v8_get(scope, 1).is_none()); - assert_eq!(bridge.insert(scope, bar), 1); - assert_eq!(bridge.insert(scope, baz), 2); + assert_eq!(bridge.insert(scope, bar, &idx), 1); + assert_eq!(bridge.insert(scope, baz, &idx), 2); bridge.clear(scope); assert!(bridge.is_empty()); - assert_eq!(bridge.insert(scope, baz), 0); + assert_eq!(bridge.insert(scope, baz, &idx), 0); let v8_tsn = bridge.v8_get(scope, 0).unwrap(); assert!(ts_node_eq(scope, v8_tsn, baz)); } @@ -232,13 +250,15 @@ mod tests { let scope = &mut runtime.handle_scope(); let mut bridge = bridge.borrow_mut(); - let tree = TsTree::new(r#"const val = foo(bar, baz);"#, Language::JavaScript); + let source = r#"const val = foo(bar, baz);"#; + let tree = TsTree::new(source, Language::JavaScript); + let idx = LineColumnIndex::new(source); let foo = tree.find_named_nodes(Some("foo"), None)[0]; let bar = tree.find_named_nodes(Some("bar"), None)[0]; let baz = tree.find_named_nodes(Some("baz"), None)[0]; for node in [foo, bar, baz] { - bridge.insert(scope, node); + bridge.insert(scope, node, &idx); } let bar_raw = RawTSNode::new(bar); @@ -252,16 +272,50 @@ mod tests { let scope = &mut runtime.handle_scope(); let mut bridge = bridge.borrow_mut(); - let tree = TsTree::new(r#"const val = foo(bar, baz);"#, Language::JavaScript); + let source = r#"const val = foo(bar, baz);"#; + let tree = TsTree::new(source, Language::JavaScript); + let idx = LineColumnIndex::new(source); let foo = tree.find_named_nodes(Some("foo"), None)[0]; - assert_eq!(bridge.insert(scope, foo), 0); + assert_eq!(bridge.insert(scope, foo, &idx), 0); assert!(bridge.v8_get(scope, 0).is_some()); assert!(bridge.v8_get(scope, 1).is_none()); - assert_eq!(bridge.insert(scope, foo), 0); + assert_eq!(bridge.insert(scope, foo, &idx), 0); assert!(bridge.v8_get(scope, 1).is_none()); } + /// Inserting a node from a multibyte source produces UTF-16 columns in the v8 object. + /// + /// Source: `"\u{1F680}"; abc = 1` β€” πŸš€ is 4 UTF-8 bytes / 2 UTF-16 code units inside a string. + /// `abc` starts at byte 8 (after `"πŸš€"; `), and its UTF-16 col must account for the emoji. + /// + /// Byte layout: `"` `πŸš€`(4 bytes) `"` `;` ` ` = 8 bytes β†’ `abc` at byte 8. + /// UTF-16 prefix: `"` + πŸš€(2 units) + `"` + `;` + ` ` = 6 units β†’ col 7. + #[test] + fn ts_node_bridge_multibyte_utf16_col() { + let (mut runtime, bridge) = setup_bridge(); + let scope = &mut runtime.handle_scope(); + let mut bridge = bridge.borrow_mut(); + + // Parse `"\u{1F680}"; abc = 1` as JavaScript. + let source = "\"\u{1F680}\"; abc = 1"; + let tree = TsTree::new(source, Language::JavaScript); + let idx = LineColumnIndex::new(source); + let abc = tree.find_named_nodes(Some("abc"), Some("identifier"))[0]; + // "πŸš€" (6 bytes: quote+4+quote) + "; " (2 bytes) = 8 bytes before `abc`. + assert_eq!(abc.start_position().column, 8, "byte col of `abc` is 8"); + + bridge.insert(scope, abc, &idx); + let v8_obj = bridge.v8_get(scope, 0).unwrap(); + + // Read `_startCol` from the v8 object. + let start_col = get_field::(v8_obj, "_startCol", scope, "integer") + .unwrap() + .value() as u32; + // UTF-16 units before abc: `"` (1) + πŸš€ (2 surrogate units) + `"` (1) + `;` (1) + ` ` (1) = 6 β†’ col 7. + assert_eq!(start_col, 7, "UTF-16 col of `abc` after πŸš€ should be 7"); + } + /// The text that the node spans can be retrieved. #[test] fn get_node_text() { @@ -271,6 +325,7 @@ const abc = 123; const def = 456; "; let tree = TsTree::new(file_contents, Language::JavaScript); + let idx = LineColumnIndex::new(file_contents); let file_contents = Arc::::from(file_contents); let file_name = Arc::::from("file_name.js"); @@ -284,8 +339,8 @@ const def = 456; .set_root_context(scope, &tree.tree(), &file_contents, &file_name); let node_0 = tree.find_named_nodes(Some("abc"), Some("identifier"))[0]; let node_1 = tree.find_named_nodes(Some("456"), Some("number"))[0]; - ts_node_bridge.borrow_mut().insert(scope, node_0); - ts_node_bridge.borrow_mut().insert(scope, node_1); + ts_node_bridge.borrow_mut().insert(scope, node_0, &idx); + ts_node_bridge.borrow_mut().insert(scope, node_1, &idx); let expected = [(0, "abc"), (1, "456")] .map(|(nid, text)| (format!("TS_NODES.get({}).text;", nid), text)); for (code, text) in &expected { @@ -311,6 +366,7 @@ const def = 456; let (mut runtime, ts_node_bridge) = setup_bridge(); let file_contents = "const abc = 123;"; let tree = TsTree::new(file_contents, Language::JavaScript); + let idx = LineColumnIndex::new(file_contents); let file_contents = Arc::::from(file_contents); let file_name = Arc::::from("file_name.js"); @@ -323,7 +379,7 @@ const def = 456; .borrow_mut() .set_root_context(scope, &tree.tree(), &file_contents, &file_name); let node_0 = tree.find_named_nodes(Some("abc"), Some("identifier"))[0]; - ts_node_bridge.borrow_mut().insert(scope, node_0); + ts_node_bridge.borrow_mut().insert(scope, node_0, &idx); let code = "\ const node = TS_NODES.get(0); diff --git a/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/flow/graph.rs b/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/flow/graph.rs index 06926b951..82fce1ee3 100644 --- a/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/flow/graph.rs +++ b/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/flow/graph.rs @@ -5,6 +5,7 @@ use crate::analysis::ddsa_lib::bridge::TsNodeBridge; use crate::analysis::ddsa_lib::common::{swallow_v8_error, v8_uint}; use crate::analysis::ddsa_lib::test_utils::TsTree; +use common::utils::position_utils::LineColumnIndex; use deno_core::v8; use graphviz_rust::dot_structures; use graphviz_rust::dot_structures::{Attribute, Stmt}; @@ -621,11 +622,17 @@ impl From> for dot_structures::Edge { impl<'a> LocatedNode<'a> { /// Constructs a new `LocatedNode` from a tree-sitter node. - pub fn new_cst(node: tree_sitter::Node, text: &'a str) -> LocatedNode<'a> { + /// + /// `source` is the full source text that was parsed to produce `node`; it is used via + /// [`LineColumnIndex`] to emit a 1-based UTF-16 code-unit column consistent with + /// [`TreeSitterNode::from_ts_node_with_index`]. + pub fn new_cst(node: tree_sitter::Node, text: &'a str, source: &str) -> LocatedNode<'a> { + let idx = LineColumnIndex::new(source); + let start = node.start_position(); Self::Cst { text, - line: node.start_position().row + 1, - col: node.start_position().column + 1, + line: start.row + 1, + col: idx.byte_col_to_utf16_col(start.row, start.column) as usize, cst_kind: node.kind(), } } @@ -1195,4 +1202,62 @@ strict digraph TestPhi { let expected = graphviz_rust::parse(expected).unwrap(); assert_eq!(collection.to_digraph(), expected); } + + /// **Regression-prevention test**: `LocatedNode::new_cst` emits a 1-based UTF-16 column that + /// differs from the raw `tree_sitter::Point.column + 1` value for multibyte input. + /// + /// If a future change reintroduces a raw `tree_sitter::Point.column + 1` read in + /// `LocatedNode::new_cst`, this test will fail on the multibyte assertion. + #[test] + fn located_node_multibyte_col_is_utf16() { + use crate::analysis::ddsa_lib::js::flow::graph::LocatedNode; + use crate::analysis::ddsa_lib::test_utils::TsTree; + use crate::model::common::Language; + use common::utils::position_utils::LineColumnIndex; + + // Source: "πŸš€var x = 1;" + // πŸš€ = 4 UTF-8 bytes, 2 UTF-16 code units. + // `x` identifier starts at byte 8 (4 for πŸš€, 4 for "var "). + // UTF-16 prefix: πŸš€(2) + v(1) + a(1) + r(1) + ' '(1) = 6 units β†’ col 7. + // Raw byte col of `x` = 8 β†’ raw byte col + 1 = 9. + // (These differ because πŸš€ = 4 bytes but 2 UTF-16 units.) + let source = "\u{1F680}var x = 1;"; + let tree = TsTree::new(source, Language::JavaScript); + + // Find the `x` identifier node. + let x_node = tree.find_named_nodes(Some("x"), Some("identifier"))[0]; + let raw_byte_col_plus_one = x_node.start_position().column + 1; + + // LocatedNode::new_cst must produce a UTF-16 col β‰  raw byte col + 1. + let located = LocatedNode::new_cst(x_node, tree.text(x_node), source); + let located_col = match located { + LocatedNode::Cst { col, .. } => col, + _ => panic!("expected Cst variant"), + }; + + // UTF-16 col should be smaller than raw byte col on multibyte input. + assert_ne!( + located_col, raw_byte_col_plus_one, + "LocatedNode col should be UTF-16, not raw byte col + 1 (raw={raw_byte_col_plus_one})" + ); + // Sanity check: for this specific source, UTF-16 col must be 7. + // prefix "πŸš€var " = 2+1+1+1+1 = 6 UTF-16 units β†’ col 7 + assert_eq!(located_col, 7, "UTF-16 col of `x` after πŸš€ should be 7"); + + // Additionally, verify that for an ASCII source the values agree (regression guard). + let ascii_source = "var x = 1;"; + let ascii_tree = TsTree::new(ascii_source, Language::JavaScript); + let ascii_x = ascii_tree.find_named_nodes(Some("x"), Some("identifier"))[0]; + let ascii_raw = ascii_x.start_position().column + 1; + let ascii_located = LocatedNode::new_cst(ascii_x, ascii_tree.text(ascii_x), ascii_source); + let ascii_col = match ascii_located { + LocatedNode::Cst { col, .. } => col, + _ => panic!("expected Cst variant"), + }; + assert_eq!( + ascii_col, ascii_raw, + "ASCII: UTF-16 col should equal raw byte col + 1" + ); + let _ = LineColumnIndex::new(ascii_source); // ensure import is used + } } diff --git a/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/flow/graph_test_utils.rs b/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/flow/graph_test_utils.rs index a107790f6..7eb613f8f 100644 --- a/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/flow/graph_test_utils.rs +++ b/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/flow/graph_test_utils.rs @@ -81,7 +81,7 @@ pub fn cst_dot_digraph( let tree = ts_tree.tree(); let candidates = TsTree::preorder_nodes(root_node.unwrap_or(tree.root_node())) .iter() - .map(|&node| LocatedNode::new_cst(node, ts_tree.text(node))) + .map(|&node| LocatedNode::new_cst(node, ts_tree.text(node), ts_tree.source())) .collect::>(); // The `String` in the tuple is the original ID of the vertex (as specified in the DOT). diff --git a/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/flow/java.rs b/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/flow/java.rs index 25d103d2b..8761aad60 100644 --- a/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/flow/java.rs +++ b/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/flow/java.rs @@ -101,6 +101,7 @@ mod tests { use crate::model::common::Language; use crate::model::rule::{RuleCategory, RuleSeverity}; use common::model::position; + use common::utils::position_utils::LineColumnIndex; use deno_core::v8; use std::sync::Arc; @@ -118,13 +119,14 @@ mod tests { pub(crate) fn setup(source_text: &str) -> (JsRuntime, TsTree) { let mut rt = cfg_test_v8().new_runtime(); let tree = TsTree::new(source_text, Language::Java); - let source_text = Arc::::from(source_text); + let idx = LineColumnIndex::new(source_text); + let source_arc = Arc::::from(source_text); let filename = Arc::::from("test_doesnt_use_filename"); - rt.bridge_context().borrow_mut().set_root_context(&mut rt.v8_handle_scope(), &tree.tree(), &source_text, &filename); + rt.bridge_context().borrow_mut().set_root_context(&mut rt.v8_handle_scope(), &tree.tree(), &source_arc, &filename); let tsn_bridge = rt.bridge_ts_node(); let mut tsn_bridge = tsn_bridge.borrow_mut(); for node in tree.find_named_nodes(None, None) { - tsn_bridge.insert(&mut rt.v8_handle_scope(), node); + tsn_bridge.insert(&mut rt.v8_handle_scope(), node, &idx); } (rt, tree) @@ -413,11 +415,18 @@ serialized; #[test] fn violation_taint_flow_regions() { let v_converter = ViolationConverter::new(); - fn position_eq(region: js::CodeRegion, node: tree_sitter::Node) -> bool { - region.start_line == (node.start_position().row as u32) + 1 - && region.start_col == (node.start_position().column as u32) + 1 - && region.end_line == (node.end_position().row as u32) + 1 - && region.end_col == (node.end_position().column as u32) + 1 + fn position_eq( + region: js::CodeRegion, + node: tree_sitter::Node, + source: &str, + ) -> bool { + let idx = LineColumnIndex::new(source); + let start = node.start_position(); + let end = node.end_position(); + region.start_line == (start.row as u32) + 1 + && region.start_col == idx.byte_col_to_utf16_col(start.row, start.column) + && region.end_line == (end.row as u32) + 1 + && region.end_col == idx.byte_col_to_utf16_col(end.row, end.column) } // language=java @@ -465,7 +474,7 @@ v; ) .unwrap(); - assert!(position_eq(violation.base_region, expected_flow[0])); + assert!(position_eq(violation.base_region, expected_flow[0], code)); let taint_flow_regions = violation.taint_flow_regions.unwrap(); assert_eq!(taint_flow_regions.len(), expected_flow.len()); @@ -473,7 +482,7 @@ v; .iter() .zip(expected_flow) .for_each(|(®ion, node)| { - assert!(position_eq(region, node)); + assert!(position_eq(region, node, code)); }); } diff --git a/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/ts_node.rs b/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/ts_node.rs index 273133e44..b27858df8 100644 --- a/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/ts_node.rs +++ b/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/ts_node.rs @@ -5,6 +5,7 @@ use crate::analysis::ddsa_lib::common::{ load_function, swallow_v8_error, v8_uint, Class, DDSAJsRuntimeError, Instance, NodeId, }; +use common::utils::position_utils::LineColumnIndex; use deno_core::v8; use deno_core::v8::HandleScope; use std::marker::PhantomData; @@ -24,26 +25,32 @@ pub struct TreeSitterNode { } impl TreeSitterNode { - /// Converts the provided [`tree_sitter::Node`] into a `TreeSitterNode`, assigning the provided id. - pub fn from_ts_node(id: NodeId, node: tree_sitter::Node) -> Self { - // NOTE: We normalize the 0-based `tree_sitter::Point` to be 1-based. - fn normalize_ts_point_num(num: usize) -> u32 { - num as u32 + 1 - } + /// Converts the provided [`tree_sitter::Node`] into a `TreeSitterNode`, assigning the + /// provided id. + /// + /// Columns are produced as **1-based UTF-16 code-unit** values using `idx`, which must be + /// built from the same source string that was parsed to obtain `node`. Use this constructor + /// everywhere the source string is available β€” it is the single source of truth for + /// `Position.col` semantics. + pub fn from_ts_node_with_index( + id: NodeId, + node: tree_sitter::Node, + idx: &LineColumnIndex<'_>, + ) -> Self { let tree_sitter::Point { - row: start_line, - column: start_col, + row: start_row, + column: start_byte_col, } = node.start_position(); let tree_sitter::Point { - row: end_line, - column: end_col, + row: end_row, + column: end_byte_col, } = node.end_position(); Self { id, - start_line: normalize_ts_point_num(start_line), - start_col: normalize_ts_point_num(start_col), - end_line: normalize_ts_point_num(end_line), - end_col: normalize_ts_point_num(end_col), + start_line: (start_row as u32) + 1, + start_col: idx.byte_col_to_utf16_col(start_row, start_byte_col), + end_line: (end_row as u32) + 1, + end_col: idx.byte_col_to_utf16_col(end_row, end_byte_col), node_type_id: node.kind_id(), _pd: PhantomData, } @@ -104,6 +111,7 @@ mod tests { }; use crate::analysis::tree_sitter::get_tree_sitter_language; use crate::model::common::Language; + use common::utils::position_utils::LineColumnIndex; use deno_core::v8; use std::marker::PhantomData; use tree_sitter::StreamingIterator; @@ -152,7 +160,8 @@ mod tests { let mut parser = tree_sitter::Parser::new(); parser.set_language(&lang).unwrap(); - let tree = parser.parse(r#"foo(bar, baz);"#, None).unwrap(); + let source = r#"foo(bar, baz);"#; + let tree = parser.parse(source, None).unwrap(); let root = tree.root_node(); assert_eq!(root.start_position().row, 0); @@ -160,7 +169,8 @@ mod tests { assert_eq!(root.end_position().row, 0); assert_eq!(root.end_position().column, 14); - let tree_sitter_node = TreeSitterNode::::from_ts_node(0, root); + let idx = LineColumnIndex::new(source); + let tree_sitter_node = TreeSitterNode::::from_ts_node_with_index(0, root, &idx); assert_eq!(tree_sitter_node.start_line, 1); assert_eq!(tree_sitter_node.start_col, 1); @@ -168,6 +178,66 @@ mod tests { assert_eq!(tree_sitter_node.end_col, 15); } + /// Mirrors [`ts_node_line_col_one_based`] but with multibyte content. + /// + /// Source: `"\u{1F680}"; foo(bar);` + /// The emoji lives in a string literal so `foo` is a distinct identifier. + /// Bytes before `foo`: `"` (1) + πŸš€ (4) + `"` (1) + `;` (1) + ` ` (1) = 8 bytes. + /// UTF-16 prefix: `"` (1) + πŸš€ (2) + `"` (1) + `;` (1) + ` ` (1) = 6 units β†’ col 7. + #[test] + fn ts_node_line_col_multibyte() { + let lang = get_tree_sitter_language(&Language::JavaScript); + let mut parser = tree_sitter::Parser::new(); + parser.set_language(&lang).unwrap(); + + // Source: `"\u{1F680}"; foo(bar);` + // πŸš€ is placed inside a string literal so `foo` is clearly a separate identifier. + let source = "\"\u{1F680}\"; foo(bar);"; + let tree = parser.parse(source, None).unwrap(); + + // The second statement is `foo(bar);` + let root = tree.root_node(); + let call_node = root + .child(1) // second expression_statement + .unwrap() + .child(0) // call_expression + .unwrap(); + + // `foo` starts at byte 8 ("πŸš€" = 6 bytes + "; " = 2 bytes). + let fn_node = call_node.child_by_field_name("function").unwrap(); + assert_eq!(fn_node.start_position().column, 8, "byte col of `foo` is 8"); + + let idx = LineColumnIndex::new(source); + let ts_node = TreeSitterNode::::from_ts_node_with_index(0, fn_node, &idx); + assert_eq!(ts_node.start_line, 1); + // UTF-16 col 7: " (1) + πŸš€ surrogate pair (2) + " (1) + ; (1) + space (1) = 6 β†’ col 7. + assert_eq!( + ts_node.start_col, 7, + "UTF-16 col of `foo` should be 7 (emoji as string literal)" + ); + + // Also verify with a CJK prefix: `"ζ—₯"; foo(bar);` + // ζ—₯ = 3 UTF-8 bytes, 1 UTF-16 unit. + // Bytes before foo: " (1) + ζ—₯ (3) + " (1) + ; (1) + space (1) = 7 bytes. + // UTF-16: " (1) + ζ—₯ (1) + " (1) + ; (1) + space (1) = 5 units β†’ col 6. + let source_cjk = "\"\u{65E5}\"; foo(bar);"; + let tree_cjk = parser.parse(source_cjk, None).unwrap(); + let call_cjk = tree_cjk.root_node().child(1).unwrap().child(0).unwrap(); + let fn_cjk = call_cjk.child_by_field_name("function").unwrap(); + assert_eq!( + fn_cjk.start_position().column, + 7, + "byte col of `foo` after ζ—₯ is 7" + ); + let idx_cjk = LineColumnIndex::new(source_cjk); + let ts_node_cjk = TreeSitterNode::::from_ts_node_with_index(1, fn_cjk, &idx_cjk); + // UTF-16 col 6: " (1) + ζ—₯ (1) + " (1) + ; (1) + space (1) = 5 β†’ col 6. + assert_eq!( + ts_node_cjk.start_col, 6, + "UTF-16 col of `foo` after ζ—₯ should be 6" + ); + } + /// Tests that we use the [`tree_sitter::Node::kind_id`] instead of the [`tree_sitter::Node::grammar_id`], which /// has subtle differences. #[rustfmt::skip] @@ -188,7 +258,8 @@ mod tests { // This should resolve to `assert_ne!("identifier", "property_identifier")` assert_ne!(ts_node.grammar_name(), ts_node.kind()); - let tree_sitter_node = TreeSitterNode::::from_ts_node(0, ts_node); + let idx = LineColumnIndex::new(text); + let tree_sitter_node = TreeSitterNode::::from_ts_node_with_index(0, ts_node, &idx); assert_eq!(tree_sitter_node.node_type_id, ts_node.kind_id()); } diff --git a/crates/static-analysis-kernel/src/analysis/ddsa_lib/ops.rs b/crates/static-analysis-kernel/src/analysis/ddsa_lib/ops.rs index 3fe23047c..be4de85f3 100644 --- a/crates/static-analysis-kernel/src/analysis/ddsa_lib/ops.rs +++ b/crates/static-analysis-kernel/src/analysis/ddsa_lib/ops.rs @@ -5,6 +5,7 @@ use crate::analysis::ddsa_lib; use crate::analysis::ddsa_lib::common::{v8_uint, NodeId}; use crate::analysis::ddsa_lib::{bridge, runtime, RawTSNode}; +use common::utils::position_utils::LineColumnIndex; use deno_core::{op2, v8, OpState}; use std::cell::RefCell; use std::rc::Rc; @@ -104,6 +105,12 @@ pub fn op_ts_node_named_children<'s>( None } else { let array = v8::Array::new(scope, count as i32); + // ctx_bridge is only needed in this branch. Build the index while ctx_borrow is alive; + // ts_node_bridge and ctx_bridge are separate RefCells so the concurrent borrows are valid. + let ctx_bridge = state.borrow::>>(); + let ctx_borrow = ctx_bridge.borrow(); + let source = ctx_borrow.ddsa_root_context().get_text().unwrap_or(""); + let idx = LineColumnIndex::new(source); let mut bridge_ref = ts_node_bridge.borrow_mut(); let mut cursor = ts_node.walk(); @@ -119,7 +126,7 @@ pub fn op_ts_node_named_children<'s>( } let child_node = cursor.node(); - let nid = bridge_ref.insert(scope, child_node); + let nid = bridge_ref.insert(scope, child_node, &idx); let nid = v8_uint(scope, nid); let nid_index = i * 2; array.set_index(scope, nid_index as u32, nid.into()); @@ -152,14 +159,16 @@ pub fn op_ts_node_parent( let safe_raw_ts_node = OpSafeRawTSNode::from_tsn_bridge(&ts_node_bridge.borrow(), node_id)?; let ts_node = safe_raw_ts_node.to_node(); - let ctx_bridge = ctx_bridge.borrow_mut(); - let root_ctx = ctx_bridge.ddsa_root_context(); + let ctx_borrow = ctx_bridge.borrow(); + let root_ctx = ctx_borrow.ddsa_root_context(); let safe_raw_parent = OpSafeRawTSNode::from_root_context(root_ctx, |ctx| ctx.get_ts_node_parent(ts_node))?; let parent_ts_node = safe_raw_parent.to_node(); + let source = root_ctx.get_text().unwrap_or(""); + let idx = LineColumnIndex::new(source); let mut bridge_ref = ts_node_bridge.borrow_mut(); - let nid = bridge_ref.insert(scope, parent_ts_node); + let nid = bridge_ref.insert(scope, parent_ts_node, &idx); Some(nid) } @@ -269,7 +278,7 @@ pub fn op_digraph_adjacency_list_to_dot( let node_text = ts_node .utf8_text(text.as_bytes()) .expect("bytes should be utf8 sequence"); - Some(LocatedNode::new_cst(ts_node, node_text)) + Some(LocatedNode::new_cst(ts_node, node_text, text)) } VertexKind::Phi => Some(LocatedNode::new_phi(vid.internal_id())), VertexKind::Invalid => None, diff --git a/crates/static-analysis-kernel/src/analysis/ddsa_lib/runtime.rs b/crates/static-analysis-kernel/src/analysis/ddsa_lib/runtime.rs index 1c6a75a01..cd335650b 100644 --- a/crates/static-analysis-kernel/src/analysis/ddsa_lib/runtime.rs +++ b/crates/static-analysis-kernel/src/analysis/ddsa_lib/runtime.rs @@ -15,6 +15,7 @@ use crate::analysis::ddsa_lib::resource_watchdog::V8ResourceWatchdog; use crate::model::common::Language; use crate::model::rule::RuleInternal; use crate::model::violation; +use common::utils::position_utils::LineColumnIndex; use deno_core::v8; use deno_core::v8::HandleScope; use std::cell::RefCell; @@ -263,10 +264,11 @@ impl JsRuntime { // Add any rule arguments ctx_bridge.set_rule_arguments(scope, rule_arguments); - // Push the query matches: + // Push the query matches, using UTF-16 column conversion for every node. + let idx = LineColumnIndex::new(source_text.as_ref()); let mut ts_node_bridge = self.bridge_ts_node.borrow_mut(); self.bridge_query_match - .set_data(scope, query_matches, &mut ts_node_bridge); + .set_data(scope, query_matches, &mut ts_node_bridge, &idx); } // We use a bridge to pull violations, so we can ignore the return value with a noop handler. @@ -1446,6 +1448,44 @@ function visit(query, filename, code) { assert_eq!(*violation, expected); } + /// `stella_compat.js::getCode` correctly extracts an identifier that follows a multibyte + /// character on the same line. + /// + /// The source `"πŸš€protectedName = 1;"` has the emoji (4 UTF-8 bytes / 2 UTF-16 code units) + /// before the identifier. With UTF-16 columns, `col - 1` correctly indexes into the JS + /// string via `substring`, so `getCodeForNode(node, code)` returns `"protectedName"`. + #[test] + fn stella_compat_getcode_multibyte() { + let mut rt = cfg_test_v8().new_runtime(); + // πŸš€ is placed in a string literal so that `protectedName` is a distinct identifier. + // Bytes before `protectedName`: " (1) + πŸš€ (4) + " (1) + ; (1) + space (1) = 8 bytes. + // UTF-16 col: " (1) + πŸš€ surrogate pair (2) + " (1) + ; (1) + space (1) = 6 units β†’ col 7. + let text = "\"\u{1F680}\"; protectedName = 1;"; + let filename = "some_filename.js"; + let ts_query = r#"((identifier) @cap_name (#eq? @cap_name "protectedName"))"#; + let rule = r#" +function visit(query, filename, code) { + const node = query.captures["cap_name"]; + const nodeText = getCodeForNode(node, code); + const error = buildError( + node.start.line, + node.start.col, + node.end.line, + node.end.col, + nodeText + ); + addError(error); +} +"#; + let violations = + shorthand_execute_rule_internal(&mut rt, text, filename, ts_query, rule, None).unwrap(); + assert_eq!(violations.len(), 1); + let violation = violations.first().unwrap(); + assert_eq!(violation.message, "protectedName"); + // UTF-16 col 7: "(1) + πŸš€ surrogate pair(2) + "(1) + ;(1) + space(1) = 6 units β†’ col 7. + assert_eq!(violation.base_region.start_col, 7); + } + /// Passing a `VisitArgFilenameCompat` or `VisitArgCodeCompat` instance directly into console.log /// returns the contents of the underlying string. #[test] diff --git a/crates/static-analysis-kernel/src/analysis/ddsa_lib/test_utils.rs b/crates/static-analysis-kernel/src/analysis/ddsa_lib/test_utils.rs index 821065f78..ac9d71db1 100644 --- a/crates/static-analysis-kernel/src/analysis/ddsa_lib/test_utils.rs +++ b/crates/static-analysis-kernel/src/analysis/ddsa_lib/test_utils.rs @@ -395,6 +395,11 @@ impl TsTree { Arc::clone(&self.tree) } + /// Returns the full source text. + pub fn source(&self) -> &str { + &self.text + } + /// Returns the text for the provided node. pub fn text(&self, node: tree_sitter::Node) -> &str { node.utf8_text(self.text.as_bytes()).unwrap() diff --git a/crates/static-analysis-kernel/src/analysis/tree_sitter.rs b/crates/static-analysis-kernel/src/analysis/tree_sitter.rs index b9fd80ba2..4660a1dc1 100644 --- a/crates/static-analysis-kernel/src/analysis/tree_sitter.rs +++ b/crates/static-analysis-kernel/src/analysis/tree_sitter.rs @@ -1,6 +1,7 @@ use crate::model::analysis::{MatchNode, MatchNodeContext, TreeSitterNode}; use crate::model::common::Language; use common::model::position::Position; +use common::utils::position_utils::LineColumnIndex; use indexmap::IndexMap; use std::collections::HashMap; use std::sync::Arc; @@ -297,17 +298,20 @@ pub fn get_query_nodes( ) -> Vec { let mut match_nodes: Vec = vec![]; + let idx = LineColumnIndex::new(code); + for query_match in query.cursor().matches(tree.root_node(), code, None) { let mut captures: HashMap = HashMap::new(); let mut captures_list: HashMap> = HashMap::new(); for capture in query_match { let list = match capture.contents { TSCaptureContent::Single(node) => { - map_node(node).map(|n| vec![n]).unwrap_or_default() - } - TSCaptureContent::Multi(nodes) => { - nodes.into_iter().filter_map(map_node).collect::>() + map_node(node, &idx).map(|n| vec![n]).unwrap_or_default() } + TSCaptureContent::Multi(nodes) => nodes + .into_iter() + .filter_map(|n| map_node(n, &idx)) + .collect::>(), }; // All captures are inserted into `captures_list`. However, the prior implementation continually // called `insert` on the `captures` map, which ended up re-writing the value every time. @@ -336,10 +340,15 @@ pub fn get_query_nodes( // map a node from the tree-sitter representation into our own internal representation // this is the representation that is passed to the JavaScript layer and how we represent // or expose the node to the end-user. -pub fn map_node(node: tree_sitter::Node) -> Option { +// +// `idx` is a pre-built [`LineColumnIndex`] for the source that produced `node`; it is used to +// convert tree-sitter's 0-based UTF-8 byte columns into 1-based UTF-16 code-unit columns so +// that `Position.col` is consistent with LSP / VS Code / SARIF v2.1 semantics. +pub fn map_node(node: tree_sitter::Node, idx: &LineColumnIndex<'_>) -> Option { fn map_node_internal( cursor: &mut tree_sitter::TreeCursor, only_named_node: bool, + idx: &LineColumnIndex<'_>, ) -> Option { // we do not map space, parenthesis and other non-named nodes if there // when `only_named_node` is true (which is `true` for children only). @@ -352,7 +361,7 @@ pub fn map_node(node: tree_sitter::Node) -> Option { if cursor.goto_first_child() { loop { // For the child, we only want to capture named nodes to avoid polluting the AST. - let maybe_child = map_node_internal(cursor, true); + let maybe_child = map_node_internal(cursor, true, idx); if let Some(child) = maybe_child { children.push(child); } @@ -363,16 +372,19 @@ pub fn map_node(node: tree_sitter::Node) -> Option { cursor.goto_parent(); } + let start_point = cursor.node().range().start_point; + let end_point = cursor.node().range().end_point; + // finally, build the return value. let ts_node = TreeSitterNode { ast_type: cursor.node().kind().to_string(), start: Position { - line: u32::try_from(cursor.node().range().start_point.row + 1).unwrap(), - col: u32::try_from(cursor.node().range().start_point.column + 1).unwrap(), + line: u32::try_from(start_point.row + 1).unwrap(), + col: idx.byte_col_to_utf16_col(start_point.row, start_point.column), }, end: Position { - line: u32::try_from(cursor.node().range().end_point.row + 1).unwrap(), - col: u32::try_from(cursor.node().range().end_point.column + 1).unwrap(), + line: u32::try_from(end_point.row + 1).unwrap(), + col: idx.byte_col_to_utf16_col(end_point.row, end_point.column), }, field_name: cursor.field_name().map(ToString::to_string), children, @@ -385,7 +397,7 @@ pub fn map_node(node: tree_sitter::Node) -> Option { // Initially, we capture both un/named nodes to allow capturing unnamed node from // the tree-sitter query. - map_node_internal(&mut ts_cursor, false) + map_node_internal(&mut ts_cursor, false, idx) } #[cfg(test)] @@ -413,7 +425,8 @@ def func(): pass;"#; let t = get_tree(source_code, &Language::Python); assert!(t.is_some()); - let tree_node = map_node(t.unwrap().root_node()); + let idx = LineColumnIndex::new(source_code); + let tree_node = map_node(t.unwrap().root_node(), &idx); assert!(tree_node.is_some()); let root = tree_node.unwrap(); assert_eq!(2, root.children.len()); @@ -745,6 +758,115 @@ SELECT * FROM table WHERE column = 'value'; assert!(query_node.captures.contains_key("classname")); } + /// `map_node` emits 1-based UTF-16 columns. An emoji (U+1F680, 4 UTF-8 bytes, 2 UTF-16 + /// code units) before a named identifier must shift subsequent col values accordingly. + /// + /// Source: `x = "πŸš€"; num = 5` + /// UTF-8 byte layout (0-indexed, line 0): + /// x(0) ' '(1) =(2) ' '(3) "(4) πŸš€(5..8) "(9) ;(10) ' '(11) n(12) u(13) m(14) ' '(15) + /// UTF-16 layout: + /// x(1) ' '(2) =(3) ' '(4) "(5) πŸš€(6,7) "(8) ;(9) ' '(10) n(11) ... + /// β†’ `num` starts at UTF-16 col 11 (0-based byte 11, but πŸš€ takes 2 UTF-16 units β†’ 11-2+1 = 10?) + /// + /// Let's compute carefully: + /// bytes 0-10: `x = "πŸš€";` (11 bytes: x, ' ', =, ' ', ", πŸš€Γ—4, ", ;) + /// byte 11 (space), byte 12..14 = 'n','u','m' + /// UTF-16 prefix to byte 12: x(1),' '(1),=(1),' '(1),"(1),πŸš€(2),"(1),;(1),' '(1) = 10 units + /// β†’ num starts at UTF-16 col 11 (10 units + 1-based) + #[test] + fn test_map_node_multibyte_emoji() { + let source_code = "x = \"\u{1F680}\"; num = 5"; + let tree = get_tree(source_code, &Language::Python).unwrap(); + let idx = LineColumnIndex::new(source_code); + let root = map_node(tree.root_node(), &idx).unwrap(); + // Find the `num` identifier node in the AST. + fn find_node<'a>( + node: &'a crate::model::analysis::TreeSitterNode, + ast_type: &str, + text: &str, + ) -> Option<&'a crate::model::analysis::TreeSitterNode> { + // Check children recursively. + for child in &node.children { + if child.ast_type == ast_type { + return Some(child); + } + if let Some(found) = find_node(child, ast_type, text) { + return Some(found); + } + } + None + } + // We parse "x = "πŸš€"; num = 5" as Python. + // The second assignment has `num` as the left side (an identifier). + // Byte 12 = 'n', UTF-16 col = 11. + let second_stmt = root.children.get(1).unwrap(); // second expression_statement + let assignment = second_stmt.children.get(0).unwrap(); + // In Python AST, the left side of `num = 5` is an identifier. + let num_node = assignment + .children + .iter() + .find(|c| c.ast_type == "identifier") + .unwrap(); + // Byte offset of 'n' in `x = "πŸš€"; num = 5`: + // "x = \"" = 5 bytes, "πŸš€" = 4 bytes, "\"; " = 3 bytes = total 12 bytes before 'n' + // UTF-16 col: x(1),' '(1),=(1),' '(1),"(1),πŸš€_high(1),πŸš€_low(1),"(1),;(1),' '(1) = 10 β†’ col 11 + assert_eq!(num_node.start.col, 11, "UTF-16 col of `num` should be 11"); + assert_eq!(num_node.start.line, 1); + } + + /// `get_query_nodes` emits 1-based UTF-16 columns for CJK characters + /// (3 UTF-8 bytes, 1 UTF-16 code unit each). + /// + /// Source (Python): `ζ—₯ = 1; end = 2` + /// ζ—₯ = 3 UTF-8 bytes, 1 UTF-16 unit. + /// `end` starts at byte offset 7 (`ζ—₯`(3) + ' '(1) + '='(1) + ' '(1) + '1'(1) = 7), + /// but UTF-16 units before it: ζ—₯(1),' '(1),=(1),' '(1),1(1),;(1),' '(1) = 7 β†’ col 8. + /// Since ζ—₯ is BMP (1 UTF-16 unit = 1 byte col unit at the codepoint level, but 3 UTF-8 + /// bytes), we have: 3 bytes - 1 UTF-16 = 2 byte savings on that char, so byte 7 β†’ UTF-16 col + /// = 7 - 2 + 1 = 6. Let's recompute step by step: + /// ζ—₯(3 bytes β†’ 1 UTF-16), ' '(1β†’1), =(1β†’1), ' '(1β†’1), 1(1β†’1), ;(1β†’1), ' '(1β†’1) = 7 UTF-16 units β†’ col 8 + #[test] + fn test_get_query_nodes_cjk() { + // Python: assign ζ—₯ = 1, then end = 2 + let code = "\u{65E5} = 1; end = 2"; // ζ—₯ = 1; end = 2 + let q = "(identifier) @id"; + let tree = get_tree(code, &Language::Python).unwrap(); + let query = get_query(q, &Language::Python).expect("query defined"); + let nodes = get_query_nodes( + &tree, + &query, + "f.py", + code, + &std::collections::HashMap::new(), + ); + + // ζ—₯ is at byte 0, UTF-16 col 1 (ASCII fast path applies to byte 0). + // But ζ—₯ itself is an identifier. Its start byte col is 0 β†’ UTF-16 col 1. + let dai_col = nodes + .iter() + .flat_map(|mn| mn.captures.values()) + .find(|n| n.start.col == 1) + .map(|n| n.start.col) + .unwrap_or(0); + assert_eq!(dai_col, 1, "ζ—₯ starts at UTF-16 col 1"); + + // `end` starts at byte 11 in the source: + // ζ—₯(3) + ' '(1) + '='(1) + ' '(1) + '1'(1) + ';'(1) + ' '(1) = 9 bytes β†’ byte 9 + // Wait, let me recount: ζ—₯=3, ' '=1 β†’ 4, '='=1 β†’ 5, ' '=1 β†’ 6, '1'=1 β†’ 7, ';'=1 β†’ 8, ' '=1 β†’ 9 + // So 'e' in 'end' is at byte 9. + // UTF-16 prefix to byte 9: ζ—₯(1),' '(1),=(1),' '(1),1(1),;(1),' '(1) = 7 units β†’ col 8. + let end_col = nodes + .iter() + .flat_map(|mn| mn.captures.values()) + .find(|n| { + // The `end` identifier node is at col > 1. + n.start.col > 1 + }) + .map(|n| n.start.col) + .unwrap_or(0); + assert_eq!(end_col, 8, "UTF-16 col of `end` after CJK char should be 8"); + } + #[test] fn ts_query_cursor_matches_timeout() { let timeout = Duration::from_millis(500); diff --git a/crates/static-analysis-server/src/tree_sitter_tree.rs b/crates/static-analysis-server/src/tree_sitter_tree.rs index 1295c9475..905d6c8bc 100644 --- a/crates/static-analysis-server/src/tree_sitter_tree.rs +++ b/crates/static-analysis-server/src/tree_sitter_tree.rs @@ -2,6 +2,7 @@ use crate::constants::{ERROR_CODE_NOT_BASE64, ERROR_CODE_NO_ROOT_NODE}; use crate::model::tree_sitter_tree_node::ServerTreeSitterNode; use crate::model::tree_sitter_tree_request::TreeSitterRequest; use crate::model::tree_sitter_tree_response::TreeSitterResponse; +use common::utils::position_utils::LineColumnIndex; use kernel::analysis::tree_sitter::{get_tree, map_node}; use kernel::utils::decode_base64_string; @@ -38,9 +39,10 @@ pub fn process_tree_sitter_tree_request(request: TreeSitterRequest) -> TreeSitte return no_root_node; } - if let Some(result) = - result.map(|tree| map_node(tree.root_node()).map(ServerTreeSitterNode::from)) - { + if let Some(result) = result.map(|tree| { + let idx = LineColumnIndex::new(&decoded); + map_node(tree.root_node(), &idx).map(ServerTreeSitterNode::from) + }) { tracing::info!("Successfully completed tree-sitter AST generation"); TreeSitterResponse { result, @@ -71,6 +73,49 @@ mod tests { assert_eq!("module", response.result.unwrap().ast_type); } + /// `process_tree_sitter_tree_request` emits 1-based UTF-16 columns for non-ASCII content. + /// + /// Source: `x = "πŸš€"; num = 5` + /// The emoji (πŸš€, U+1F680) is 4 UTF-8 bytes / 2 UTF-16 code units. `num` starts at byte 12, + /// so its UTF-16 col = 10 (units before it) + 1 = 11. + #[test] + fn test_process_tree_sitter_tree_request_multibyte() { + // Source: `x = "πŸš€"; num = 5` + // printf 'x = "\xF0\x9F\x9A\x80"; num = 5' | base64 + let request = TreeSitterRequest { + code_base64: "eCA9ICLwn5qAIjsgbnVtID0gNQ==".to_string(), + file_encoding: "utf-8".to_string(), + language: Language::Python, + }; + let response = process_tree_sitter_tree_request(request); + assert!(response.errors.is_empty()); + let root = response.result.unwrap(); + + /// Depth-first search for a node whose `start.col` matches `expected_col`. + fn find_col( + node: &crate::model::tree_sitter_tree_node::ServerTreeSitterNode, + expected_col: u32, + ) -> bool { + if node.start.col == expected_col { + return true; + } + node.children.iter().any(|c| find_col(c, expected_col)) + } + + // `num` starts at byte 12 in `x = "πŸš€"; num = 5`. + // UTF-16 units before it: x(1) ' '(1) =(1) ' '(1) "(1) πŸš€(2) "(1) ;(1) ' '(1) = 10 β†’ col 11. + assert!( + find_col(&root, 11), + "expected a node with UTF-16 start.col == 11 (position of `num` after πŸš€)" + ); + + // Ensure that no node reports col 13 (what the raw byte col + 1 would give for `num`). + assert!( + !find_col(&root, 13), + "no node should report raw byte col 13 β€” that would indicate the UTF-16 fix is not applied" + ); + } + #[test] fn test_process_tree_sitter_invalid_base64() { let request = TreeSitterRequest { From 05755f2d1a5c73c411dd52adb5a436070c44aee9 Mon Sep 17 00:00:00 2001 From: Roberto Huertas Date: Sun, 17 May 2026 11:52:24 +0200 Subject: [PATCH 02/12] refactor(tree_sitter): remove unused `find_node` function - Deleted the `find_node` function from `tree_sitter.rs` as it was not utilized in the current implementation. This cleanup helps streamline the code and improve maintainability. --- .../src/analysis/tree_sitter.rs | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/crates/static-analysis-kernel/src/analysis/tree_sitter.rs b/crates/static-analysis-kernel/src/analysis/tree_sitter.rs index 4660a1dc1..b0be02e1a 100644 --- a/crates/static-analysis-kernel/src/analysis/tree_sitter.rs +++ b/crates/static-analysis-kernel/src/analysis/tree_sitter.rs @@ -779,23 +779,6 @@ SELECT * FROM table WHERE column = 'value'; let tree = get_tree(source_code, &Language::Python).unwrap(); let idx = LineColumnIndex::new(source_code); let root = map_node(tree.root_node(), &idx).unwrap(); - // Find the `num` identifier node in the AST. - fn find_node<'a>( - node: &'a crate::model::analysis::TreeSitterNode, - ast_type: &str, - text: &str, - ) -> Option<&'a crate::model::analysis::TreeSitterNode> { - // Check children recursively. - for child in &node.children { - if child.ast_type == ast_type { - return Some(child); - } - if let Some(found) = find_node(child, ast_type, text) { - return Some(found); - } - } - None - } // We parse "x = "πŸš€"; num = 5" as Python. // The second assignment has `num` as the left side (an identifier). // Byte 12 = 'n', UTF-16 col = 11. From 76571835f5efd68d91f31f334c1238836bb75364 Mon Sep 17 00:00:00 2001 From: Roberto Huertas Date: Sun, 17 May 2026 12:24:58 +0200 Subject: [PATCH 03/12] perf(ddsa): cache LineColumnIndex line-starts in RootContext MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Added `LineColumnIndex::compute_line_starts` (extracted from `new`) and `LineColumnIndex::from_parts` to allow constructing an index from a pre-computed `Vec` without re-scanning the source. - Added a `line_starts: Vec` field to `RootContext`, populated in `set_text` so the scan runs once per file rather than once per op call. - Added `RootContext::line_column_index()` which builds a `LineColumnIndex` from the cached offsets in O(1) (a Vec clone, not a source scan). - Updated `op_ts_node_named_children` and `op_ts_node_parent` to use the cached index, eliminating O(source_len Γ— visited_nodes) behavior on deep tree walks or large files. Co-authored-by: Cursor --- crates/common/src/utils/position_utils.rs | 37 ++++++++++++++----- .../src/analysis/ddsa_lib/context/root.rs | 25 +++++++++++++ .../src/analysis/ddsa_lib/ops.rs | 15 ++++++-- 3 files changed, 64 insertions(+), 13 deletions(-) diff --git a/crates/common/src/utils/position_utils.rs b/crates/common/src/utils/position_utils.rs index 6e85dfff0..853431f57 100644 --- a/crates/common/src/utils/position_utils.rs +++ b/crates/common/src/utils/position_utils.rs @@ -17,21 +17,40 @@ pub struct LineColumnIndex<'a> { impl<'a> LineColumnIndex<'a> { /// Builds the index by scanning `source` for newline characters. pub fn new(source: &'a str) -> Self { + let line_starts = Self::compute_line_starts(source); + Self { + source, + line_starts, + } + } + + /// Creates a [`LineColumnIndex`] from a **pre-computed** `line_starts` vector. + /// + /// Use this when `line_starts` has already been built by [`compute_line_starts`] and cached + /// (e.g. in `RootContext`) to avoid re-scanning the source on every call. + /// `line_starts` must have been computed from the same `source` string. + pub fn from_parts(source: &'a str, line_starts: Vec) -> Self { + Self { + source, + line_starts, + } + } + + /// Scans `source` and returns the byte offset of the start of each line. + /// + /// Intentionally splits on `\n` only β€” this mirrors tree-sitter's line model exactly. + /// Tree-sitter does not treat bare `\r` (old Mac OS 9) as a line terminator; for + /// Windows `\r\n` files the `\r` is part of the column on the same line, matching + /// tree-sitter's `Point.column` values. Using a broader splitter (e.g. `str::lines`) + /// would diverge from tree-sitter and produce wrong UTF-16 columns. + pub fn compute_line_starts(source: &str) -> Vec { let mut line_starts = vec![0usize]; for (i, b) in source.bytes().enumerate() { - // Intentionally split on `\n` only β€” this mirrors tree-sitter's line model exactly. - // Tree-sitter does not treat bare `\r` (old Mac OS 9) as a line terminator; for - // Windows `\r\n` files the `\r` is part of the column on the same line, matching - // tree-sitter's `Point.column` values. Using a broader splitter (e.g. `str::lines`) - // would diverge from tree-sitter and produce wrong UTF-16 columns. if b == b'\n' { line_starts.push(i + 1); } } - Self { - source, - line_starts, - } + line_starts } /// Converts a tree-sitter 0-based `(row, byte_col)` point to a 1-based UTF-16 code-unit diff --git a/crates/static-analysis-kernel/src/analysis/ddsa_lib/context/root.rs b/crates/static-analysis-kernel/src/analysis/ddsa_lib/context/root.rs index 9e50d29f5..874570cb0 100644 --- a/crates/static-analysis-kernel/src/analysis/ddsa_lib/context/root.rs +++ b/crates/static-analysis-kernel/src/analysis/ddsa_lib/context/root.rs @@ -3,6 +3,7 @@ // Copyright 2024 Datadog, Inc. use crate::analysis::ddsa_lib::RawTSNode; +use common::utils::position_utils::LineColumnIndex; use std::cell::RefCell; use std::collections::HashMap; use std::sync::Arc; @@ -17,6 +18,11 @@ pub struct RootContext { tree: Option>, /// The source string that was parsed by tree-sitter to generate `tree`. tree_text: Option>, + /// Pre-computed byte offset of the start of each line in `tree_text`. + /// + /// Built once in [`set_text`] and reused by [`line_column_index`] to avoid re-scanning the + /// source on every deno op call (e.g. `namedChildren`, `parent`). + line_starts: Vec, /// A filename associated with a rule execution. filename: Option>, @@ -45,10 +51,29 @@ impl RootContext { /// Assigns the provided text string to the context. If an existing text string was assigned, it /// will be returned as `Some(text)`. + /// + /// Also rebuilds the internal [`line_starts`](Self::line_column_index) cache so that subsequent + /// calls to [`line_column_index`](Self::line_column_index) are O(1). pub fn set_text(&mut self, text: Arc) -> Option> { + self.line_starts = LineColumnIndex::compute_line_starts(&text); Option::replace(&mut self.tree_text, text) } + /// Returns a [`LineColumnIndex`] built from the cached line-start offsets. + /// + /// Construction is O(1) β€” the line-start offsets are pre-computed in [`set_text`] and stored + /// as a `Vec`. This avoids the O(source_len) scan that `LineColumnIndex::new` would + /// perform on every deno op call. + /// + /// Returns `None` when no source text has been assigned yet. + pub fn line_column_index(&self) -> Option> { + let source = self.tree_text.as_deref()?; + Some(LineColumnIndex::from_parts( + source, + self.line_starts.clone(), + )) + } + /// Returns a reference to the underlying [`tree_sitter::Tree`], if it exists. pub fn get_tree(&self) -> Option<&Arc> { self.tree.as_ref() diff --git a/crates/static-analysis-kernel/src/analysis/ddsa_lib/ops.rs b/crates/static-analysis-kernel/src/analysis/ddsa_lib/ops.rs index be4de85f3..94aa503e8 100644 --- a/crates/static-analysis-kernel/src/analysis/ddsa_lib/ops.rs +++ b/crates/static-analysis-kernel/src/analysis/ddsa_lib/ops.rs @@ -109,8 +109,12 @@ pub fn op_ts_node_named_children<'s>( // ts_node_bridge and ctx_bridge are separate RefCells so the concurrent borrows are valid. let ctx_bridge = state.borrow::>>(); let ctx_borrow = ctx_bridge.borrow(); - let source = ctx_borrow.ddsa_root_context().get_text().unwrap_or(""); - let idx = LineColumnIndex::new(source); + // Use the pre-computed line_starts cached in RootContext (O(1)) rather than + // rescanning the source on every namedChildren call. + let idx = ctx_borrow + .ddsa_root_context() + .line_column_index() + .unwrap_or_else(|| LineColumnIndex::new("")); let mut bridge_ref = ts_node_bridge.borrow_mut(); let mut cursor = ts_node.walk(); @@ -165,8 +169,11 @@ pub fn op_ts_node_parent( OpSafeRawTSNode::from_root_context(root_ctx, |ctx| ctx.get_ts_node_parent(ts_node))?; let parent_ts_node = safe_raw_parent.to_node(); - let source = root_ctx.get_text().unwrap_or(""); - let idx = LineColumnIndex::new(source); + // Use the pre-computed line_starts cached in RootContext (O(1)) rather than + // rescanning the source on every parent call. + let idx = root_ctx + .line_column_index() + .unwrap_or_else(|| LineColumnIndex::new("")); let mut bridge_ref = ts_node_bridge.borrow_mut(); let nid = bridge_ref.insert(scope, parent_ts_node, &idx); Some(nid) From a719e85877a376f6a3d78ef7dfcaa5269f9c699f Mon Sep 17 00:00:00 2001 From: Roberto Huertas Date: Sun, 17 May 2026 12:27:30 +0200 Subject: [PATCH 04/12] perf(ddsa): refactor LocatedNode::new_cst to accept &LineColumnIndex MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The old signature built a new LineColumnIndex (O(source_len) scan) on every node, making op_digraph_adjacency_list_to_dot O(source_len Γ— CST_vertices). Change new_cst to accept &LineColumnIndex<'_> so callers own the index lifetime and can build it once before any loop: - ops.rs (op_digraph_adjacency_list_to_dot): build once before the closure, re-using the RootContext-cached index when available. - graph_test_utils.rs (cst_dot_digraph): build once before the iterator. - graph.rs (located_node_multibyte_col_is_utf16 test): pass &idx directly. Co-authored-by: Cursor --- .../src/analysis/ddsa_lib/js/flow/graph.rs | 20 +++++++++++-------- .../ddsa_lib/js/flow/graph_test_utils.rs | 5 ++++- .../src/analysis/ddsa_lib/ops.rs | 10 +++++++--- 3 files changed, 23 insertions(+), 12 deletions(-) diff --git a/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/flow/graph.rs b/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/flow/graph.rs index 82fce1ee3..72c925800 100644 --- a/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/flow/graph.rs +++ b/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/flow/graph.rs @@ -623,11 +623,14 @@ impl From> for dot_structures::Edge { impl<'a> LocatedNode<'a> { /// Constructs a new `LocatedNode` from a tree-sitter node. /// - /// `source` is the full source text that was parsed to produce `node`; it is used via - /// [`LineColumnIndex`] to emit a 1-based UTF-16 code-unit column consistent with - /// [`TreeSitterNode::from_ts_node_with_index`]. - pub fn new_cst(node: tree_sitter::Node, text: &'a str, source: &str) -> LocatedNode<'a> { - let idx = LineColumnIndex::new(source); + /// `idx` must be a [`LineColumnIndex`] built from the same source text that was parsed to + /// produce `node`. Pass a single index built at the call site β€” do **not** construct one + /// per-node, as that would scan the entire source on every call. + pub fn new_cst( + node: tree_sitter::Node, + text: &'a str, + idx: &LineColumnIndex<'_>, + ) -> LocatedNode<'a> { let start = node.start_position(); Self::Cst { text, @@ -1229,7 +1232,8 @@ strict digraph TestPhi { let raw_byte_col_plus_one = x_node.start_position().column + 1; // LocatedNode::new_cst must produce a UTF-16 col β‰  raw byte col + 1. - let located = LocatedNode::new_cst(x_node, tree.text(x_node), source); + let idx = LineColumnIndex::new(source); + let located = LocatedNode::new_cst(x_node, tree.text(x_node), &idx); let located_col = match located { LocatedNode::Cst { col, .. } => col, _ => panic!("expected Cst variant"), @@ -1249,7 +1253,8 @@ strict digraph TestPhi { let ascii_tree = TsTree::new(ascii_source, Language::JavaScript); let ascii_x = ascii_tree.find_named_nodes(Some("x"), Some("identifier"))[0]; let ascii_raw = ascii_x.start_position().column + 1; - let ascii_located = LocatedNode::new_cst(ascii_x, ascii_tree.text(ascii_x), ascii_source); + let ascii_idx = LineColumnIndex::new(ascii_source); + let ascii_located = LocatedNode::new_cst(ascii_x, ascii_tree.text(ascii_x), &ascii_idx); let ascii_col = match ascii_located { LocatedNode::Cst { col, .. } => col, _ => panic!("expected Cst variant"), @@ -1258,6 +1263,5 @@ strict digraph TestPhi { ascii_col, ascii_raw, "ASCII: UTF-16 col should equal raw byte col + 1" ); - let _ = LineColumnIndex::new(ascii_source); // ensure import is used } } diff --git a/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/flow/graph_test_utils.rs b/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/flow/graph_test_utils.rs index 7eb613f8f..7d5c4da12 100644 --- a/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/flow/graph_test_utils.rs +++ b/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/flow/graph_test_utils.rs @@ -12,6 +12,7 @@ use crate::analysis::ddsa_lib::js::flow::graph::{ VertexId, VertexKind, COL, CST_KIND, KIND, LINE, TEXT, V_KIND, }; use crate::analysis::ddsa_lib::test_utils::TsTree; +use common::utils::position_utils::LineColumnIndex; use deno_core::v8; use graphviz_rust::dot_structures; @@ -79,9 +80,11 @@ pub fn cst_dot_digraph( let stmts = digraph_stmts(&graph); let tree = ts_tree.tree(); + // Build the index once before the per-node iterator to avoid O(source_len Γ— nodes). + let lc_idx = LineColumnIndex::new(ts_tree.source()); let candidates = TsTree::preorder_nodes(root_node.unwrap_or(tree.root_node())) .iter() - .map(|&node| LocatedNode::new_cst(node, ts_tree.text(node), ts_tree.source())) + .map(|&node| LocatedNode::new_cst(node, ts_tree.text(node), &lc_idx)) .collect::>(); // The `String` in the tuple is the original ID of the vertex (as specified in the DOT). diff --git a/crates/static-analysis-kernel/src/analysis/ddsa_lib/ops.rs b/crates/static-analysis-kernel/src/analysis/ddsa_lib/ops.rs index 94aa503e8..8986e6280 100644 --- a/crates/static-analysis-kernel/src/analysis/ddsa_lib/ops.rs +++ b/crates/static-analysis-kernel/src/analysis/ddsa_lib/ops.rs @@ -264,10 +264,14 @@ pub fn op_digraph_adjacency_list_to_dot( let tsn_bridge = tsn_bridge.borrow(); let ctx_bridge = state.borrow::>>(); let ctx_bridge = ctx_bridge.borrow(); - let text = ctx_bridge - .ddsa_root_context() + let root_ctx = ctx_bridge.ddsa_root_context(); + let text = root_ctx .get_text() .expect("tree text should always be `Some` during rule execution"); + // Build the index once before the per-vertex loop; use the cached line_starts when available. + let lc_idx = root_ctx + .line_column_index() + .unwrap_or_else(|| LineColumnIndex::new(text)); // Transformation: // If `VertexKind::CST`: constructs a dot node from metadata from the `TsNodeBridge` and `RootContext`. @@ -285,7 +289,7 @@ pub fn op_digraph_adjacency_list_to_dot( let node_text = ts_node .utf8_text(text.as_bytes()) .expect("bytes should be utf8 sequence"); - Some(LocatedNode::new_cst(ts_node, node_text, text)) + Some(LocatedNode::new_cst(ts_node, node_text, &lc_idx)) } VertexKind::Phi => Some(LocatedNode::new_phi(vid.internal_id())), VertexKind::Invalid => None, From ef4527b880192b61fffe05638bedcf8c050c82b3 Mon Sep 17 00:00:00 2001 From: Roberto Huertas Date: Sun, 17 May 2026 12:28:27 +0200 Subject: [PATCH 05/12] docs(test): rewrite test_map_node_multibyte_emoji comment to one clean calculation The old comment had a tangent line ending in "10?" that contradicted the actual expected value of 11. Replace with a single, step-by-step UTF-16 prefix sum that maps directly to the assert_eq! below it. Co-authored-by: Cursor --- .../src/analysis/tree_sitter.rs | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/crates/static-analysis-kernel/src/analysis/tree_sitter.rs b/crates/static-analysis-kernel/src/analysis/tree_sitter.rs index b0be02e1a..3d85cf4bb 100644 --- a/crates/static-analysis-kernel/src/analysis/tree_sitter.rs +++ b/crates/static-analysis-kernel/src/analysis/tree_sitter.rs @@ -763,16 +763,11 @@ SELECT * FROM table WHERE column = 'value'; /// /// Source: `x = "πŸš€"; num = 5` /// UTF-8 byte layout (0-indexed, line 0): - /// x(0) ' '(1) =(2) ' '(3) "(4) πŸš€(5..8) "(9) ;(10) ' '(11) n(12) u(13) m(14) ' '(15) - /// UTF-16 layout: - /// x(1) ' '(2) =(3) ' '(4) "(5) πŸš€(6,7) "(8) ;(9) ' '(10) n(11) ... - /// β†’ `num` starts at UTF-16 col 11 (0-based byte 11, but πŸš€ takes 2 UTF-16 units β†’ 11-2+1 = 10?) + /// x(0) ' '(1) =(2) ' '(3) "(4) πŸš€(5-8) "(9) ;(10) ' '(11) n(12) u(13) m(14) ... /// - /// Let's compute carefully: - /// bytes 0-10: `x = "πŸš€";` (11 bytes: x, ' ', =, ' ', ", πŸš€Γ—4, ", ;) - /// byte 11 (space), byte 12..14 = 'n','u','m' - /// UTF-16 prefix to byte 12: x(1),' '(1),=(1),' '(1),"(1),πŸš€(2),"(1),;(1),' '(1) = 10 units - /// β†’ num starts at UTF-16 col 11 (10 units + 1-based) + /// `num` starts at byte 12. UTF-16 prefix for bytes 0..12: + /// x(1) + ' '(1) + =(1) + ' '(1) + "(1) + πŸš€(2) + "(1) + ;(1) + ' '(1) = 10 UTF-16 units + /// β†’ `num` is at 1-based UTF-16 col 11. #[test] fn test_map_node_multibyte_emoji() { let source_code = "x = \"\u{1F680}\"; num = 5"; From 0243860ef8a2d1af53900c81bb39bce767a62f9a Mon Sep 17 00:00:00 2001 From: Roberto Huertas Date: Sun, 17 May 2026 12:29:05 +0200 Subject: [PATCH 06/12] docs(test): fix contradictory byte-offset derivation in test_get_query_nodes_cjk The comment had a first derivation stating end starts at byte 7, then a second "let's recompute" block arriving at byte 9. Remove the wrong first derivation and keep only the corrected step-by-step calculation. Co-authored-by: Cursor --- .../static-analysis-kernel/src/analysis/tree_sitter.rs | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/crates/static-analysis-kernel/src/analysis/tree_sitter.rs b/crates/static-analysis-kernel/src/analysis/tree_sitter.rs index 3d85cf4bb..c2774a0c5 100644 --- a/crates/static-analysis-kernel/src/analysis/tree_sitter.rs +++ b/crates/static-analysis-kernel/src/analysis/tree_sitter.rs @@ -797,12 +797,10 @@ SELECT * FROM table WHERE column = 'value'; /// /// Source (Python): `ζ—₯ = 1; end = 2` /// ζ—₯ = 3 UTF-8 bytes, 1 UTF-16 unit. - /// `end` starts at byte offset 7 (`ζ—₯`(3) + ' '(1) + '='(1) + ' '(1) + '1'(1) = 7), - /// but UTF-16 units before it: ζ—₯(1),' '(1),=(1),' '(1),1(1),;(1),' '(1) = 7 β†’ col 8. - /// Since ζ—₯ is BMP (1 UTF-16 unit = 1 byte col unit at the codepoint level, but 3 UTF-8 - /// bytes), we have: 3 bytes - 1 UTF-16 = 2 byte savings on that char, so byte 7 β†’ UTF-16 col - /// = 7 - 2 + 1 = 6. Let's recompute step by step: - /// ζ—₯(3 bytes β†’ 1 UTF-16), ' '(1β†’1), =(1β†’1), ' '(1β†’1), 1(1β†’1), ;(1β†’1), ' '(1β†’1) = 7 UTF-16 units β†’ col 8 + /// + /// `end` starts at byte 9: ζ—₯(3) + ' '(1) + =(1) + ' '(1) + 1(1) + ;(1) + ' '(1) = 9 bytes. + /// UTF-16 prefix for those 9 bytes: + /// ζ—₯(1) + ' '(1) + =(1) + ' '(1) + 1(1) + ;(1) + ' '(1) = 7 UTF-16 units β†’ 1-based col 8. #[test] fn test_get_query_nodes_cjk() { // Python: assign ζ—₯ = 1, then end = 2 From 60e4f955142b354a2106af83628a3427e24a7fb3 Mon Sep 17 00:00:00 2001 From: Roberto Huertas Date: Sun, 17 May 2026 12:29:42 +0200 Subject: [PATCH 07/12] docs(test): fix misleading header for stella_compat_getcode_multibyte MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous header said the source was 'πŸš€protectedName = 1;' (emoji directly preceding the identifier), but the actual source places the emoji in a string literal: '"\u{1F680}"; protectedName = 1;'. Rewrite the header to match the real source layout and update the byte/UTF-16 derivation. Co-authored-by: Cursor --- .../src/analysis/ddsa_lib/runtime.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/crates/static-analysis-kernel/src/analysis/ddsa_lib/runtime.rs b/crates/static-analysis-kernel/src/analysis/ddsa_lib/runtime.rs index cd335650b..b4322fb7f 100644 --- a/crates/static-analysis-kernel/src/analysis/ddsa_lib/runtime.rs +++ b/crates/static-analysis-kernel/src/analysis/ddsa_lib/runtime.rs @@ -1451,9 +1451,12 @@ function visit(query, filename, code) { /// `stella_compat.js::getCode` correctly extracts an identifier that follows a multibyte /// character on the same line. /// - /// The source `"πŸš€protectedName = 1;"` has the emoji (4 UTF-8 bytes / 2 UTF-16 code units) - /// before the identifier. With UTF-16 columns, `col - 1` correctly indexes into the JS - /// string via `substring`, so `getCodeForNode(node, code)` returns `"protectedName"`. + /// The source is `"\u{1F680}"; protectedName = 1;` β€” the emoji sits inside a string literal + /// so that `protectedName` is a distinct, standalone identifier token. + /// Bytes before `protectedName`: `"` (1) + πŸš€ (4) + `"` (1) + `;` (1) + ` ` (1) = 8 bytes. + /// UTF-16 prefix: `"` (1) + πŸš€ surrogate pair (2) + `"` (1) + `;` (1) + ` ` (1) = 6 units β†’ col 7. + /// With UTF-16 columns, `col - 1` correctly indexes into the JS string via `substring`, + /// so `getCodeForNode(node, code)` returns `"protectedName"`. #[test] fn stella_compat_getcode_multibyte() { let mut rt = cfg_test_v8().new_runtime(); From 04e6d1fe3c918e152e2fe8346e6ddcfff6777291 Mon Sep 17 00:00:00 2001 From: Roberto Huertas Date: Sun, 17 May 2026 12:30:17 +0200 Subject: [PATCH 08/12] docs: clarify Position.col producer contracts in docstring The previous docstring led with UTF-16 as the universal meaning, then footnoted the secrets-scanner exception. Rewrite so both contracts are equal-weight bullet points: kernel (UTF-16 code units) and secrets scanner (Unicode grapheme clusters). Consumers reading just the field-level doc now see the full picture without having to read further. Co-authored-by: Cursor --- crates/common/src/model/position.rs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/crates/common/src/model/position.rs b/crates/common/src/model/position.rs index 0d85c26db..503b29c12 100644 --- a/crates/common/src/model/position.rs +++ b/crates/common/src/model/position.rs @@ -9,15 +9,15 @@ use std::fmt; /// A source-code position. /// /// * `line` β€” 1-based line number. -/// * `col` β€” 1-based **UTF-16 code-unit** column number for every position produced by the -/// static-analysis kernel (tree-sitter path). On ASCII-only lines, one UTF-16 code unit equals -/// one byte, so the value is identical to what a byte-column would be. For non-ASCII characters -/// (CJK ideographs, emoji surrogate pairs, combining marks, etc.) the value reflects UTF-16 -/// semantics, which matches LSP / VS Code and the SARIF v2.1 default encoding. -/// -/// The `get_position_in_string` helper (used by the secrets scanner) emits grapheme-based -/// columns under a different contract; that path does **not** flow through `Position.col` in the -/// kernel output. +/// * `col` β€” 1-based column number. The unit depends on the producer: +/// - **Static-analysis kernel** (tree-sitter path): UTF-16 code-unit column. Matches LSP, +/// VS Code, and the SARIF v2.1 default encoding. On ASCII-only lines, one UTF-16 code unit +/// equals one byte, so the value is identical to a byte-column. For non-ASCII characters +/// (CJK ideographs, emoji surrogate pairs, combining marks, etc.) the count reflects UTF-16 +/// surrogate-pair expansion. +/// - **Secrets scanner** (`get_position_in_string`): Unicode-grapheme-cluster column. A single +/// user-perceived character (including emoji and composed sequences) counts as one unit +/// regardless of its UTF-8 or UTF-16 size. #[derive(Deserialize, Debug, Serialize, Clone, Copy, Builder, PartialEq, Eq, Hash)] pub struct Position { pub line: u32, From d39db47e912b71bba7b0bc218ea2bc67745101ef Mon Sep 17 00:00:00 2001 From: Roberto Huertas Date: Sun, 17 May 2026 12:43:49 +0200 Subject: [PATCH 09/12] fix(ops): replace silent unwrap_or_else fallback with expect The unwrap_or_else(|| LineColumnIndex::new("")) fallback would silently produce col=1 for every node if tree text was never set, making bugs invisible. Replace with expect() in both op_ts_node_named_children and op_ts_node_parent, consistent with the already-present panic in op_digraph_adjacency_list_to_dot for the same invariant. Co-authored-by: Cursor --- crates/static-analysis-kernel/src/analysis/ddsa_lib/ops.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/static-analysis-kernel/src/analysis/ddsa_lib/ops.rs b/crates/static-analysis-kernel/src/analysis/ddsa_lib/ops.rs index 8986e6280..bec8db64f 100644 --- a/crates/static-analysis-kernel/src/analysis/ddsa_lib/ops.rs +++ b/crates/static-analysis-kernel/src/analysis/ddsa_lib/ops.rs @@ -114,7 +114,7 @@ pub fn op_ts_node_named_children<'s>( let idx = ctx_borrow .ddsa_root_context() .line_column_index() - .unwrap_or_else(|| LineColumnIndex::new("")); + .expect("tree text must be set before ops run"); let mut bridge_ref = ts_node_bridge.borrow_mut(); let mut cursor = ts_node.walk(); @@ -173,7 +173,7 @@ pub fn op_ts_node_parent( // rescanning the source on every parent call. let idx = root_ctx .line_column_index() - .unwrap_or_else(|| LineColumnIndex::new("")); + .expect("tree text must be set before ops run"); let mut bridge_ref = ts_node_bridge.borrow_mut(); let nid = bridge_ref.insert(scope, parent_ts_node, &idx); Some(nid) From 329df0ccdfaf560c71bd32546fb76ec4155c92d0 Mon Sep 17 00:00:00 2001 From: Roberto Huertas Date: Sun, 17 May 2026 17:16:58 +0200 Subject: [PATCH 10/12] =?UTF-8?q?ci(regressions):=20temporarily=20exclude?= =?UTF-8?q?=20columns=20from=20SARIF=20diff=20for=20byte=E2=86=92UTF-16=20?= =?UTF-8?q?transition?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR changes the kernel from emitting 1-based UTF-8 byte columns to 1-based UTF-16 code-unit columns (matching LSP / VS Code / SARIF v2.1). While the PR is in review, the regression workflow compares two SARIF runs as JSON strings β€” one built from `main` (byte cols) and one from this branch (UTF-16 cols). Every violation that lives on a line with a non-ASCII character drifts by N columns (N = number of multibyte chars before the position) and shows up as a "removed + added" pair. To unblock the CI we drop `startColumn` / `endColumn` from both the comparison key (in `parseFile`) and the summary location display, and compare by (file, ruleId, message, startLine, endLine) only. This is a ONE-SHOT loosening for the encoding transition. A stacked follow-up PR will restore the column fields once this PR lands on `main` (at which point both runs are on UTF-16 columns again and column-level regression detection becomes meaningful). Verified locally on numpy's `_core/tests/test_strings.py`: - before fix: 3 "removed" + 3 "added" false positives (lines with Ξ» ΞΌ Β·) - after fix: 0 + 0 (clean) Co-authored-by: Cursor --- .github/scripts/check-regressions.js | 35 +++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/.github/scripts/check-regressions.js b/.github/scripts/check-regressions.js index 15bd5303a..0126f5fdd 100644 --- a/.github/scripts/check-regressions.js +++ b/.github/scripts/check-regressions.js @@ -22,7 +22,33 @@ const parseFile = (filepath) => { const current = {}; current.message = i.message; current.ruleId = i.ruleId; - current.physicalLocation = i.locations[0].physicalLocation; + // ------------------------------------------------------------------ + // IMPORTANT (TEMPORARY): strip `startColumn` / `endColumn` from the region. + // + // PR #914 changes the kernel from emitting 1-based UTF-8 byte columns to + // 1-based UTF-16 code-unit columns (matching LSP / VS Code / SARIF v2.1). + // + // While that PR is in review, `main` still emits byte columns and the + // feature branch emits UTF-16 columns. The regression check compares + // results as JSON strings, so it would flag every violation that lives + // on a line containing a non-ASCII character as a "removed + added" + // pair β€” same file, same rule, same line; only the column number drifts + // by N (where N is the number of multibyte chars before the position). + // + // To unblock CI for #914 we compare by + // (file, ruleId, message, startLine, endLine) + // and intentionally ignore columns. This is a one-shot loosening for + // the byteβ†’UTF-16 transition. A stacked follow-up PR will restore + // `startColumn` / `endColumn` to the comparison key once #914 lands on + // `main` (at which point both runs are on UTF-16 columns again and + // column-level regression detection becomes meaningful again). + // ------------------------------------------------------------------ + const { startColumn: _startCol, endColumn: _endCol, ...regionWithoutColumns } = + i.locations[0].physicalLocation.region; + current.physicalLocation = { + ...i.locations[0].physicalLocation, + region: regionWithoutColumns, + }; results.push(current); } @@ -55,13 +81,16 @@ const main = async () => { let table1 = []; if (count1 > 0 || count2 > 0) { + // Location display only shows `startLine-endLine`. Columns are + // intentionally omitted to match the comparison key (see `parseFile` + // above for the full rationale on the byteβ†’UTF-16 transition). for (const item of diff1) { const json = JSON.parse(item); table1.push([ { data: json.physicalLocation.artifactLocation.uri }, { data: json.message.text }, { data: json.ruleId }, - { data: `${json.physicalLocation.region.startLine}:${json.physicalLocation.region.startColumn}-${json.physicalLocation.region.endLine}:${json.physicalLocation.region.endColumn}` }, + { data: `${json.physicalLocation.region.startLine}-${json.physicalLocation.region.endLine}` }, { data: dupes1[json.ruleId] }, ]); } @@ -74,7 +103,7 @@ const main = async () => { { data: json.physicalLocation.artifactLocation.uri }, { data: json.message.text }, { data: json.ruleId }, - { data: `${json.physicalLocation.region.startLine}:${json.physicalLocation.region.startColumn}-${json.physicalLocation.region.endLine}:${json.physicalLocation.region.endColumn}` }, + { data: `${json.physicalLocation.region.startLine}-${json.physicalLocation.region.endLine}` }, { data: dupes2[json.ruleId] }, ]); } From 8947d3bbda2decb494140adb628ce37085abcf24 Mon Sep 17 00:00:00 2001 From: Roberto Huertas Date: Tue, 19 May 2026 16:28:24 +0200 Subject: [PATCH 11/12] refactor(common): use line-index crate for UTF-16 column conversion Replace hand-rolled LineColumnIndex implementation with a thin newtype wrapper around line_index::LineIndex. Public API and all call sites are unchanged. Simplifies RootContext cache to own the index directly, eliminating per-call Vec clones. Addresses reviewer feedback on #914. --- Cargo.lock | 17 ++++ LICENSE-3rdparty.csv | 1 + crates/common/Cargo.toml | 3 +- crates/common/src/utils/position_utils.rs | 96 ++++++------------- .../analysis/ddsa_lib/bridge/query_match.rs | 2 +- .../src/analysis/ddsa_lib/bridge/ts_node.rs | 6 +- .../src/analysis/ddsa_lib/context/root.rs | 29 +++--- .../src/analysis/ddsa_lib/js/flow/graph.rs | 2 +- .../src/analysis/ddsa_lib/js/ts_node.rs | 2 +- .../src/analysis/ddsa_lib/ops.rs | 16 ++-- .../src/analysis/tree_sitter.rs | 4 +- 11 files changed, 75 insertions(+), 103 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2761a58b5..2d0280439 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -696,6 +696,7 @@ dependencies = [ "anyhow", "bstr", "derive_builder", + "line-index", "serde", ] @@ -2540,6 +2541,16 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "line-index" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e27e0ed5a392a7f5ba0b3808a2afccff16c64933312c84b57618b49d1209bd2" +dependencies = [ + "nohash-hasher", + "text-size", +] + [[package]] name = "linux-raw-sys" version = "0.4.15" @@ -2766,6 +2777,12 @@ dependencies = [ "smallvec", ] +[[package]] +name = "nohash-hasher" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bf50223579dc7cdcfb3bfcacf7069ff68243f8c363f62ffa99cf000a6b9c451" + [[package]] name = "nom" version = "7.1.3" diff --git a/LICENSE-3rdparty.csv b/LICENSE-3rdparty.csv index 20a3e1316..543bdb646 100644 --- a/LICENSE-3rdparty.csv +++ b/LICENSE-3rdparty.csv @@ -12,6 +12,7 @@ graphviz-rust,https://github.com/besok/graphviz-rust,MIT,Copyright (c) 2013 Bori indexmap,https://github.com/indexmap-rs/indexmap,MIT and Apache-2.0,Copyright (c) 2016-2017 bluss indicatif,https://crates.io/crates/indicatif,MIT,Copyright (c) 2017 Armin Ronacher itertools,https://github.com/rust-itertools/itertools,MIT,Copyright 2015 itertools Developers +line-index,https://github.com/rust-lang/rust-analyzer,MIT OR Apache-2.0,Copyright (c) 2019 rust-analyzer developers num_cpus,https://github.com/seanmonstar/num_cpus,MIT, Copyright (c) 2015 Sean McArthur percent-encoding,https://github.com/servo/rust-url/,MIT, Copyright (c) 2013-2022 The rust-url developers prettytable-rs,https://github.com/phsym/prettytable-rs/,BSD-3-Clause,Copyright (c) 2022 Pierre-Henri Symoneaux diff --git a/crates/common/Cargo.toml b/crates/common/Cargo.toml index 52512fccb..a5a4997f3 100644 --- a/crates/common/Cargo.toml +++ b/crates/common/Cargo.toml @@ -9,4 +9,5 @@ serde = { version = "1.0.219", features = ["derive"] } derive_builder = { workspace = true } # other -bstr = "1.12.0" \ No newline at end of file +bstr = "1.12.0" +line-index = "0.1.2" \ No newline at end of file diff --git a/crates/common/src/utils/position_utils.rs b/crates/common/src/utils/position_utils.rs index 853431f57..53575c228 100644 --- a/crates/common/src/utils/position_utils.rs +++ b/crates/common/src/utils/position_utils.rs @@ -1,87 +1,45 @@ use crate::model::position::Position; use bstr::BStr; use bstr::ByteSlice; +use line_index::{LineCol, LineIndex, WideEncoding}; /// Precomputed per-line index for fast repeated UTF-8 byte-column β†’ UTF-16 code-unit column /// conversion. /// /// Build once per source string with [`LineColumnIndex::new`], then call -/// [`byte_col_to_utf16_col`] for every tree-sitter node on that source. -pub struct LineColumnIndex<'a> { - source: &'a str, - /// Byte offset of the start of each line (0-indexed line number β†’ byte offset of first byte - /// on that line). - line_starts: Vec, -} - -impl<'a> LineColumnIndex<'a> { - /// Builds the index by scanning `source` for newline characters. - pub fn new(source: &'a str) -> Self { - let line_starts = Self::compute_line_starts(source); - Self { - source, - line_starts, - } - } - - /// Creates a [`LineColumnIndex`] from a **pre-computed** `line_starts` vector. - /// - /// Use this when `line_starts` has already been built by [`compute_line_starts`] and cached - /// (e.g. in `RootContext`) to avoid re-scanning the source on every call. - /// `line_starts` must have been computed from the same `source` string. - pub fn from_parts(source: &'a str, line_starts: Vec) -> Self { - Self { - source, - line_starts, - } - } +/// [`byte_col_to_utf16_col`](LineColumnIndex::byte_col_to_utf16_col) for every tree-sitter node +/// on that source. Backed by [`line_index::LineIndex`] from the rust-analyzer project. +/// +/// ## Line model +/// +/// [`line_index::LineIndex`] splits on `\n` only, which mirrors tree-sitter's line model exactly. +/// Tree-sitter does not treat a bare `\r` (classic Mac OS 9) as a line terminator; for Windows +/// `\r\n` files the `\r` is counted as part of the column on the same line, matching +/// tree-sitter's `Point.column` values. Using a broader splitter (e.g. Unicode line endings) +/// would diverge from tree-sitter and produce wrong UTF-16 columns. +#[derive(Debug)] +pub struct LineColumnIndex(LineIndex); - /// Scans `source` and returns the byte offset of the start of each line. - /// - /// Intentionally splits on `\n` only β€” this mirrors tree-sitter's line model exactly. - /// Tree-sitter does not treat bare `\r` (old Mac OS 9) as a line terminator; for - /// Windows `\r\n` files the `\r` is part of the column on the same line, matching - /// tree-sitter's `Point.column` values. Using a broader splitter (e.g. `str::lines`) - /// would diverge from tree-sitter and produce wrong UTF-16 columns. - pub fn compute_line_starts(source: &str) -> Vec { - let mut line_starts = vec![0usize]; - for (i, b) in source.bytes().enumerate() { - if b == b'\n' { - line_starts.push(i + 1); - } - } - line_starts +impl LineColumnIndex { + /// Builds the index by scanning `source`. + pub fn new(source: &str) -> Self { + Self(LineIndex::new(source)) } /// Converts a tree-sitter 0-based `(row, byte_col)` point to a 1-based UTF-16 code-unit /// column. /// - /// **ASCII fast path**: when the line prefix up to `byte_col` is pure ASCII (all bytes - /// < 128), each byte is one UTF-16 code unit, so the result is simply `byte_col + 1`. - /// This path costs one slice-level `is_ascii()` check and is taken for the vast majority of - /// real-world source lines. + /// Falls back to `byte_col + 1` (correct for ASCII-only content) when the coordinates fall + /// outside the indexed source, preserving pre-existing behaviour for out-of-range inputs. pub fn byte_col_to_utf16_col(&self, row: usize, byte_col: usize) -> u32 { - let line_start = self - .line_starts - .get(row) - .copied() - .unwrap_or(self.source.len()); - // Slice from the line start to end-of-source (unbounded); the subsequent `prefix` slice - // clamps this to exactly `byte_col` bytes, so the trailing lines are never examined. - let source_from_line_start = &self.source.as_bytes()[line_start..]; - // `byte_col` is the number of bytes from the line start to the position. - let prefix_len = byte_col.min(source_from_line_start.len()); - let prefix = &source_from_line_start[..prefix_len]; - - // ASCII fast path: one byte == one UTF-16 code unit. - if prefix.is_ascii() { - return (byte_col + 1) as u32; - } - - // Slow path: count UTF-16 code units emitted by each Unicode scalar. - let prefix_str = std::str::from_utf8(prefix).unwrap_or(""); - let utf16_col: usize = prefix_str.chars().map(|c| c.len_utf16()).sum(); - (utf16_col + 1) as u32 + let lc = LineCol { + line: row as u32, + col: byte_col as u32, + }; + self.0 + .to_wide(WideEncoding::Utf16, lc) + .map(|w| w.col + 1) + .unwrap_or((byte_col as u32) + 1) } } diff --git a/crates/static-analysis-kernel/src/analysis/ddsa_lib/bridge/query_match.rs b/crates/static-analysis-kernel/src/analysis/ddsa_lib/bridge/query_match.rs index 24054f3e7..990dd5b58 100644 --- a/crates/static-analysis-kernel/src/analysis/ddsa_lib/bridge/query_match.rs +++ b/crates/static-analysis-kernel/src/analysis/ddsa_lib/bridge/query_match.rs @@ -45,7 +45,7 @@ impl QueryMatchBridge { scope: &mut HandleScope, matches: impl Into>>>, node_bridge: &mut TsNodeBridge, - idx: &LineColumnIndex<'_>, + idx: &LineColumnIndex, ) { let matches = matches.into(); // Pass each node in via the bridge (assigning it an id), and use this id to transform diff --git a/crates/static-analysis-kernel/src/analysis/ddsa_lib/bridge/ts_node.rs b/crates/static-analysis-kernel/src/analysis/ddsa_lib/bridge/ts_node.rs index 2f17fa07b..1e76229d7 100644 --- a/crates/static-analysis-kernel/src/analysis/ddsa_lib/bridge/ts_node.rs +++ b/crates/static-analysis-kernel/src/analysis/ddsa_lib/bridge/ts_node.rs @@ -48,7 +48,7 @@ impl TsNodeBridge { &mut self, scope: &mut HandleScope, node: tree_sitter::Node, - idx: &LineColumnIndex<'_>, + idx: &LineColumnIndex, ) -> NodeId { let raw_ts_node = RawTSNode::new(node); if let Some((_, _, node_id)) = self.mirrored_im.get_full(&raw_ts_node) { @@ -92,7 +92,7 @@ impl TsNodeBridge { scope: &mut HandleScope<'s>, node: tree_sitter::Node, id: NodeId, - idx: &LineColumnIndex<'_>, + idx: &LineColumnIndex, ) -> v8::Local<'s, v8::Object> { let ts_node = TreeSitterNode::::from_ts_node_with_index(id, node, idx); self.js_class.new_instance(scope, ts_node) @@ -128,7 +128,7 @@ impl TsNodeBridge { &mut self, scope: &mut HandleScope, capture: TSQueryCapture, - idx: &LineColumnIndex<'_>, + idx: &LineColumnIndex, ) -> TSQueryCapture { TSQueryCapture:: { name: capture.name, diff --git a/crates/static-analysis-kernel/src/analysis/ddsa_lib/context/root.rs b/crates/static-analysis-kernel/src/analysis/ddsa_lib/context/root.rs index 874570cb0..c8f76e92a 100644 --- a/crates/static-analysis-kernel/src/analysis/ddsa_lib/context/root.rs +++ b/crates/static-analysis-kernel/src/analysis/ddsa_lib/context/root.rs @@ -18,11 +18,11 @@ pub struct RootContext { tree: Option>, /// The source string that was parsed by tree-sitter to generate `tree`. tree_text: Option>, - /// Pre-computed byte offset of the start of each line in `tree_text`. + /// Pre-computed line/column index for `tree_text`. /// - /// Built once in [`set_text`] and reused by [`line_column_index`] to avoid re-scanning the - /// source on every deno op call (e.g. `namedChildren`, `parent`). - line_starts: Vec, + /// Built once in [`set_text`] and borrowed by [`line_column_index`] on every deno op call + /// (e.g. `namedChildren`, `parent`), avoiding repeated source scans. + lci: Option, /// A filename associated with a rule execution. filename: Option>, @@ -52,26 +52,21 @@ impl RootContext { /// Assigns the provided text string to the context. If an existing text string was assigned, it /// will be returned as `Some(text)`. /// - /// Also rebuilds the internal [`line_starts`](Self::line_column_index) cache so that subsequent - /// calls to [`line_column_index`](Self::line_column_index) are O(1). + /// Also rebuilds the internal [line/column index](Self::line_column_index) cache so that + /// subsequent calls to [`line_column_index`](Self::line_column_index) are O(1). pub fn set_text(&mut self, text: Arc) -> Option> { - self.line_starts = LineColumnIndex::compute_line_starts(&text); + self.lci = Some(LineColumnIndex::new(&text)); Option::replace(&mut self.tree_text, text) } - /// Returns a [`LineColumnIndex`] built from the cached line-start offsets. + /// Returns a reference to the cached [`LineColumnIndex`]. /// - /// Construction is O(1) β€” the line-start offsets are pre-computed in [`set_text`] and stored - /// as a `Vec`. This avoids the O(source_len) scan that `LineColumnIndex::new` would - /// perform on every deno op call. + /// The index is built once in [`set_text`] and reused on every deno op call, avoiding + /// repeated source scans. /// /// Returns `None` when no source text has been assigned yet. - pub fn line_column_index(&self) -> Option> { - let source = self.tree_text.as_deref()?; - Some(LineColumnIndex::from_parts( - source, - self.line_starts.clone(), - )) + pub fn line_column_index(&self) -> Option<&LineColumnIndex> { + self.lci.as_ref() } /// Returns a reference to the underlying [`tree_sitter::Tree`], if it exists. diff --git a/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/flow/graph.rs b/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/flow/graph.rs index 72c925800..ca2e2a570 100644 --- a/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/flow/graph.rs +++ b/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/flow/graph.rs @@ -629,7 +629,7 @@ impl<'a> LocatedNode<'a> { pub fn new_cst( node: tree_sitter::Node, text: &'a str, - idx: &LineColumnIndex<'_>, + idx: &LineColumnIndex, ) -> LocatedNode<'a> { let start = node.start_position(); Self::Cst { diff --git a/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/ts_node.rs b/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/ts_node.rs index b27858df8..996d0acaf 100644 --- a/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/ts_node.rs +++ b/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/ts_node.rs @@ -35,7 +35,7 @@ impl TreeSitterNode { pub fn from_ts_node_with_index( id: NodeId, node: tree_sitter::Node, - idx: &LineColumnIndex<'_>, + idx: &LineColumnIndex, ) -> Self { let tree_sitter::Point { row: start_row, diff --git a/crates/static-analysis-kernel/src/analysis/ddsa_lib/ops.rs b/crates/static-analysis-kernel/src/analysis/ddsa_lib/ops.rs index bec8db64f..8d3f9cff4 100644 --- a/crates/static-analysis-kernel/src/analysis/ddsa_lib/ops.rs +++ b/crates/static-analysis-kernel/src/analysis/ddsa_lib/ops.rs @@ -5,7 +5,6 @@ use crate::analysis::ddsa_lib; use crate::analysis::ddsa_lib::common::{v8_uint, NodeId}; use crate::analysis::ddsa_lib::{bridge, runtime, RawTSNode}; -use common::utils::position_utils::LineColumnIndex; use deno_core::{op2, v8, OpState}; use std::cell::RefCell; use std::rc::Rc; @@ -109,7 +108,7 @@ pub fn op_ts_node_named_children<'s>( // ts_node_bridge and ctx_bridge are separate RefCells so the concurrent borrows are valid. let ctx_bridge = state.borrow::>>(); let ctx_borrow = ctx_bridge.borrow(); - // Use the pre-computed line_starts cached in RootContext (O(1)) rather than + // Use the pre-computed index cached in RootContext (O(1)) rather than // rescanning the source on every namedChildren call. let idx = ctx_borrow .ddsa_root_context() @@ -130,7 +129,7 @@ pub fn op_ts_node_named_children<'s>( } let child_node = cursor.node(); - let nid = bridge_ref.insert(scope, child_node, &idx); + let nid = bridge_ref.insert(scope, child_node, idx); let nid = v8_uint(scope, nid); let nid_index = i * 2; array.set_index(scope, nid_index as u32, nid.into()); @@ -169,13 +168,13 @@ pub fn op_ts_node_parent( OpSafeRawTSNode::from_root_context(root_ctx, |ctx| ctx.get_ts_node_parent(ts_node))?; let parent_ts_node = safe_raw_parent.to_node(); - // Use the pre-computed line_starts cached in RootContext (O(1)) rather than + // Use the pre-computed index cached in RootContext (O(1)) rather than // rescanning the source on every parent call. let idx = root_ctx .line_column_index() .expect("tree text must be set before ops run"); let mut bridge_ref = ts_node_bridge.borrow_mut(); - let nid = bridge_ref.insert(scope, parent_ts_node, &idx); + let nid = bridge_ref.insert(scope, parent_ts_node, idx); Some(nid) } @@ -268,10 +267,11 @@ pub fn op_digraph_adjacency_list_to_dot( let text = root_ctx .get_text() .expect("tree text should always be `Some` during rule execution"); - // Build the index once before the per-vertex loop; use the cached line_starts when available. + // Use the pre-computed index cached in RootContext (O(1)) rather than rebuilding per-vertex. + // `set_text` always builds the index alongside `tree_text`, so this is always `Some` here. let lc_idx = root_ctx .line_column_index() - .unwrap_or_else(|| LineColumnIndex::new(text)); + .expect("tree text must be set before ops run"); // Transformation: // If `VertexKind::CST`: constructs a dot node from metadata from the `TsNodeBridge` and `RootContext`. @@ -289,7 +289,7 @@ pub fn op_digraph_adjacency_list_to_dot( let node_text = ts_node .utf8_text(text.as_bytes()) .expect("bytes should be utf8 sequence"); - Some(LocatedNode::new_cst(ts_node, node_text, &lc_idx)) + Some(LocatedNode::new_cst(ts_node, node_text, lc_idx)) } VertexKind::Phi => Some(LocatedNode::new_phi(vid.internal_id())), VertexKind::Invalid => None, diff --git a/crates/static-analysis-kernel/src/analysis/tree_sitter.rs b/crates/static-analysis-kernel/src/analysis/tree_sitter.rs index c2774a0c5..7b2cebc2f 100644 --- a/crates/static-analysis-kernel/src/analysis/tree_sitter.rs +++ b/crates/static-analysis-kernel/src/analysis/tree_sitter.rs @@ -344,11 +344,11 @@ pub fn get_query_nodes( // `idx` is a pre-built [`LineColumnIndex`] for the source that produced `node`; it is used to // convert tree-sitter's 0-based UTF-8 byte columns into 1-based UTF-16 code-unit columns so // that `Position.col` is consistent with LSP / VS Code / SARIF v2.1 semantics. -pub fn map_node(node: tree_sitter::Node, idx: &LineColumnIndex<'_>) -> Option { +pub fn map_node(node: tree_sitter::Node, idx: &LineColumnIndex) -> Option { fn map_node_internal( cursor: &mut tree_sitter::TreeCursor, only_named_node: bool, - idx: &LineColumnIndex<'_>, + idx: &LineColumnIndex, ) -> Option { // we do not map space, parenthesis and other non-named nodes if there // when `only_named_node` is true (which is `true` for children only). From 1f052b9f02077b7736f8d7ed51b96c4764212b26 Mon Sep 17 00:00:00 2001 From: Roberto Huertas Date: Wed, 20 May 2026 16:03:50 +0200 Subject: [PATCH 12/12] refactor: address review feedback on UTF-16 column conversion - Removed eight unit tests in position_utils.rs that duplicated coverage already provided by the line-index crate, collapsing them into a single wiring test that proves the wrapper delegates to LineIndex::to_wide(WideEncoding::Utf16, ..) and adds 1. - Dropped per-call-site multibyte tests covered by the new helper: test_export_csv_multibyte_col, sarif_region_carries_utf16_col, ts_node_line_col_multibyte, stella_compat_getcode_multibyte, test_map_node_multibyte_emoji, test_get_query_nodes_cjk, ts_node_bridge_multibyte_utf16_col, located_node_multibyte_col_is_utf16, and test_process_tree_sitter_tree_request_multibyte. - Trimmed LLM-flavored doc comments on insert_capture, RootContext (lci field, set_text, line_column_index), LocatedNode::new_cst, TreeSitterNode::from_ts_node_with_index, QueryMatchBridge::set_data, and map_node. - Removed noisy O(1)/pre-computed inline comments from three ops.rs call sites that explained self-evident caching behaviour. - Changed LineColumnIndex::byte_col_to_utf16_col to return Option instead of silently falling back to byte_col + 1, making the fallibility explicit at the type level. - Updated the five production call sites (tree_sitter.rs, js/ts_node.rs, js/flow/graph.rs) to use unwrap_or(byte_col as u32 + 1) so the ASCII fallback is visible at every caller, and updated test sites in js/flow/java.rs to use unwrap(). IDE-6037 --- crates/cli/src/csv.rs | 35 ----- crates/cli/src/sarif/sarif_utils.rs | 84 ------------ crates/common/src/utils/position_utils.rs | 128 ++---------------- .../analysis/ddsa_lib/bridge/query_match.rs | 4 - .../src/analysis/ddsa_lib/bridge/ts_node.rs | 34 ----- .../src/analysis/ddsa_lib/context/root.rs | 14 +- .../src/analysis/ddsa_lib/js/flow/graph.rs | 67 +-------- .../src/analysis/ddsa_lib/js/flow/java.rs | 4 +- .../src/analysis/ddsa_lib/js/ts_node.rs | 73 +--------- .../src/analysis/ddsa_lib/ops.rs | 6 - .../src/analysis/ddsa_lib/runtime.rs | 41 ------ .../src/analysis/tree_sitter.rs | 97 +------------ .../src/tree_sitter_tree.rs | 43 ------ 13 files changed, 31 insertions(+), 599 deletions(-) diff --git a/crates/cli/src/csv.rs b/crates/cli/src/csv.rs index 8b60c5a6a..b3da79d4f 100644 --- a/crates/cli/src/csv.rs +++ b/crates/cli/src/csv.rs @@ -97,39 +97,4 @@ mod tests { ); assert_eq!(res_with_result, "filename,rule,category,severity,message,start_line,start_col,end_line,end_col\nfilename,myrule,performance,error,message,10,12,12,10\n"); } - - /// The CSV output faithfully serializes whatever col value is stored in the violation. - /// When the kernel produces UTF-16 columns (e.g. col 3 for a node after "πŸš€"), the CSV row - /// contains 3, not the raw byte col (5). - #[test] - fn test_export_csv_multibyte_col() { - let res = generate_csv_results( - &vec![RuleResult { - rule_name: "myrule".to_string(), - filename: "file.py".to_string(), - violations: vec![Violation { - // Simulates a violation whose `col` was computed as UTF-16 (e.g. after πŸš€). - start: common::model::position::Position { line: 1, col: 3 }, - end: common::model::position::Position { line: 1, col: 16 }, - message: "multibyte".to_string(), - severity: RuleSeverity::Error, - category: RuleCategory::Security, - fixes: vec![], - taint_flow: None, - is_suppressed: false, - }], - errors: vec![], - execution_error: None, - output: None, - execution_time_ms: 1, - query_node_time_ms: 0, - parsing_time_ms: 0, - }], - &vec![], - ); - assert!( - res.contains(",1,3,1,16\n"), - "CSV should contain UTF-16 col 3; got: {res}" - ); - } } diff --git a/crates/cli/src/sarif/sarif_utils.rs b/crates/cli/src/sarif/sarif_utils.rs index 1bc844511..22cf71f19 100644 --- a/crates/cli/src/sarif/sarif_utils.rs +++ b/crates/cli/src/sarif/sarif_utils.rs @@ -2231,88 +2231,4 @@ mod tests { let sarif_json = serde_json::to_value(sarif_report).unwrap(); assert!(validate_data(&sarif_json)); } - - /// The SARIF `region.startColumn` / `region.endColumn` fields carry whatever value is in - /// `Position.col`. When the kernel emits UTF-16 columns (as it now does), the SARIF output - /// automatically conforms to the SARIF v2.1 default encoding (UTF-16). - /// - /// This test constructs a violation whose col was computed as UTF-16 (e.g. after an emoji on - /// the same line, col 3) and asserts the SARIF JSON contains that value. - #[test] - fn sarif_region_carries_utf16_col() { - let rule = RuleBuilder::default() - .name("utf16-col-rule".to_string()) - .description_base64(None) - .language(Language::Python) - .checksum("x".to_string()) - .pattern(None) - .tree_sitter_query_base64(None) - .category(RuleCategory::Security) - .code_base64(String::new()) - .short_description_base64(None) - .entity_checked(None) - .rule_type(RuleType::TreeSitterQuery) - .severity(RuleSeverity::Error) - .cwe(None) - .arguments(vec![]) - .tests(vec![]) - .is_testing(false) - .documentation_url(None) - .build() - .unwrap(); - let sarif_rule = SarifRule::StaticAnalysis(Box::new(rule)); - - // UTF-16 col 3 = position of an identifier after πŸš€ on the same line. - let violation = Violation { - start: Position::new(1, 3), - end: Position::new(1, 16), - message: "utf16 col test".to_string(), - severity: RuleSeverity::Error, - category: RuleCategory::Security, - fixes: vec![], - taint_flow: None, - is_suppressed: false, - }; - let rule_result = RuleResultBuilder::default() - .rule_name("utf16-col-rule".to_string()) - .filename("file.py".to_string()) - .violations(vec![violation]) - .errors(vec![]) - .execution_error(None) - .output(None) - .execution_time_ms(0u128) - .parsing_time_ms(0u128) - .query_node_time_ms(0u128) - .build() - .unwrap(); - let sarif_result = SarifRuleResult::try_from(rule_result).unwrap(); - - let directory = ".".to_string(); - let sarif = generate_sarif_report( - &[sarif_rule], - &[sarif_result], - &directory, - SarifReportMetadata { - add_git_info: false, - debug: false, - config_digest: String::new(), - diff_aware_parameters: None, - execution_time_secs: 0, - }, - &Default::default(), - ) - .expect("generate sarif"); - - let sarif_json = serde_json::to_value(&sarif).unwrap(); - let location = - &sarif_json["runs"][0]["results"][0]["locations"][0]["physicalLocation"]["region"]; - assert_eq!( - location["startColumn"], 3, - "SARIF startColumn should be UTF-16 col 3" - ); - assert_eq!( - location["endColumn"], 16, - "SARIF endColumn should be UTF-16 col 16" - ); - } } diff --git a/crates/common/src/utils/position_utils.rs b/crates/common/src/utils/position_utils.rs index 53575c228..9e3216536 100644 --- a/crates/common/src/utils/position_utils.rs +++ b/crates/common/src/utils/position_utils.rs @@ -29,17 +29,13 @@ impl LineColumnIndex { /// Converts a tree-sitter 0-based `(row, byte_col)` point to a 1-based UTF-16 code-unit /// column. /// - /// Falls back to `byte_col + 1` (correct for ASCII-only content) when the coordinates fall - /// outside the indexed source, preserving pre-existing behaviour for out-of-range inputs. - pub fn byte_col_to_utf16_col(&self, row: usize, byte_col: usize) -> u32 { + /// Returns `None` if `(row, byte_col)` falls outside the indexed source. + pub fn byte_col_to_utf16_col(&self, row: usize, byte_col: usize) -> Option { let lc = LineCol { line: row as u32, col: byte_col as u32, }; - self.0 - .to_wide(WideEncoding::Utf16, lc) - .map(|w| w.col + 1) - .unwrap_or((byte_col as u32) + 1) + self.0.to_wide(WideEncoding::Utf16, lc).map(|w| w.col + 1) } } @@ -179,115 +175,15 @@ mod tests { ); } - // ── LineColumnIndex tests ───────────────────────────────────────────────── - - #[test] - fn lci_ascii_fast_path() { - // Pure ASCII: UTF-16 col == byte col + 1 for every position. - let idx = LineColumnIndex::new("hello world\nfoo bar"); - assert_eq!(idx.byte_col_to_utf16_col(0, 0), 1); // 'h' β†’ col 1 - assert_eq!(idx.byte_col_to_utf16_col(0, 5), 6); // ' ' β†’ col 6 - assert_eq!(idx.byte_col_to_utf16_col(1, 3), 4); // ' ' on line 2 β†’ col 4 - } - - #[test] - fn lci_bmp_non_ascii() { - // 'Γ©' is U+00E9: 2 UTF-8 bytes, 1 UTF-16 code unit. - // Source: "cafΓ©\nend" β€” bytes: c(0) a(1) f(2) Γ©(3,4) \n(5) - let src = "caf\u{00E9}\nend"; - let idx = LineColumnIndex::new(src); - // byte_col 3 (start of Γ©) β†’ prefix "caf" = 3 UTF-16 units β†’ col 4 - assert_eq!(idx.byte_col_to_utf16_col(0, 3), 4); - // byte_col 5 (past Γ©, 3 + 2 bytes) β†’ prefix "cafΓ©" = 4 UTF-16 units β†’ col 5 - assert_eq!(idx.byte_col_to_utf16_col(0, 5), 5); - // line 1 is ASCII - assert_eq!(idx.byte_col_to_utf16_col(1, 3), 4); - } - - #[test] - fn lci_supplementary_plane_emoji() { - // 'πŸš€' is U+1F680: 4 UTF-8 bytes, 2 UTF-16 code units (surrogate pair). - // Source: "xπŸš€y" - // byte positions: x(0) πŸš€(1,2,3,4) y(5) - let src = "x\u{1F680}y"; - let idx = LineColumnIndex::new(src); - // byte_col 0 β†’ col 1 (before 'x') - assert_eq!(idx.byte_col_to_utf16_col(0, 0), 1); - // byte_col 1 β†’ prefix = "x" β†’ 1 UTF-16 unit β†’ col 2 - assert_eq!(idx.byte_col_to_utf16_col(0, 1), 2); - // byte_col 5 β†’ prefix = "xπŸš€" β†’ x(1) + πŸš€(2) = 3 UTF-16 units β†’ col 4 - assert_eq!(idx.byte_col_to_utf16_col(0, 5), 4); - // byte_col 6 β†’ prefix = "xπŸš€y" β†’ 3+1 = 4 UTF-16 units β†’ col 5 - assert_eq!(idx.byte_col_to_utf16_col(0, 6), 5); - } - - #[test] - fn lci_cjk() { - // 'ζ—₯' is U+65E5: 3 UTF-8 bytes, 1 UTF-16 code unit. - // Source: "ζ—₯本" - // bytes: ζ—₯(0,1,2) 本(3,4,5) - let src = "\u{65E5}\u{672C}"; // ζ—₯本 - let idx = LineColumnIndex::new(src); - // byte_col 0 β†’ col 1 - assert_eq!(idx.byte_col_to_utf16_col(0, 0), 1); - // byte_col 3 β†’ prefix = "ζ—₯" β†’ 1 UTF-16 unit β†’ col 2 - assert_eq!(idx.byte_col_to_utf16_col(0, 3), 2); - // byte_col 6 β†’ prefix = "ζ—₯本" β†’ 2 UTF-16 units β†’ col 3 - assert_eq!(idx.byte_col_to_utf16_col(0, 6), 3); - } - - #[test] - fn lci_combining_mark() { - // 'e' + U+0301 (combining acute accent): 2 codepoints, 2 UTF-16 code units, 1 grapheme. - // Source: "e\u{0301}x" - // bytes: e(0) \u{0301}(1,2) x(3) - let src = "e\u{0301}x"; - let idx = LineColumnIndex::new(src); - // byte_col 0 β†’ col 1 - assert_eq!(idx.byte_col_to_utf16_col(0, 0), 1); - // byte_col 1 β†’ prefix = "e" β†’ 1 UTF-16 unit β†’ col 2 - assert_eq!(idx.byte_col_to_utf16_col(0, 1), 2); - // byte_col 3 β†’ prefix = "e\u{0301}" β†’ 2 UTF-16 units β†’ col 3 - assert_eq!(idx.byte_col_to_utf16_col(0, 3), 3); - // byte_col 4 β†’ prefix = "e\u{0301}x" β†’ 3 UTF-16 units β†’ col 4 - assert_eq!(idx.byte_col_to_utf16_col(0, 4), 4); - } - - #[test] - fn lci_crlf_line_endings() { - // CRLF: tree-sitter counts \n as the line terminator and the col includes \r. - // Source: "ab\r\ncd" - // Line 0 bytes: a(0) b(1) \r(2) \n(3) β†’ line_start[1] = 4 - // Line 1 bytes: c(0 rel) d(1 rel) - let src = "ab\r\ncd"; - let idx = LineColumnIndex::new(src); - // Line 0 - assert_eq!(idx.byte_col_to_utf16_col(0, 0), 1); - assert_eq!(idx.byte_col_to_utf16_col(0, 2), 3); // past "ab" β†’ 2+1=3 - // Line 1 - assert_eq!(idx.byte_col_to_utf16_col(1, 0), 1); - assert_eq!(idx.byte_col_to_utf16_col(1, 2), 3); - } - - #[test] - fn lci_eol_boundary() { - // byte_col == 0 on the start of any line β†’ col 1. - let src = "abc\ndef\nghi"; - let idx = LineColumnIndex::new(src); - assert_eq!(idx.byte_col_to_utf16_col(0, 0), 1); - assert_eq!(idx.byte_col_to_utf16_col(1, 0), 1); - assert_eq!(idx.byte_col_to_utf16_col(2, 0), 1); - // byte_col at end of "abc" (3 bytes) β†’ col 4 - assert_eq!(idx.byte_col_to_utf16_col(0, 3), 4); - } - #[test] - fn lci_empty_line() { - // An empty line: byte_col 0 β†’ col 1. - let src = "\n\nfoo"; - let idx = LineColumnIndex::new(src); - assert_eq!(idx.byte_col_to_utf16_col(0, 0), 1); - assert_eq!(idx.byte_col_to_utf16_col(1, 0), 1); - assert_eq!(idx.byte_col_to_utf16_col(2, 0), 1); + fn byte_col_to_utf16_col_calls_to_wide_utf16() { + // Single source string with one non-ASCII char. We only need to prove our wrapper + // delegates to line_index::LineIndex::to_wide(WideEncoding::Utf16, ..) and adds 1. + // Exhaustive encoding cases live in the line-index crate's own test suite. + let idx = LineColumnIndex::new("a\u{65E5}b"); // a ζ—₯ b + // byte 0 ('a') β†’ 0 UTF-16 units before, +1 = 1 + assert_eq!(idx.byte_col_to_utf16_col(0, 0).unwrap(), 1); + // byte 4 ('b' after ζ—₯ which is 3 UTF-8 bytes / 1 UTF-16 unit) β†’ 2 UTF-16 units before, +1 = 3 + assert_eq!(idx.byte_col_to_utf16_col(0, 4).unwrap(), 3); } } diff --git a/crates/static-analysis-kernel/src/analysis/ddsa_lib/bridge/query_match.rs b/crates/static-analysis-kernel/src/analysis/ddsa_lib/bridge/query_match.rs index 990dd5b58..f8b95f18d 100644 --- a/crates/static-analysis-kernel/src/analysis/ddsa_lib/bridge/query_match.rs +++ b/crates/static-analysis-kernel/src/analysis/ddsa_lib/bridge/query_match.rs @@ -34,10 +34,6 @@ impl QueryMatchBridge { /// Sets the bridge's data to the list of [`QueryMatch`]es, inserting tree-sitter nodes /// into the provided `TsNodeBridge`. /// - /// `idx` must be built from the same source string that produced the tree-sitter nodes in - /// `matches`; it is forwarded to [`TsNodeBridge::insert_capture`] so that `col` values - /// stored in v8 are 1-based UTF-16 code units. - /// /// NOTE: if the bridge had existing `QueryMatch`es, the tree-sitter nodes associated with them /// will not be removed from the `TsNodeBridge`. pub fn set_data<'tree>( diff --git a/crates/static-analysis-kernel/src/analysis/ddsa_lib/bridge/ts_node.rs b/crates/static-analysis-kernel/src/analysis/ddsa_lib/bridge/ts_node.rs index 1e76229d7..24871b054 100644 --- a/crates/static-analysis-kernel/src/analysis/ddsa_lib/bridge/ts_node.rs +++ b/crates/static-analysis-kernel/src/analysis/ddsa_lib/bridge/ts_node.rs @@ -122,8 +122,6 @@ impl TsNodeBridge { /// Inserts the nodes from a [`TSQueryCapture`] into a v8 scope, consuming /// the `QueryCapture`. Returns a transformed `QueryCapture` containing the ids of the inserted /// nodes. - /// - /// `idx` is forwarded to [`insert`](Self::insert) for UTF-16 column conversion. pub fn insert_capture( &mut self, scope: &mut HandleScope, @@ -284,38 +282,6 @@ mod tests { assert!(bridge.v8_get(scope, 1).is_none()); } - /// Inserting a node from a multibyte source produces UTF-16 columns in the v8 object. - /// - /// Source: `"\u{1F680}"; abc = 1` β€” πŸš€ is 4 UTF-8 bytes / 2 UTF-16 code units inside a string. - /// `abc` starts at byte 8 (after `"πŸš€"; `), and its UTF-16 col must account for the emoji. - /// - /// Byte layout: `"` `πŸš€`(4 bytes) `"` `;` ` ` = 8 bytes β†’ `abc` at byte 8. - /// UTF-16 prefix: `"` + πŸš€(2 units) + `"` + `;` + ` ` = 6 units β†’ col 7. - #[test] - fn ts_node_bridge_multibyte_utf16_col() { - let (mut runtime, bridge) = setup_bridge(); - let scope = &mut runtime.handle_scope(); - let mut bridge = bridge.borrow_mut(); - - // Parse `"\u{1F680}"; abc = 1` as JavaScript. - let source = "\"\u{1F680}\"; abc = 1"; - let tree = TsTree::new(source, Language::JavaScript); - let idx = LineColumnIndex::new(source); - let abc = tree.find_named_nodes(Some("abc"), Some("identifier"))[0]; - // "πŸš€" (6 bytes: quote+4+quote) + "; " (2 bytes) = 8 bytes before `abc`. - assert_eq!(abc.start_position().column, 8, "byte col of `abc` is 8"); - - bridge.insert(scope, abc, &idx); - let v8_obj = bridge.v8_get(scope, 0).unwrap(); - - // Read `_startCol` from the v8 object. - let start_col = get_field::(v8_obj, "_startCol", scope, "integer") - .unwrap() - .value() as u32; - // UTF-16 units before abc: `"` (1) + πŸš€ (2 surrogate units) + `"` (1) + `;` (1) + ` ` (1) = 6 β†’ col 7. - assert_eq!(start_col, 7, "UTF-16 col of `abc` after πŸš€ should be 7"); - } - /// The text that the node spans can be retrieved. #[test] fn get_node_text() { diff --git a/crates/static-analysis-kernel/src/analysis/ddsa_lib/context/root.rs b/crates/static-analysis-kernel/src/analysis/ddsa_lib/context/root.rs index c8f76e92a..a62e61179 100644 --- a/crates/static-analysis-kernel/src/analysis/ddsa_lib/context/root.rs +++ b/crates/static-analysis-kernel/src/analysis/ddsa_lib/context/root.rs @@ -19,9 +19,6 @@ pub struct RootContext { /// The source string that was parsed by tree-sitter to generate `tree`. tree_text: Option>, /// Pre-computed line/column index for `tree_text`. - /// - /// Built once in [`set_text`] and borrowed by [`line_column_index`] on every deno op call - /// (e.g. `namedChildren`, `parent`), avoiding repeated source scans. lci: Option, /// A filename associated with a rule execution. filename: Option>, @@ -51,20 +48,13 @@ impl RootContext { /// Assigns the provided text string to the context. If an existing text string was assigned, it /// will be returned as `Some(text)`. - /// - /// Also rebuilds the internal [line/column index](Self::line_column_index) cache so that - /// subsequent calls to [`line_column_index`](Self::line_column_index) are O(1). pub fn set_text(&mut self, text: Arc) -> Option> { self.lci = Some(LineColumnIndex::new(&text)); Option::replace(&mut self.tree_text, text) } - /// Returns a reference to the cached [`LineColumnIndex`]. - /// - /// The index is built once in [`set_text`] and reused on every deno op call, avoiding - /// repeated source scans. - /// - /// Returns `None` when no source text has been assigned yet. + /// Returns a reference to the cached [`LineColumnIndex`], or `None` if no source text has been + /// assigned yet. pub fn line_column_index(&self) -> Option<&LineColumnIndex> { self.lci.as_ref() } diff --git a/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/flow/graph.rs b/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/flow/graph.rs index ca2e2a570..5a62f2224 100644 --- a/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/flow/graph.rs +++ b/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/flow/graph.rs @@ -622,10 +622,6 @@ impl From> for dot_structures::Edge { impl<'a> LocatedNode<'a> { /// Constructs a new `LocatedNode` from a tree-sitter node. - /// - /// `idx` must be a [`LineColumnIndex`] built from the same source text that was parsed to - /// produce `node`. Pass a single index built at the call site β€” do **not** construct one - /// per-node, as that would scan the entire source on every call. pub fn new_cst( node: tree_sitter::Node, text: &'a str, @@ -635,7 +631,9 @@ impl<'a> LocatedNode<'a> { Self::Cst { text, line: start.row + 1, - col: idx.byte_col_to_utf16_col(start.row, start.column) as usize, + col: idx + .byte_col_to_utf16_col(start.row, start.column) + .unwrap_or(start.column as u32 + 1) as usize, cst_kind: node.kind(), } } @@ -1205,63 +1203,4 @@ strict digraph TestPhi { let expected = graphviz_rust::parse(expected).unwrap(); assert_eq!(collection.to_digraph(), expected); } - - /// **Regression-prevention test**: `LocatedNode::new_cst` emits a 1-based UTF-16 column that - /// differs from the raw `tree_sitter::Point.column + 1` value for multibyte input. - /// - /// If a future change reintroduces a raw `tree_sitter::Point.column + 1` read in - /// `LocatedNode::new_cst`, this test will fail on the multibyte assertion. - #[test] - fn located_node_multibyte_col_is_utf16() { - use crate::analysis::ddsa_lib::js::flow::graph::LocatedNode; - use crate::analysis::ddsa_lib::test_utils::TsTree; - use crate::model::common::Language; - use common::utils::position_utils::LineColumnIndex; - - // Source: "πŸš€var x = 1;" - // πŸš€ = 4 UTF-8 bytes, 2 UTF-16 code units. - // `x` identifier starts at byte 8 (4 for πŸš€, 4 for "var "). - // UTF-16 prefix: πŸš€(2) + v(1) + a(1) + r(1) + ' '(1) = 6 units β†’ col 7. - // Raw byte col of `x` = 8 β†’ raw byte col + 1 = 9. - // (These differ because πŸš€ = 4 bytes but 2 UTF-16 units.) - let source = "\u{1F680}var x = 1;"; - let tree = TsTree::new(source, Language::JavaScript); - - // Find the `x` identifier node. - let x_node = tree.find_named_nodes(Some("x"), Some("identifier"))[0]; - let raw_byte_col_plus_one = x_node.start_position().column + 1; - - // LocatedNode::new_cst must produce a UTF-16 col β‰  raw byte col + 1. - let idx = LineColumnIndex::new(source); - let located = LocatedNode::new_cst(x_node, tree.text(x_node), &idx); - let located_col = match located { - LocatedNode::Cst { col, .. } => col, - _ => panic!("expected Cst variant"), - }; - - // UTF-16 col should be smaller than raw byte col on multibyte input. - assert_ne!( - located_col, raw_byte_col_plus_one, - "LocatedNode col should be UTF-16, not raw byte col + 1 (raw={raw_byte_col_plus_one})" - ); - // Sanity check: for this specific source, UTF-16 col must be 7. - // prefix "πŸš€var " = 2+1+1+1+1 = 6 UTF-16 units β†’ col 7 - assert_eq!(located_col, 7, "UTF-16 col of `x` after πŸš€ should be 7"); - - // Additionally, verify that for an ASCII source the values agree (regression guard). - let ascii_source = "var x = 1;"; - let ascii_tree = TsTree::new(ascii_source, Language::JavaScript); - let ascii_x = ascii_tree.find_named_nodes(Some("x"), Some("identifier"))[0]; - let ascii_raw = ascii_x.start_position().column + 1; - let ascii_idx = LineColumnIndex::new(ascii_source); - let ascii_located = LocatedNode::new_cst(ascii_x, ascii_tree.text(ascii_x), &ascii_idx); - let ascii_col = match ascii_located { - LocatedNode::Cst { col, .. } => col, - _ => panic!("expected Cst variant"), - }; - assert_eq!( - ascii_col, ascii_raw, - "ASCII: UTF-16 col should equal raw byte col + 1" - ); - } } diff --git a/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/flow/java.rs b/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/flow/java.rs index 8761aad60..9b47aabeb 100644 --- a/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/flow/java.rs +++ b/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/flow/java.rs @@ -424,9 +424,9 @@ serialized; let start = node.start_position(); let end = node.end_position(); region.start_line == (start.row as u32) + 1 - && region.start_col == idx.byte_col_to_utf16_col(start.row, start.column) + && region.start_col == idx.byte_col_to_utf16_col(start.row, start.column).unwrap() && region.end_line == (end.row as u32) + 1 - && region.end_col == idx.byte_col_to_utf16_col(end.row, end.column) + && region.end_col == idx.byte_col_to_utf16_col(end.row, end.column).unwrap() } // language=java diff --git a/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/ts_node.rs b/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/ts_node.rs index 996d0acaf..77c56cbb5 100644 --- a/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/ts_node.rs +++ b/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/ts_node.rs @@ -27,11 +27,6 @@ pub struct TreeSitterNode { impl TreeSitterNode { /// Converts the provided [`tree_sitter::Node`] into a `TreeSitterNode`, assigning the /// provided id. - /// - /// Columns are produced as **1-based UTF-16 code-unit** values using `idx`, which must be - /// built from the same source string that was parsed to obtain `node`. Use this constructor - /// everywhere the source string is available β€” it is the single source of truth for - /// `Position.col` semantics. pub fn from_ts_node_with_index( id: NodeId, node: tree_sitter::Node, @@ -48,9 +43,13 @@ impl TreeSitterNode { Self { id, start_line: (start_row as u32) + 1, - start_col: idx.byte_col_to_utf16_col(start_row, start_byte_col), + start_col: idx + .byte_col_to_utf16_col(start_row, start_byte_col) + .unwrap_or(start_byte_col as u32 + 1), end_line: (end_row as u32) + 1, - end_col: idx.byte_col_to_utf16_col(end_row, end_byte_col), + end_col: idx + .byte_col_to_utf16_col(end_row, end_byte_col) + .unwrap_or(end_byte_col as u32 + 1), node_type_id: node.kind_id(), _pd: PhantomData, } @@ -178,66 +177,6 @@ mod tests { assert_eq!(tree_sitter_node.end_col, 15); } - /// Mirrors [`ts_node_line_col_one_based`] but with multibyte content. - /// - /// Source: `"\u{1F680}"; foo(bar);` - /// The emoji lives in a string literal so `foo` is a distinct identifier. - /// Bytes before `foo`: `"` (1) + πŸš€ (4) + `"` (1) + `;` (1) + ` ` (1) = 8 bytes. - /// UTF-16 prefix: `"` (1) + πŸš€ (2) + `"` (1) + `;` (1) + ` ` (1) = 6 units β†’ col 7. - #[test] - fn ts_node_line_col_multibyte() { - let lang = get_tree_sitter_language(&Language::JavaScript); - let mut parser = tree_sitter::Parser::new(); - parser.set_language(&lang).unwrap(); - - // Source: `"\u{1F680}"; foo(bar);` - // πŸš€ is placed inside a string literal so `foo` is clearly a separate identifier. - let source = "\"\u{1F680}\"; foo(bar);"; - let tree = parser.parse(source, None).unwrap(); - - // The second statement is `foo(bar);` - let root = tree.root_node(); - let call_node = root - .child(1) // second expression_statement - .unwrap() - .child(0) // call_expression - .unwrap(); - - // `foo` starts at byte 8 ("πŸš€" = 6 bytes + "; " = 2 bytes). - let fn_node = call_node.child_by_field_name("function").unwrap(); - assert_eq!(fn_node.start_position().column, 8, "byte col of `foo` is 8"); - - let idx = LineColumnIndex::new(source); - let ts_node = TreeSitterNode::::from_ts_node_with_index(0, fn_node, &idx); - assert_eq!(ts_node.start_line, 1); - // UTF-16 col 7: " (1) + πŸš€ surrogate pair (2) + " (1) + ; (1) + space (1) = 6 β†’ col 7. - assert_eq!( - ts_node.start_col, 7, - "UTF-16 col of `foo` should be 7 (emoji as string literal)" - ); - - // Also verify with a CJK prefix: `"ζ—₯"; foo(bar);` - // ζ—₯ = 3 UTF-8 bytes, 1 UTF-16 unit. - // Bytes before foo: " (1) + ζ—₯ (3) + " (1) + ; (1) + space (1) = 7 bytes. - // UTF-16: " (1) + ζ—₯ (1) + " (1) + ; (1) + space (1) = 5 units β†’ col 6. - let source_cjk = "\"\u{65E5}\"; foo(bar);"; - let tree_cjk = parser.parse(source_cjk, None).unwrap(); - let call_cjk = tree_cjk.root_node().child(1).unwrap().child(0).unwrap(); - let fn_cjk = call_cjk.child_by_field_name("function").unwrap(); - assert_eq!( - fn_cjk.start_position().column, - 7, - "byte col of `foo` after ζ—₯ is 7" - ); - let idx_cjk = LineColumnIndex::new(source_cjk); - let ts_node_cjk = TreeSitterNode::::from_ts_node_with_index(1, fn_cjk, &idx_cjk); - // UTF-16 col 6: " (1) + ζ—₯ (1) + " (1) + ; (1) + space (1) = 5 β†’ col 6. - assert_eq!( - ts_node_cjk.start_col, 6, - "UTF-16 col of `foo` after ζ—₯ should be 6" - ); - } - /// Tests that we use the [`tree_sitter::Node::kind_id`] instead of the [`tree_sitter::Node::grammar_id`], which /// has subtle differences. #[rustfmt::skip] diff --git a/crates/static-analysis-kernel/src/analysis/ddsa_lib/ops.rs b/crates/static-analysis-kernel/src/analysis/ddsa_lib/ops.rs index 8d3f9cff4..0baf22e77 100644 --- a/crates/static-analysis-kernel/src/analysis/ddsa_lib/ops.rs +++ b/crates/static-analysis-kernel/src/analysis/ddsa_lib/ops.rs @@ -108,8 +108,6 @@ pub fn op_ts_node_named_children<'s>( // ts_node_bridge and ctx_bridge are separate RefCells so the concurrent borrows are valid. let ctx_bridge = state.borrow::>>(); let ctx_borrow = ctx_bridge.borrow(); - // Use the pre-computed index cached in RootContext (O(1)) rather than - // rescanning the source on every namedChildren call. let idx = ctx_borrow .ddsa_root_context() .line_column_index() @@ -168,8 +166,6 @@ pub fn op_ts_node_parent( OpSafeRawTSNode::from_root_context(root_ctx, |ctx| ctx.get_ts_node_parent(ts_node))?; let parent_ts_node = safe_raw_parent.to_node(); - // Use the pre-computed index cached in RootContext (O(1)) rather than - // rescanning the source on every parent call. let idx = root_ctx .line_column_index() .expect("tree text must be set before ops run"); @@ -267,8 +263,6 @@ pub fn op_digraph_adjacency_list_to_dot( let text = root_ctx .get_text() .expect("tree text should always be `Some` during rule execution"); - // Use the pre-computed index cached in RootContext (O(1)) rather than rebuilding per-vertex. - // `set_text` always builds the index alongside `tree_text`, so this is always `Some` here. let lc_idx = root_ctx .line_column_index() .expect("tree text must be set before ops run"); diff --git a/crates/static-analysis-kernel/src/analysis/ddsa_lib/runtime.rs b/crates/static-analysis-kernel/src/analysis/ddsa_lib/runtime.rs index b4322fb7f..ecbacda0d 100644 --- a/crates/static-analysis-kernel/src/analysis/ddsa_lib/runtime.rs +++ b/crates/static-analysis-kernel/src/analysis/ddsa_lib/runtime.rs @@ -1448,47 +1448,6 @@ function visit(query, filename, code) { assert_eq!(*violation, expected); } - /// `stella_compat.js::getCode` correctly extracts an identifier that follows a multibyte - /// character on the same line. - /// - /// The source is `"\u{1F680}"; protectedName = 1;` β€” the emoji sits inside a string literal - /// so that `protectedName` is a distinct, standalone identifier token. - /// Bytes before `protectedName`: `"` (1) + πŸš€ (4) + `"` (1) + `;` (1) + ` ` (1) = 8 bytes. - /// UTF-16 prefix: `"` (1) + πŸš€ surrogate pair (2) + `"` (1) + `;` (1) + ` ` (1) = 6 units β†’ col 7. - /// With UTF-16 columns, `col - 1` correctly indexes into the JS string via `substring`, - /// so `getCodeForNode(node, code)` returns `"protectedName"`. - #[test] - fn stella_compat_getcode_multibyte() { - let mut rt = cfg_test_v8().new_runtime(); - // πŸš€ is placed in a string literal so that `protectedName` is a distinct identifier. - // Bytes before `protectedName`: " (1) + πŸš€ (4) + " (1) + ; (1) + space (1) = 8 bytes. - // UTF-16 col: " (1) + πŸš€ surrogate pair (2) + " (1) + ; (1) + space (1) = 6 units β†’ col 7. - let text = "\"\u{1F680}\"; protectedName = 1;"; - let filename = "some_filename.js"; - let ts_query = r#"((identifier) @cap_name (#eq? @cap_name "protectedName"))"#; - let rule = r#" -function visit(query, filename, code) { - const node = query.captures["cap_name"]; - const nodeText = getCodeForNode(node, code); - const error = buildError( - node.start.line, - node.start.col, - node.end.line, - node.end.col, - nodeText - ); - addError(error); -} -"#; - let violations = - shorthand_execute_rule_internal(&mut rt, text, filename, ts_query, rule, None).unwrap(); - assert_eq!(violations.len(), 1); - let violation = violations.first().unwrap(); - assert_eq!(violation.message, "protectedName"); - // UTF-16 col 7: "(1) + πŸš€ surrogate pair(2) + "(1) + ;(1) + space(1) = 6 units β†’ col 7. - assert_eq!(violation.base_region.start_col, 7); - } - /// Passing a `VisitArgFilenameCompat` or `VisitArgCodeCompat` instance directly into console.log /// returns the contents of the underlying string. #[test] diff --git a/crates/static-analysis-kernel/src/analysis/tree_sitter.rs b/crates/static-analysis-kernel/src/analysis/tree_sitter.rs index 7b2cebc2f..16865a810 100644 --- a/crates/static-analysis-kernel/src/analysis/tree_sitter.rs +++ b/crates/static-analysis-kernel/src/analysis/tree_sitter.rs @@ -340,10 +340,6 @@ pub fn get_query_nodes( // map a node from the tree-sitter representation into our own internal representation // this is the representation that is passed to the JavaScript layer and how we represent // or expose the node to the end-user. -// -// `idx` is a pre-built [`LineColumnIndex`] for the source that produced `node`; it is used to -// convert tree-sitter's 0-based UTF-8 byte columns into 1-based UTF-16 code-unit columns so -// that `Position.col` is consistent with LSP / VS Code / SARIF v2.1 semantics. pub fn map_node(node: tree_sitter::Node, idx: &LineColumnIndex) -> Option { fn map_node_internal( cursor: &mut tree_sitter::TreeCursor, @@ -380,11 +376,15 @@ pub fn map_node(node: tree_sitter::Node, idx: &LineColumnIndex) -> Option 1. - n.start.col > 1 - }) - .map(|n| n.start.col) - .unwrap_or(0); - assert_eq!(end_col, 8, "UTF-16 col of `end` after CJK char should be 8"); - } - #[test] fn ts_query_cursor_matches_timeout() { let timeout = Duration::from_millis(500); diff --git a/crates/static-analysis-server/src/tree_sitter_tree.rs b/crates/static-analysis-server/src/tree_sitter_tree.rs index 905d6c8bc..c0d34d7a0 100644 --- a/crates/static-analysis-server/src/tree_sitter_tree.rs +++ b/crates/static-analysis-server/src/tree_sitter_tree.rs @@ -73,49 +73,6 @@ mod tests { assert_eq!("module", response.result.unwrap().ast_type); } - /// `process_tree_sitter_tree_request` emits 1-based UTF-16 columns for non-ASCII content. - /// - /// Source: `x = "πŸš€"; num = 5` - /// The emoji (πŸš€, U+1F680) is 4 UTF-8 bytes / 2 UTF-16 code units. `num` starts at byte 12, - /// so its UTF-16 col = 10 (units before it) + 1 = 11. - #[test] - fn test_process_tree_sitter_tree_request_multibyte() { - // Source: `x = "πŸš€"; num = 5` - // printf 'x = "\xF0\x9F\x9A\x80"; num = 5' | base64 - let request = TreeSitterRequest { - code_base64: "eCA9ICLwn5qAIjsgbnVtID0gNQ==".to_string(), - file_encoding: "utf-8".to_string(), - language: Language::Python, - }; - let response = process_tree_sitter_tree_request(request); - assert!(response.errors.is_empty()); - let root = response.result.unwrap(); - - /// Depth-first search for a node whose `start.col` matches `expected_col`. - fn find_col( - node: &crate::model::tree_sitter_tree_node::ServerTreeSitterNode, - expected_col: u32, - ) -> bool { - if node.start.col == expected_col { - return true; - } - node.children.iter().any(|c| find_col(c, expected_col)) - } - - // `num` starts at byte 12 in `x = "πŸš€"; num = 5`. - // UTF-16 units before it: x(1) ' '(1) =(1) ' '(1) "(1) πŸš€(2) "(1) ;(1) ' '(1) = 10 β†’ col 11. - assert!( - find_col(&root, 11), - "expected a node with UTF-16 start.col == 11 (position of `num` after πŸš€)" - ); - - // Ensure that no node reports col 13 (what the raw byte col + 1 would give for `num`). - assert!( - !find_col(&root, 13), - "no node should report raw byte col 13 β€” that would indicate the UTF-16 fix is not applied" - ); - } - #[test] fn test_process_tree_sitter_invalid_base64() { let request = TreeSitterRequest {