diff --git a/.github/scripts/check-regressions.js b/.github/scripts/check-regressions.js index 15bd5303a..0126f5fdd 100644 --- a/.github/scripts/check-regressions.js +++ b/.github/scripts/check-regressions.js @@ -22,7 +22,33 @@ const parseFile = (filepath) => { const current = {}; current.message = i.message; current.ruleId = i.ruleId; - current.physicalLocation = i.locations[0].physicalLocation; + // ------------------------------------------------------------------ + // IMPORTANT (TEMPORARY): strip `startColumn` / `endColumn` from the region. + // + // PR #914 changes the kernel from emitting 1-based UTF-8 byte columns to + // 1-based UTF-16 code-unit columns (matching LSP / VS Code / SARIF v2.1). + // + // While that PR is in review, `main` still emits byte columns and the + // feature branch emits UTF-16 columns. The regression check compares + // results as JSON strings, so it would flag every violation that lives + // on a line containing a non-ASCII character as a "removed + added" + // pair — same file, same rule, same line; only the column number drifts + // by N (where N is the number of multibyte chars before the position). + // + // To unblock CI for #914 we compare by + // (file, ruleId, message, startLine, endLine) + // and intentionally ignore columns. This is a one-shot loosening for + // the byte→UTF-16 transition. A stacked follow-up PR will restore + // `startColumn` / `endColumn` to the comparison key once #914 lands on + // `main` (at which point both runs are on UTF-16 columns again and + // column-level regression detection becomes meaningful again). + // ------------------------------------------------------------------ + const { startColumn: _startCol, endColumn: _endCol, ...regionWithoutColumns } = + i.locations[0].physicalLocation.region; + current.physicalLocation = { + ...i.locations[0].physicalLocation, + region: regionWithoutColumns, + }; results.push(current); } @@ -55,13 +81,16 @@ const main = async () => { let table1 = []; if (count1 > 0 || count2 > 0) { + // Location display only shows `startLine-endLine`. Columns are + // intentionally omitted to match the comparison key (see `parseFile` + // above for the full rationale on the byte→UTF-16 transition). for (const item of diff1) { const json = JSON.parse(item); table1.push([ { data: json.physicalLocation.artifactLocation.uri }, { data: json.message.text }, { data: json.ruleId }, - { data: `${json.physicalLocation.region.startLine}:${json.physicalLocation.region.startColumn}-${json.physicalLocation.region.endLine}:${json.physicalLocation.region.endColumn}` }, + { data: `${json.physicalLocation.region.startLine}-${json.physicalLocation.region.endLine}` }, { data: dupes1[json.ruleId] }, ]); } @@ -74,7 +103,7 @@ const main = async () => { { data: json.physicalLocation.artifactLocation.uri }, { data: json.message.text }, { data: json.ruleId }, - { data: `${json.physicalLocation.region.startLine}:${json.physicalLocation.region.startColumn}-${json.physicalLocation.region.endLine}:${json.physicalLocation.region.endColumn}` }, + { data: `${json.physicalLocation.region.startLine}-${json.physicalLocation.region.endLine}` }, { data: dupes2[json.ruleId] }, ]); } diff --git a/Cargo.lock b/Cargo.lock index 2761a58b5..2d0280439 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -696,6 +696,7 @@ dependencies = [ "anyhow", "bstr", "derive_builder", + "line-index", "serde", ] @@ -2540,6 +2541,16 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "line-index" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e27e0ed5a392a7f5ba0b3808a2afccff16c64933312c84b57618b49d1209bd2" +dependencies = [ + "nohash-hasher", + "text-size", +] + [[package]] name = "linux-raw-sys" version = "0.4.15" @@ -2766,6 +2777,12 @@ dependencies = [ "smallvec", ] +[[package]] +name = "nohash-hasher" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bf50223579dc7cdcfb3bfcacf7069ff68243f8c363f62ffa99cf000a6b9c451" + [[package]] name = "nom" version = "7.1.3" diff --git a/LICENSE-3rdparty.csv b/LICENSE-3rdparty.csv index 20a3e1316..543bdb646 100644 --- a/LICENSE-3rdparty.csv +++ b/LICENSE-3rdparty.csv @@ -12,6 +12,7 @@ graphviz-rust,https://github.com/besok/graphviz-rust,MIT,Copyright (c) 2013 Bori indexmap,https://github.com/indexmap-rs/indexmap,MIT and Apache-2.0,Copyright (c) 2016-2017 bluss indicatif,https://crates.io/crates/indicatif,MIT,Copyright (c) 2017 Armin Ronacher itertools,https://github.com/rust-itertools/itertools,MIT,Copyright 2015 itertools Developers +line-index,https://github.com/rust-lang/rust-analyzer,MIT OR Apache-2.0,Copyright (c) 2019 rust-analyzer developers num_cpus,https://github.com/seanmonstar/num_cpus,MIT, Copyright (c) 2015 Sean McArthur percent-encoding,https://github.com/servo/rust-url/,MIT, Copyright (c) 2013-2022 The rust-url developers prettytable-rs,https://github.com/phsym/prettytable-rs/,BSD-3-Clause,Copyright (c) 2022 Pierre-Henri Symoneaux diff --git a/crates/common/Cargo.toml b/crates/common/Cargo.toml index 52512fccb..a5a4997f3 100644 --- a/crates/common/Cargo.toml +++ b/crates/common/Cargo.toml @@ -9,4 +9,5 @@ serde = { version = "1.0.219", features = ["derive"] } derive_builder = { workspace = true } # other -bstr = "1.12.0" \ No newline at end of file +bstr = "1.12.0" +line-index = "0.1.2" \ No newline at end of file diff --git a/crates/common/src/model/position.rs b/crates/common/src/model/position.rs index d7aa3d351..503b29c12 100644 --- a/crates/common/src/model/position.rs +++ b/crates/common/src/model/position.rs @@ -6,6 +6,18 @@ use derive_builder::Builder; use serde::{Deserialize, Serialize}; use std::fmt; +/// A source-code position. +/// +/// * `line` — 1-based line number. +/// * `col` — 1-based column number. The unit depends on the producer: +/// - **Static-analysis kernel** (tree-sitter path): UTF-16 code-unit column. Matches LSP, +/// VS Code, and the SARIF v2.1 default encoding. On ASCII-only lines, one UTF-16 code unit +/// equals one byte, so the value is identical to a byte-column. For non-ASCII characters +/// (CJK ideographs, emoji surrogate pairs, combining marks, etc.) the count reflects UTF-16 +/// surrogate-pair expansion. +/// - **Secrets scanner** (`get_position_in_string`): Unicode-grapheme-cluster column. A single +/// user-perceived character (including emoji and composed sequences) counts as one unit +/// regardless of its UTF-8 or UTF-16 size. #[derive(Deserialize, Debug, Serialize, Clone, Copy, Builder, PartialEq, Eq, Hash)] pub struct Position { pub line: u32, diff --git a/crates/common/src/utils/position_utils.rs b/crates/common/src/utils/position_utils.rs index 4458abfca..9e3216536 100644 --- a/crates/common/src/utils/position_utils.rs +++ b/crates/common/src/utils/position_utils.rs @@ -1,6 +1,43 @@ use crate::model::position::Position; use bstr::BStr; use bstr::ByteSlice; +use line_index::{LineCol, LineIndex, WideEncoding}; + +/// Precomputed per-line index for fast repeated UTF-8 byte-column → UTF-16 code-unit column +/// conversion. +/// +/// Build once per source string with [`LineColumnIndex::new`], then call +/// [`byte_col_to_utf16_col`](LineColumnIndex::byte_col_to_utf16_col) for every tree-sitter node +/// on that source. Backed by [`line_index::LineIndex`] from the rust-analyzer project. +/// +/// ## Line model +/// +/// [`line_index::LineIndex`] splits on `\n` only, which mirrors tree-sitter's line model exactly. +/// Tree-sitter does not treat a bare `\r` (classic Mac OS 9) as a line terminator; for Windows +/// `\r\n` files the `\r` is counted as part of the column on the same line, matching +/// tree-sitter's `Point.column` values. Using a broader splitter (e.g. Unicode line endings) +/// would diverge from tree-sitter and produce wrong UTF-16 columns. +#[derive(Debug)] +pub struct LineColumnIndex(LineIndex); + +impl LineColumnIndex { + /// Builds the index by scanning `source`. + pub fn new(source: &str) -> Self { + Self(LineIndex::new(source)) + } + + /// Converts a tree-sitter 0-based `(row, byte_col)` point to a 1-based UTF-16 code-unit + /// column. + /// + /// Returns `None` if `(row, byte_col)` falls outside the indexed source. + pub fn byte_col_to_utf16_col(&self, row: usize, byte_col: usize) -> Option { + let lc = LineCol { + line: row as u32, + col: byte_col as u32, + }; + self.0.to_wide(WideEncoding::Utf16, lc).map(|w| w.col + 1) + } +} /// Get position of an offset in a code and return a [Position]. pub fn get_position_in_string(content: &str, offset: usize) -> anyhow::Result { @@ -137,4 +174,16 @@ mod tests { Position::new(3, 13) ); } + + #[test] + fn byte_col_to_utf16_col_calls_to_wide_utf16() { + // Single source string with one non-ASCII char. We only need to prove our wrapper + // delegates to line_index::LineIndex::to_wide(WideEncoding::Utf16, ..) and adds 1. + // Exhaustive encoding cases live in the line-index crate's own test suite. + let idx = LineColumnIndex::new("a\u{65E5}b"); // a 日 b + // byte 0 ('a') → 0 UTF-16 units before, +1 = 1 + assert_eq!(idx.byte_col_to_utf16_col(0, 0).unwrap(), 1); + // byte 4 ('b' after 日 which is 3 UTF-8 bytes / 1 UTF-16 unit) → 2 UTF-16 units before, +1 = 3 + assert_eq!(idx.byte_col_to_utf16_col(0, 4).unwrap(), 3); + } } diff --git a/crates/static-analysis-kernel/src/analysis/ddsa_lib/bridge/query_match.rs b/crates/static-analysis-kernel/src/analysis/ddsa_lib/bridge/query_match.rs index c41e1e236..f8b95f18d 100644 --- a/crates/static-analysis-kernel/src/analysis/ddsa_lib/bridge/query_match.rs +++ b/crates/static-analysis-kernel/src/analysis/ddsa_lib/bridge/query_match.rs @@ -7,6 +7,7 @@ use crate::analysis::ddsa_lib::common::{Class, DDSAJsRuntimeError, NodeId, Stell use crate::analysis::ddsa_lib::js; use crate::analysis::ddsa_lib::v8_ds::MirroredVec; use crate::analysis::tree_sitter::QueryMatch; +use common::utils::position_utils::LineColumnIndex; use deno_core::v8; use deno_core::v8::HandleScope; @@ -40,6 +41,7 @@ impl QueryMatchBridge { scope: &mut HandleScope, matches: impl Into>>>, node_bridge: &mut TsNodeBridge, + idx: &LineColumnIndex, ) { let matches = matches.into(); // Pass each node in via the bridge (assigning it an id), and use this id to transform @@ -49,7 +51,7 @@ impl QueryMatchBridge { .map(|q_match| { q_match .into_iter() - .map(|capture| node_bridge.insert_capture(scope, capture)) + .map(|capture| node_bridge.insert_capture(scope, capture, idx)) .collect::>() .into() }) @@ -89,6 +91,7 @@ mod tests { use crate::analysis::ddsa_lib::v8_ds::MirroredVec; use crate::analysis::tree_sitter::{get_tree, QueryMatch, TSCaptureContent, TSQuery}; use crate::model::common::Language; + use common::utils::position_utils::LineColumnIndex; use deno_core::JsRuntime; fn setup_bridge() -> (JsRuntime, QueryMatchBridge, TsNodeBridge) { @@ -125,9 +128,10 @@ const ghi = 'hello' + ' world'; "; let query = TSQuery::try_new(&tree.language(), query).unwrap(); let matches = query.cursor().matches(tree.root_node(), text, None); + let idx = LineColumnIndex::new(text); assert!(query_match_bridge.is_empty()); assert!(ts_node_bridge.is_empty()); - query_match_bridge.set_data(scope, matches.clone(), &mut ts_node_bridge); + query_match_bridge.set_data(scope, matches.clone(), &mut ts_node_bridge, &idx); assert_eq!(query_match_bridge.len(), 3); assert_eq!(ts_node_bridge.len(), 3); @@ -139,16 +143,17 @@ const ghi = 'hello' + ' world'; } // The `QueryMatchBridge` doesn't clear nodes from `TsNodeBridge` when values change. - query_match_bridge.set_data(scope, &matches[0..2], &mut ts_node_bridge); + query_match_bridge.set_data(scope, &matches[0..2], &mut ts_node_bridge, &idx); assert_eq!(query_match_bridge.len(), 2); assert_eq!(ts_node_bridge.len(), 3); - let text = "\ + let text2 = "\ // Arbitrary JavaScript that contains `identifier` CST nodes const alpha = 'bravo'; "; - let tree = get_tree(text, &Language::JavaScript).unwrap(); - let matches = query.cursor().matches(tree.root_node(), text, None); - query_match_bridge.set_data(scope, matches, &mut ts_node_bridge); + let tree2 = get_tree(text2, &Language::JavaScript).unwrap(); + let matches2 = query.cursor().matches(tree2.root_node(), text2, None); + let idx2 = LineColumnIndex::new(text2); + query_match_bridge.set_data(scope, matches2, &mut ts_node_bridge, &idx2); assert_eq!(get_node_id_at_idx(&query_match_bridge, 0), 3); assert_eq!(ts_node_bridge.len(), 4); } diff --git a/crates/static-analysis-kernel/src/analysis/ddsa_lib/bridge/ts_node.rs b/crates/static-analysis-kernel/src/analysis/ddsa_lib/bridge/ts_node.rs index d2740856d..24871b054 100644 --- a/crates/static-analysis-kernel/src/analysis/ddsa_lib/bridge/ts_node.rs +++ b/crates/static-analysis-kernel/src/analysis/ddsa_lib/bridge/ts_node.rs @@ -7,6 +7,7 @@ use crate::analysis::ddsa_lib::js::TreeSitterNode; use crate::analysis::ddsa_lib::v8_ds::MirroredIndexMap; use crate::analysis::ddsa_lib::{js, RawTSNode}; use crate::analysis::tree_sitter::{TSCaptureContent, TSQueryCapture}; +use common::utils::position_utils::LineColumnIndex; use deno_core::v8; use deno_core::v8::HandleScope; @@ -39,7 +40,16 @@ impl TsNodeBridge { /// Inserts a tree-sitter node into the bridge, returning the `NodeId` it was assigned. /// /// If the node already existed in the bridge, the existing `NodeId` will be returned. - pub fn insert(&mut self, scope: &mut HandleScope, node: tree_sitter::Node) -> NodeId { + /// + /// `idx` must be built from the same source string that produced `node`; it is forwarded to + /// [`TreeSitterNode::from_ts_node_with_index`] so that `col` values are 1-based UTF-16 + /// code units. + pub fn insert( + &mut self, + scope: &mut HandleScope, + node: tree_sitter::Node, + idx: &LineColumnIndex, + ) -> NodeId { let raw_ts_node = RawTSNode::new(node); if let Some((_, _, node_id)) = self.mirrored_im.get_full(&raw_ts_node) { *node_id @@ -48,7 +58,7 @@ impl TsNodeBridge { // to the map's current length. Because we use this as the `NodeId`, we can pre-assign // this, knowing this invariant (index == NodeId) will hold. let node_id = self.mirrored_im.len() as NodeId; - let v8_node = self.build_v8_node(scope, node, node_id); + let v8_node = self.build_v8_node(scope, node, node_id, idx); self.mirrored_im .insert_with(scope, raw_ts_node, node_id, |scope, _key, value| { // In Rust, we map from `tree_sitter::Node` to `NodeId`. @@ -76,14 +86,15 @@ impl TsNodeBridge { .map(|(raw_node, _)| raw_node) } - /// Serializes a [`tree_sitter::Node`] to v8. + /// Serializes a [`tree_sitter::Node`] to v8 using UTF-16 column values from `idx`. fn build_v8_node<'s>( &self, scope: &mut HandleScope<'s>, node: tree_sitter::Node, id: NodeId, + idx: &LineColumnIndex, ) -> v8::Local<'s, v8::Object> { - let ts_node = TreeSitterNode::::from_ts_node(id, node); + let ts_node = TreeSitterNode::::from_ts_node_with_index(id, node, idx); self.js_class.new_instance(scope, ts_node) } @@ -108,24 +119,26 @@ impl TsNodeBridge { self.mirrored_im.as_local(scope) } - /// Inserts the nodes from a [`TSQueryCapture`] into a v8 scope, consuming the `QueryCapture`. - /// Returns a transformed `QueryCapture` containing the ids of the inserted nodes. + /// Inserts the nodes from a [`TSQueryCapture`] into a v8 scope, consuming + /// the `QueryCapture`. Returns a transformed `QueryCapture` containing the ids of the inserted + /// nodes. pub fn insert_capture( &mut self, scope: &mut HandleScope, capture: TSQueryCapture, + idx: &LineColumnIndex, ) -> TSQueryCapture { TSQueryCapture:: { name: capture.name, contents: match capture.contents { TSCaptureContent::Single(node) => { - let nid = self.insert(scope, node); + let nid = self.insert(scope, node, idx); TSCaptureContent::Single(nid) } TSCaptureContent::Multi(nodes) => { let nids = nodes .into_iter() - .map(|node| self.insert(scope, node)) + .map(|node| self.insert(scope, node, idx)) .collect::>(); TSCaptureContent::Multi(nids) } @@ -158,6 +171,7 @@ mod tests { }; use crate::analysis::ddsa_lib::RawTSNode; use crate::model::common::Language; + use common::utils::position_utils::LineColumnIndex; use deno_core::v8; use deno_core::v8::HandleScope; use std::cell::RefCell; @@ -205,22 +219,24 @@ mod tests { let scope = &mut runtime.handle_scope(); let mut bridge = bridge.borrow_mut(); - let tree = TsTree::new(r#"const val = foo(bar, baz);"#, Language::JavaScript); + let source = r#"const val = foo(bar, baz);"#; + let tree = TsTree::new(source, Language::JavaScript); + let idx = LineColumnIndex::new(source); let foo = tree.find_named_nodes(Some("foo"), None)[0]; let bar = tree.find_named_nodes(Some("bar"), None)[0]; let baz = tree.find_named_nodes(Some("baz"), None)[0]; assert!(bridge.v8_get(scope, 0).is_none()); - assert_eq!(bridge.insert(scope, foo), 0); + assert_eq!(bridge.insert(scope, foo, &idx), 0); let v8_tsn = bridge.v8_get(scope, 0).unwrap(); assert!(ts_node_eq(scope, v8_tsn, foo)); assert!(bridge.v8_get(scope, 1).is_none()); - assert_eq!(bridge.insert(scope, bar), 1); - assert_eq!(bridge.insert(scope, baz), 2); + assert_eq!(bridge.insert(scope, bar, &idx), 1); + assert_eq!(bridge.insert(scope, baz, &idx), 2); bridge.clear(scope); assert!(bridge.is_empty()); - assert_eq!(bridge.insert(scope, baz), 0); + assert_eq!(bridge.insert(scope, baz, &idx), 0); let v8_tsn = bridge.v8_get(scope, 0).unwrap(); assert!(ts_node_eq(scope, v8_tsn, baz)); } @@ -232,13 +248,15 @@ mod tests { let scope = &mut runtime.handle_scope(); let mut bridge = bridge.borrow_mut(); - let tree = TsTree::new(r#"const val = foo(bar, baz);"#, Language::JavaScript); + let source = r#"const val = foo(bar, baz);"#; + let tree = TsTree::new(source, Language::JavaScript); + let idx = LineColumnIndex::new(source); let foo = tree.find_named_nodes(Some("foo"), None)[0]; let bar = tree.find_named_nodes(Some("bar"), None)[0]; let baz = tree.find_named_nodes(Some("baz"), None)[0]; for node in [foo, bar, baz] { - bridge.insert(scope, node); + bridge.insert(scope, node, &idx); } let bar_raw = RawTSNode::new(bar); @@ -252,13 +270,15 @@ mod tests { let scope = &mut runtime.handle_scope(); let mut bridge = bridge.borrow_mut(); - let tree = TsTree::new(r#"const val = foo(bar, baz);"#, Language::JavaScript); + let source = r#"const val = foo(bar, baz);"#; + let tree = TsTree::new(source, Language::JavaScript); + let idx = LineColumnIndex::new(source); let foo = tree.find_named_nodes(Some("foo"), None)[0]; - assert_eq!(bridge.insert(scope, foo), 0); + assert_eq!(bridge.insert(scope, foo, &idx), 0); assert!(bridge.v8_get(scope, 0).is_some()); assert!(bridge.v8_get(scope, 1).is_none()); - assert_eq!(bridge.insert(scope, foo), 0); + assert_eq!(bridge.insert(scope, foo, &idx), 0); assert!(bridge.v8_get(scope, 1).is_none()); } @@ -271,6 +291,7 @@ const abc = 123; const def = 456; "; let tree = TsTree::new(file_contents, Language::JavaScript); + let idx = LineColumnIndex::new(file_contents); let file_contents = Arc::::from(file_contents); let file_name = Arc::::from("file_name.js"); @@ -284,8 +305,8 @@ const def = 456; .set_root_context(scope, &tree.tree(), &file_contents, &file_name); let node_0 = tree.find_named_nodes(Some("abc"), Some("identifier"))[0]; let node_1 = tree.find_named_nodes(Some("456"), Some("number"))[0]; - ts_node_bridge.borrow_mut().insert(scope, node_0); - ts_node_bridge.borrow_mut().insert(scope, node_1); + ts_node_bridge.borrow_mut().insert(scope, node_0, &idx); + ts_node_bridge.borrow_mut().insert(scope, node_1, &idx); let expected = [(0, "abc"), (1, "456")] .map(|(nid, text)| (format!("TS_NODES.get({}).text;", nid), text)); for (code, text) in &expected { @@ -311,6 +332,7 @@ const def = 456; let (mut runtime, ts_node_bridge) = setup_bridge(); let file_contents = "const abc = 123;"; let tree = TsTree::new(file_contents, Language::JavaScript); + let idx = LineColumnIndex::new(file_contents); let file_contents = Arc::::from(file_contents); let file_name = Arc::::from("file_name.js"); @@ -323,7 +345,7 @@ const def = 456; .borrow_mut() .set_root_context(scope, &tree.tree(), &file_contents, &file_name); let node_0 = tree.find_named_nodes(Some("abc"), Some("identifier"))[0]; - ts_node_bridge.borrow_mut().insert(scope, node_0); + ts_node_bridge.borrow_mut().insert(scope, node_0, &idx); let code = "\ const node = TS_NODES.get(0); diff --git a/crates/static-analysis-kernel/src/analysis/ddsa_lib/context/root.rs b/crates/static-analysis-kernel/src/analysis/ddsa_lib/context/root.rs index 9e50d29f5..a62e61179 100644 --- a/crates/static-analysis-kernel/src/analysis/ddsa_lib/context/root.rs +++ b/crates/static-analysis-kernel/src/analysis/ddsa_lib/context/root.rs @@ -3,6 +3,7 @@ // Copyright 2024 Datadog, Inc. use crate::analysis::ddsa_lib::RawTSNode; +use common::utils::position_utils::LineColumnIndex; use std::cell::RefCell; use std::collections::HashMap; use std::sync::Arc; @@ -17,6 +18,8 @@ pub struct RootContext { tree: Option>, /// The source string that was parsed by tree-sitter to generate `tree`. tree_text: Option>, + /// Pre-computed line/column index for `tree_text`. + lci: Option, /// A filename associated with a rule execution. filename: Option>, @@ -46,9 +49,16 @@ impl RootContext { /// Assigns the provided text string to the context. If an existing text string was assigned, it /// will be returned as `Some(text)`. pub fn set_text(&mut self, text: Arc) -> Option> { + self.lci = Some(LineColumnIndex::new(&text)); Option::replace(&mut self.tree_text, text) } + /// Returns a reference to the cached [`LineColumnIndex`], or `None` if no source text has been + /// assigned yet. + pub fn line_column_index(&self) -> Option<&LineColumnIndex> { + self.lci.as_ref() + } + /// Returns a reference to the underlying [`tree_sitter::Tree`], if it exists. pub fn get_tree(&self) -> Option<&Arc> { self.tree.as_ref() diff --git a/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/flow/graph.rs b/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/flow/graph.rs index 06926b951..5a62f2224 100644 --- a/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/flow/graph.rs +++ b/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/flow/graph.rs @@ -5,6 +5,7 @@ use crate::analysis::ddsa_lib::bridge::TsNodeBridge; use crate::analysis::ddsa_lib::common::{swallow_v8_error, v8_uint}; use crate::analysis::ddsa_lib::test_utils::TsTree; +use common::utils::position_utils::LineColumnIndex; use deno_core::v8; use graphviz_rust::dot_structures; use graphviz_rust::dot_structures::{Attribute, Stmt}; @@ -621,11 +622,18 @@ impl From> for dot_structures::Edge { impl<'a> LocatedNode<'a> { /// Constructs a new `LocatedNode` from a tree-sitter node. - pub fn new_cst(node: tree_sitter::Node, text: &'a str) -> LocatedNode<'a> { + pub fn new_cst( + node: tree_sitter::Node, + text: &'a str, + idx: &LineColumnIndex, + ) -> LocatedNode<'a> { + let start = node.start_position(); Self::Cst { text, - line: node.start_position().row + 1, - col: node.start_position().column + 1, + line: start.row + 1, + col: idx + .byte_col_to_utf16_col(start.row, start.column) + .unwrap_or(start.column as u32 + 1) as usize, cst_kind: node.kind(), } } diff --git a/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/flow/graph_test_utils.rs b/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/flow/graph_test_utils.rs index a107790f6..7d5c4da12 100644 --- a/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/flow/graph_test_utils.rs +++ b/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/flow/graph_test_utils.rs @@ -12,6 +12,7 @@ use crate::analysis::ddsa_lib::js::flow::graph::{ VertexId, VertexKind, COL, CST_KIND, KIND, LINE, TEXT, V_KIND, }; use crate::analysis::ddsa_lib::test_utils::TsTree; +use common::utils::position_utils::LineColumnIndex; use deno_core::v8; use graphviz_rust::dot_structures; @@ -79,9 +80,11 @@ pub fn cst_dot_digraph( let stmts = digraph_stmts(&graph); let tree = ts_tree.tree(); + // Build the index once before the per-node iterator to avoid O(source_len × nodes). + let lc_idx = LineColumnIndex::new(ts_tree.source()); let candidates = TsTree::preorder_nodes(root_node.unwrap_or(tree.root_node())) .iter() - .map(|&node| LocatedNode::new_cst(node, ts_tree.text(node))) + .map(|&node| LocatedNode::new_cst(node, ts_tree.text(node), &lc_idx)) .collect::>(); // The `String` in the tuple is the original ID of the vertex (as specified in the DOT). diff --git a/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/flow/java.rs b/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/flow/java.rs index 25d103d2b..9b47aabeb 100644 --- a/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/flow/java.rs +++ b/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/flow/java.rs @@ -101,6 +101,7 @@ mod tests { use crate::model::common::Language; use crate::model::rule::{RuleCategory, RuleSeverity}; use common::model::position; + use common::utils::position_utils::LineColumnIndex; use deno_core::v8; use std::sync::Arc; @@ -118,13 +119,14 @@ mod tests { pub(crate) fn setup(source_text: &str) -> (JsRuntime, TsTree) { let mut rt = cfg_test_v8().new_runtime(); let tree = TsTree::new(source_text, Language::Java); - let source_text = Arc::::from(source_text); + let idx = LineColumnIndex::new(source_text); + let source_arc = Arc::::from(source_text); let filename = Arc::::from("test_doesnt_use_filename"); - rt.bridge_context().borrow_mut().set_root_context(&mut rt.v8_handle_scope(), &tree.tree(), &source_text, &filename); + rt.bridge_context().borrow_mut().set_root_context(&mut rt.v8_handle_scope(), &tree.tree(), &source_arc, &filename); let tsn_bridge = rt.bridge_ts_node(); let mut tsn_bridge = tsn_bridge.borrow_mut(); for node in tree.find_named_nodes(None, None) { - tsn_bridge.insert(&mut rt.v8_handle_scope(), node); + tsn_bridge.insert(&mut rt.v8_handle_scope(), node, &idx); } (rt, tree) @@ -413,11 +415,18 @@ serialized; #[test] fn violation_taint_flow_regions() { let v_converter = ViolationConverter::new(); - fn position_eq(region: js::CodeRegion, node: tree_sitter::Node) -> bool { - region.start_line == (node.start_position().row as u32) + 1 - && region.start_col == (node.start_position().column as u32) + 1 - && region.end_line == (node.end_position().row as u32) + 1 - && region.end_col == (node.end_position().column as u32) + 1 + fn position_eq( + region: js::CodeRegion, + node: tree_sitter::Node, + source: &str, + ) -> bool { + let idx = LineColumnIndex::new(source); + let start = node.start_position(); + let end = node.end_position(); + region.start_line == (start.row as u32) + 1 + && region.start_col == idx.byte_col_to_utf16_col(start.row, start.column).unwrap() + && region.end_line == (end.row as u32) + 1 + && region.end_col == idx.byte_col_to_utf16_col(end.row, end.column).unwrap() } // language=java @@ -465,7 +474,7 @@ v; ) .unwrap(); - assert!(position_eq(violation.base_region, expected_flow[0])); + assert!(position_eq(violation.base_region, expected_flow[0], code)); let taint_flow_regions = violation.taint_flow_regions.unwrap(); assert_eq!(taint_flow_regions.len(), expected_flow.len()); @@ -473,7 +482,7 @@ v; .iter() .zip(expected_flow) .for_each(|(®ion, node)| { - assert!(position_eq(region, node)); + assert!(position_eq(region, node, code)); }); } diff --git a/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/ts_node.rs b/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/ts_node.rs index 273133e44..77c56cbb5 100644 --- a/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/ts_node.rs +++ b/crates/static-analysis-kernel/src/analysis/ddsa_lib/js/ts_node.rs @@ -5,6 +5,7 @@ use crate::analysis::ddsa_lib::common::{ load_function, swallow_v8_error, v8_uint, Class, DDSAJsRuntimeError, Instance, NodeId, }; +use common::utils::position_utils::LineColumnIndex; use deno_core::v8; use deno_core::v8::HandleScope; use std::marker::PhantomData; @@ -24,26 +25,31 @@ pub struct TreeSitterNode { } impl TreeSitterNode { - /// Converts the provided [`tree_sitter::Node`] into a `TreeSitterNode`, assigning the provided id. - pub fn from_ts_node(id: NodeId, node: tree_sitter::Node) -> Self { - // NOTE: We normalize the 0-based `tree_sitter::Point` to be 1-based. - fn normalize_ts_point_num(num: usize) -> u32 { - num as u32 + 1 - } + /// Converts the provided [`tree_sitter::Node`] into a `TreeSitterNode`, assigning the + /// provided id. + pub fn from_ts_node_with_index( + id: NodeId, + node: tree_sitter::Node, + idx: &LineColumnIndex, + ) -> Self { let tree_sitter::Point { - row: start_line, - column: start_col, + row: start_row, + column: start_byte_col, } = node.start_position(); let tree_sitter::Point { - row: end_line, - column: end_col, + row: end_row, + column: end_byte_col, } = node.end_position(); Self { id, - start_line: normalize_ts_point_num(start_line), - start_col: normalize_ts_point_num(start_col), - end_line: normalize_ts_point_num(end_line), - end_col: normalize_ts_point_num(end_col), + start_line: (start_row as u32) + 1, + start_col: idx + .byte_col_to_utf16_col(start_row, start_byte_col) + .unwrap_or(start_byte_col as u32 + 1), + end_line: (end_row as u32) + 1, + end_col: idx + .byte_col_to_utf16_col(end_row, end_byte_col) + .unwrap_or(end_byte_col as u32 + 1), node_type_id: node.kind_id(), _pd: PhantomData, } @@ -104,6 +110,7 @@ mod tests { }; use crate::analysis::tree_sitter::get_tree_sitter_language; use crate::model::common::Language; + use common::utils::position_utils::LineColumnIndex; use deno_core::v8; use std::marker::PhantomData; use tree_sitter::StreamingIterator; @@ -152,7 +159,8 @@ mod tests { let mut parser = tree_sitter::Parser::new(); parser.set_language(&lang).unwrap(); - let tree = parser.parse(r#"foo(bar, baz);"#, None).unwrap(); + let source = r#"foo(bar, baz);"#; + let tree = parser.parse(source, None).unwrap(); let root = tree.root_node(); assert_eq!(root.start_position().row, 0); @@ -160,7 +168,8 @@ mod tests { assert_eq!(root.end_position().row, 0); assert_eq!(root.end_position().column, 14); - let tree_sitter_node = TreeSitterNode::::from_ts_node(0, root); + let idx = LineColumnIndex::new(source); + let tree_sitter_node = TreeSitterNode::::from_ts_node_with_index(0, root, &idx); assert_eq!(tree_sitter_node.start_line, 1); assert_eq!(tree_sitter_node.start_col, 1); @@ -188,7 +197,8 @@ mod tests { // This should resolve to `assert_ne!("identifier", "property_identifier")` assert_ne!(ts_node.grammar_name(), ts_node.kind()); - let tree_sitter_node = TreeSitterNode::::from_ts_node(0, ts_node); + let idx = LineColumnIndex::new(text); + let tree_sitter_node = TreeSitterNode::::from_ts_node_with_index(0, ts_node, &idx); assert_eq!(tree_sitter_node.node_type_id, ts_node.kind_id()); } diff --git a/crates/static-analysis-kernel/src/analysis/ddsa_lib/ops.rs b/crates/static-analysis-kernel/src/analysis/ddsa_lib/ops.rs index 3fe23047c..0baf22e77 100644 --- a/crates/static-analysis-kernel/src/analysis/ddsa_lib/ops.rs +++ b/crates/static-analysis-kernel/src/analysis/ddsa_lib/ops.rs @@ -104,6 +104,14 @@ pub fn op_ts_node_named_children<'s>( None } else { let array = v8::Array::new(scope, count as i32); + // ctx_bridge is only needed in this branch. Build the index while ctx_borrow is alive; + // ts_node_bridge and ctx_bridge are separate RefCells so the concurrent borrows are valid. + let ctx_bridge = state.borrow::>>(); + let ctx_borrow = ctx_bridge.borrow(); + let idx = ctx_borrow + .ddsa_root_context() + .line_column_index() + .expect("tree text must be set before ops run"); let mut bridge_ref = ts_node_bridge.borrow_mut(); let mut cursor = ts_node.walk(); @@ -119,7 +127,7 @@ pub fn op_ts_node_named_children<'s>( } let child_node = cursor.node(); - let nid = bridge_ref.insert(scope, child_node); + let nid = bridge_ref.insert(scope, child_node, idx); let nid = v8_uint(scope, nid); let nid_index = i * 2; array.set_index(scope, nid_index as u32, nid.into()); @@ -152,14 +160,17 @@ pub fn op_ts_node_parent( let safe_raw_ts_node = OpSafeRawTSNode::from_tsn_bridge(&ts_node_bridge.borrow(), node_id)?; let ts_node = safe_raw_ts_node.to_node(); - let ctx_bridge = ctx_bridge.borrow_mut(); - let root_ctx = ctx_bridge.ddsa_root_context(); + let ctx_borrow = ctx_bridge.borrow(); + let root_ctx = ctx_borrow.ddsa_root_context(); let safe_raw_parent = OpSafeRawTSNode::from_root_context(root_ctx, |ctx| ctx.get_ts_node_parent(ts_node))?; let parent_ts_node = safe_raw_parent.to_node(); + let idx = root_ctx + .line_column_index() + .expect("tree text must be set before ops run"); let mut bridge_ref = ts_node_bridge.borrow_mut(); - let nid = bridge_ref.insert(scope, parent_ts_node); + let nid = bridge_ref.insert(scope, parent_ts_node, idx); Some(nid) } @@ -248,10 +259,13 @@ pub fn op_digraph_adjacency_list_to_dot( let tsn_bridge = tsn_bridge.borrow(); let ctx_bridge = state.borrow::>>(); let ctx_bridge = ctx_bridge.borrow(); - let text = ctx_bridge - .ddsa_root_context() + let root_ctx = ctx_bridge.ddsa_root_context(); + let text = root_ctx .get_text() .expect("tree text should always be `Some` during rule execution"); + let lc_idx = root_ctx + .line_column_index() + .expect("tree text must be set before ops run"); // Transformation: // If `VertexKind::CST`: constructs a dot node from metadata from the `TsNodeBridge` and `RootContext`. @@ -269,7 +283,7 @@ pub fn op_digraph_adjacency_list_to_dot( let node_text = ts_node .utf8_text(text.as_bytes()) .expect("bytes should be utf8 sequence"); - Some(LocatedNode::new_cst(ts_node, node_text)) + Some(LocatedNode::new_cst(ts_node, node_text, lc_idx)) } VertexKind::Phi => Some(LocatedNode::new_phi(vid.internal_id())), VertexKind::Invalid => None, diff --git a/crates/static-analysis-kernel/src/analysis/ddsa_lib/runtime.rs b/crates/static-analysis-kernel/src/analysis/ddsa_lib/runtime.rs index 1c6a75a01..ecbacda0d 100644 --- a/crates/static-analysis-kernel/src/analysis/ddsa_lib/runtime.rs +++ b/crates/static-analysis-kernel/src/analysis/ddsa_lib/runtime.rs @@ -15,6 +15,7 @@ use crate::analysis::ddsa_lib::resource_watchdog::V8ResourceWatchdog; use crate::model::common::Language; use crate::model::rule::RuleInternal; use crate::model::violation; +use common::utils::position_utils::LineColumnIndex; use deno_core::v8; use deno_core::v8::HandleScope; use std::cell::RefCell; @@ -263,10 +264,11 @@ impl JsRuntime { // Add any rule arguments ctx_bridge.set_rule_arguments(scope, rule_arguments); - // Push the query matches: + // Push the query matches, using UTF-16 column conversion for every node. + let idx = LineColumnIndex::new(source_text.as_ref()); let mut ts_node_bridge = self.bridge_ts_node.borrow_mut(); self.bridge_query_match - .set_data(scope, query_matches, &mut ts_node_bridge); + .set_data(scope, query_matches, &mut ts_node_bridge, &idx); } // We use a bridge to pull violations, so we can ignore the return value with a noop handler. diff --git a/crates/static-analysis-kernel/src/analysis/ddsa_lib/test_utils.rs b/crates/static-analysis-kernel/src/analysis/ddsa_lib/test_utils.rs index 821065f78..ac9d71db1 100644 --- a/crates/static-analysis-kernel/src/analysis/ddsa_lib/test_utils.rs +++ b/crates/static-analysis-kernel/src/analysis/ddsa_lib/test_utils.rs @@ -395,6 +395,11 @@ impl TsTree { Arc::clone(&self.tree) } + /// Returns the full source text. + pub fn source(&self) -> &str { + &self.text + } + /// Returns the text for the provided node. pub fn text(&self, node: tree_sitter::Node) -> &str { node.utf8_text(self.text.as_bytes()).unwrap() diff --git a/crates/static-analysis-kernel/src/analysis/tree_sitter.rs b/crates/static-analysis-kernel/src/analysis/tree_sitter.rs index b9fd80ba2..16865a810 100644 --- a/crates/static-analysis-kernel/src/analysis/tree_sitter.rs +++ b/crates/static-analysis-kernel/src/analysis/tree_sitter.rs @@ -1,6 +1,7 @@ use crate::model::analysis::{MatchNode, MatchNodeContext, TreeSitterNode}; use crate::model::common::Language; use common::model::position::Position; +use common::utils::position_utils::LineColumnIndex; use indexmap::IndexMap; use std::collections::HashMap; use std::sync::Arc; @@ -297,17 +298,20 @@ pub fn get_query_nodes( ) -> Vec { let mut match_nodes: Vec = vec![]; + let idx = LineColumnIndex::new(code); + for query_match in query.cursor().matches(tree.root_node(), code, None) { let mut captures: HashMap = HashMap::new(); let mut captures_list: HashMap> = HashMap::new(); for capture in query_match { let list = match capture.contents { TSCaptureContent::Single(node) => { - map_node(node).map(|n| vec![n]).unwrap_or_default() - } - TSCaptureContent::Multi(nodes) => { - nodes.into_iter().filter_map(map_node).collect::>() + map_node(node, &idx).map(|n| vec![n]).unwrap_or_default() } + TSCaptureContent::Multi(nodes) => nodes + .into_iter() + .filter_map(|n| map_node(n, &idx)) + .collect::>(), }; // All captures are inserted into `captures_list`. However, the prior implementation continually // called `insert` on the `captures` map, which ended up re-writing the value every time. @@ -336,10 +340,11 @@ pub fn get_query_nodes( // map a node from the tree-sitter representation into our own internal representation // this is the representation that is passed to the JavaScript layer and how we represent // or expose the node to the end-user. -pub fn map_node(node: tree_sitter::Node) -> Option { +pub fn map_node(node: tree_sitter::Node, idx: &LineColumnIndex) -> Option { fn map_node_internal( cursor: &mut tree_sitter::TreeCursor, only_named_node: bool, + idx: &LineColumnIndex, ) -> Option { // we do not map space, parenthesis and other non-named nodes if there // when `only_named_node` is true (which is `true` for children only). @@ -352,7 +357,7 @@ pub fn map_node(node: tree_sitter::Node) -> Option { if cursor.goto_first_child() { loop { // For the child, we only want to capture named nodes to avoid polluting the AST. - let maybe_child = map_node_internal(cursor, true); + let maybe_child = map_node_internal(cursor, true, idx); if let Some(child) = maybe_child { children.push(child); } @@ -363,16 +368,23 @@ pub fn map_node(node: tree_sitter::Node) -> Option { cursor.goto_parent(); } + let start_point = cursor.node().range().start_point; + let end_point = cursor.node().range().end_point; + // finally, build the return value. let ts_node = TreeSitterNode { ast_type: cursor.node().kind().to_string(), start: Position { - line: u32::try_from(cursor.node().range().start_point.row + 1).unwrap(), - col: u32::try_from(cursor.node().range().start_point.column + 1).unwrap(), + line: u32::try_from(start_point.row + 1).unwrap(), + col: idx + .byte_col_to_utf16_col(start_point.row, start_point.column) + .unwrap_or(start_point.column as u32 + 1), }, end: Position { - line: u32::try_from(cursor.node().range().end_point.row + 1).unwrap(), - col: u32::try_from(cursor.node().range().end_point.column + 1).unwrap(), + line: u32::try_from(end_point.row + 1).unwrap(), + col: idx + .byte_col_to_utf16_col(end_point.row, end_point.column) + .unwrap_or(end_point.column as u32 + 1), }, field_name: cursor.field_name().map(ToString::to_string), children, @@ -385,7 +397,7 @@ pub fn map_node(node: tree_sitter::Node) -> Option { // Initially, we capture both un/named nodes to allow capturing unnamed node from // the tree-sitter query. - map_node_internal(&mut ts_cursor, false) + map_node_internal(&mut ts_cursor, false, idx) } #[cfg(test)] @@ -413,7 +425,8 @@ def func(): pass;"#; let t = get_tree(source_code, &Language::Python); assert!(t.is_some()); - let tree_node = map_node(t.unwrap().root_node()); + let idx = LineColumnIndex::new(source_code); + let tree_node = map_node(t.unwrap().root_node(), &idx); assert!(tree_node.is_some()); let root = tree_node.unwrap(); assert_eq!(2, root.children.len()); diff --git a/crates/static-analysis-server/src/tree_sitter_tree.rs b/crates/static-analysis-server/src/tree_sitter_tree.rs index 1295c9475..c0d34d7a0 100644 --- a/crates/static-analysis-server/src/tree_sitter_tree.rs +++ b/crates/static-analysis-server/src/tree_sitter_tree.rs @@ -2,6 +2,7 @@ use crate::constants::{ERROR_CODE_NOT_BASE64, ERROR_CODE_NO_ROOT_NODE}; use crate::model::tree_sitter_tree_node::ServerTreeSitterNode; use crate::model::tree_sitter_tree_request::TreeSitterRequest; use crate::model::tree_sitter_tree_response::TreeSitterResponse; +use common::utils::position_utils::LineColumnIndex; use kernel::analysis::tree_sitter::{get_tree, map_node}; use kernel::utils::decode_base64_string; @@ -38,9 +39,10 @@ pub fn process_tree_sitter_tree_request(request: TreeSitterRequest) -> TreeSitte return no_root_node; } - if let Some(result) = - result.map(|tree| map_node(tree.root_node()).map(ServerTreeSitterNode::from)) - { + if let Some(result) = result.map(|tree| { + let idx = LineColumnIndex::new(&decoded); + map_node(tree.root_node(), &idx).map(ServerTreeSitterNode::from) + }) { tracing::info!("Successfully completed tree-sitter AST generation"); TreeSitterResponse { result,