DataDog · robertohuertasm-datadog · May 20, 2026 · May 17, 2026 · May 17, 2026 · May 17, 2026
@@ -22,7 +22,33 @@ const parseFile = (filepath) => {
       const current = {};
       current.message = i.message;
       current.ruleId = i.ruleId;
-      current.physicalLocation = i.locations[0].physicalLocation;
+      // ------------------------------------------------------------------
+      // IMPORTANT (TEMPORARY): strip `startColumn` / `endColumn` from the region.
+      //
+      // PR #914 changes the kernel from emitting 1-based UTF-8 byte columns to
+      // 1-based UTF-16 code-unit columns (matching LSP / VS Code / SARIF v2.1).
+      //
+      // While that PR is in review, `main` still emits byte columns and the
+      // feature branch emits UTF-16 columns. The regression check compares
+      // results as JSON strings, so it would flag every violation whose start
+      // or end column falls after a non-ASCII character on its line as a
+      // "removed + added" pair — same file, same rule, same line; only the
+      // column number drifts by N (where N is the UTF-8 byte length minus the
+      // UTF-16 code-unit length of the text preceding the position on that
+      // line, e.g. 2 for a single 3-byte CJK character).
+      //
+      // To unblock CI for #914 we compare by
+      //     (file, ruleId, message, startLine, endLine)
+      // and intentionally ignore columns. This is a one-shot loosening for
+      // the byte→UTF-16 transition. A stacked follow-up PR will restore
+      // `startColumn` / `endColumn` to the comparison key once #914 lands on
+      // `main` (at which point both runs are on UTF-16 columns again and
+      // column-level regression detection becomes meaningful again).
+      // ------------------------------------------------------------------
+      const { startColumn: _startCol, endColumn: _endCol, ...regionWithoutColumns } =
+        i.locations[0].physicalLocation.region;
+      current.physicalLocation = {
+        ...i.locations[0].physicalLocation,
+        region: regionWithoutColumns,
+      };
       results.push(current);
     }
 
@@ -55,13 +81,16 @@ const main = async () => {
     let table1 = [];
 
     if (count1 > 0 || count2 > 0) {
+      // Location display only shows `startLine-endLine`. Columns are
+      // intentionally omitted to match the comparison key (see `parseFile`
+      // above for the full rationale on the byte→UTF-16 transition).
       for (const item of diff1) {
         const json = JSON.parse(item);
         table1.push([
           { data: json.physicalLocation.artifactLocation.uri },
           { data: json.message.text },
           { data: json.ruleId },
-          { data: `${json.physicalLocation.region.startLine}:${json.physicalLocation.region.startColumn}-${json.physicalLocation.region.endLine}:${json.physicalLocation.region.endColumn}` },
+          { data: `${json.physicalLocation.region.startLine}-${json.physicalLocation.region.endLine}` },
           { data: dupes1[json.ruleId] },
         ]);
       }
@@ -74,7 +103,7 @@ const main = async () => {
           { data: json.physicalLocation.artifactLocation.uri },
           { data: json.message.text },
           { data: json.ruleId },
-          { data: `${json.physicalLocation.region.startLine}:${json.physicalLocation.region.startColumn}-${json.physicalLocation.region.endLine}:${json.physicalLocation.region.endColumn}` },
+          { data: `${json.physicalLocation.region.startLine}-${json.physicalLocation.region.endLine}` },
           { data: dupes2[json.ruleId] },
         ]);
       }

@@ -12,6 +12,7 @@ graphviz-rust,https://github.com/besok/graphviz-rust,MIT,Copyright (c) 2013 Bori
 indexmap,https://github.com/indexmap-rs/indexmap,MIT and Apache-2.0,Copyright (c) 2016-2017 bluss
 indicatif,https://crates.io/crates/indicatif,MIT,Copyright (c) 2017 Armin Ronacher <armin.ronacher@active-4.com>
 itertools,https://github.com/rust-itertools/itertools,MIT,Copyright 2015 itertools Developers
+line-index,https://github.com/rust-lang/rust-analyzer,MIT OR Apache-2.0,Copyright (c) 2019 rust-analyzer developers
 num_cpus,https://github.com/seanmonstar/num_cpus,MIT, Copyright (c) 2015 Sean McArthur
 percent-encoding,https://github.com/servo/rust-url/,MIT, Copyright (c) 2013-2022 The rust-url developers
 prettytable-rs,https://github.com/phsym/prettytable-rs/,BSD-3-Clause,Copyright (c) 2022 Pierre-Henri Symoneaux

@@ -9,4 +9,5 @@ serde = { version = "1.0.219", features = ["derive"] }
 derive_builder = { workspace = true }
 
 # other
-bstr = "1.12.0"
+bstr = "1.12.0"
+line-index = "0.1.2"
@@ -6,6 +6,18 @@ use derive_builder::Builder;
 use serde::{Deserialize, Serialize};
 use std::fmt;
 
+/// A source-code position.
+///
+/// * `line` — 1-based line number.
+/// * `col` — 1-based column number.  The unit depends on the producer:
+///   - **Static-analysis kernel** (tree-sitter path): UTF-16 code-unit column.  Matches LSP,
+///     VS Code, and the SARIF v2.1 default encoding.  On ASCII-only lines, one UTF-16 code unit
+///     equals one byte, so the value is identical to a byte-column.  For non-ASCII characters the
+///     count is in UTF-16 code units: a BMP character (e.g. a CJK ideograph or a combining mark)
+///     counts as one unit, while a supplementary-plane character (e.g. most emoji) counts as two
+///     (a surrogate pair).
+///   - **Secrets scanner** (`get_position_in_string`): Unicode-grapheme-cluster column.  A single
+///     user-perceived character (including emoji and composed sequences) counts as one unit
+///     regardless of its UTF-8 or UTF-16 size.
 #[derive(Deserialize, Debug, Serialize, Clone, Copy, Builder, PartialEq, Eq, Hash)]
 pub struct Position {
     pub line: u32,

@@ -1,6 +1,43 @@
 use crate::model::position::Position;
 use bstr::BStr;
 use bstr::ByteSlice;
+use line_index::{LineCol, LineIndex, WideEncoding};
+
+/// Precomputed per-line index for fast repeated UTF-8 byte-column → UTF-16 code-unit column
+/// conversion.
+///
+/// Build once per source string with [`LineColumnIndex::new`], then call
+/// [`byte_col_to_utf16_col`](LineColumnIndex::byte_col_to_utf16_col) for every tree-sitter node
+/// on that source. Backed by [`line_index::LineIndex`] from the rust-analyzer project.
+///
+/// ## Line model
+///
+/// [`line_index::LineIndex`] splits on `\n` only, which mirrors tree-sitter's line model exactly.
+/// Tree-sitter does not treat a bare `\r` (classic Mac OS) as a line terminator; for Windows
+/// `\r\n` files the `\r` is counted as part of the column on the same line, matching
+/// tree-sitter's `Point.column` values. Using a broader splitter (e.g. Unicode line endings)
+/// would diverge from tree-sitter and produce wrong UTF-16 columns.
+#[derive(Debug)]
+pub struct LineColumnIndex(LineIndex);
+
+impl LineColumnIndex {
+    /// Builds the index by scanning `source`.
+    pub fn new(source: &str) -> Self {
+        Self(LineIndex::new(source))
+    }
+
+    /// Converts a tree-sitter 0-based `(row, byte_col)` point to a 1-based UTF-16 code-unit
+    /// column.
+    ///
+    /// Returns `None` if `row` or `byte_col` does not fit in a `u32`, if
+    /// [`LineIndex::to_wide`] cannot convert the position, or if the 1-based result would
+    /// overflow a `u32`.
+    pub fn byte_col_to_utf16_col(&self, row: usize, byte_col: usize) -> Option<u32> {
+        // Checked conversions: a plain `as u32` cast would silently wrap for values above
+        // `u32::MAX` and yield a column for an unrelated position instead of `None`.
+        let lc = LineCol {
+            line: u32::try_from(row).ok()?,
+            col: u32::try_from(byte_col).ok()?,
+        };
+        let wide = self.0.to_wide(WideEncoding::Utf16, lc)?;
+        // `to_wide` yields a 0-based column; shift to 1-based without risking overflow.
+        wide.col.checked_add(1)
+    }
+}
 
 /// Get position of an offset in a code and return a [Position].
 pub fn get_position_in_string(content: &str, offset: usize) -> anyhow::Result<Position> {
@@ -137,4 +174,16 @@ mod tests {
             Position::new(3, 13)
         );
     }
+
+    #[test]
+    fn byte_col_to_utf16_col_calls_to_wide_utf16() {
+        // Single source string with one non-ASCII char. We only need to prove our wrapper
+        // delegates to line_index::LineIndex::to_wide(WideEncoding::Utf16, ..) and adds 1.
+        // Exhaustive encoding cases live in the line-index crate's own test suite.
+        // Source text is `a 日 b` (without the spaces): 'a' at byte 0, 日 at bytes 1..=3, 'b' at byte 4.
+        let idx = LineColumnIndex::new("a\u{65E5}b");
+        // byte 0 ('a') → 0 UTF-16 units before, +1 = 1
+        assert_eq!(idx.byte_col_to_utf16_col(0, 0).unwrap(), 1);
+        // byte 4 ('b' after 日 which is 3 UTF-8 bytes / 1 UTF-16 unit) → 2 UTF-16 units before, +1 = 3
+        assert_eq!(idx.byte_col_to_utf16_col(0, 4).unwrap(), 3);
+    }
 }
@@ -7,6 +7,7 @@ use crate::analysis::ddsa_lib::common::{Class, DDSAJsRuntimeError, NodeId, Stell
 use crate::analysis::ddsa_lib::js;
 use crate::analysis::ddsa_lib::v8_ds::MirroredVec;
 use crate::analysis::tree_sitter::QueryMatch;
+use common::utils::position_utils::LineColumnIndex;
 use deno_core::v8;
 use deno_core::v8::HandleScope;
 
@@ -40,6 +41,7 @@ impl QueryMatchBridge {
         scope: &mut HandleScope,
         matches: impl Into<Vec<QueryMatch<tree_sitter::Node<'tree>>>>,
         node_bridge: &mut TsNodeBridge,
+        idx: &LineColumnIndex,
     ) {
         let matches = matches.into();
         // Pass each node in via the bridge (assigning it an id), and use this id to transform
@@ -49,7 +51,7 @@ impl QueryMatchBridge {
             .map(|q_match| {
                 q_match
                     .into_iter()
-                    .map(|capture| node_bridge.insert_capture(scope, capture))
+                    .map(|capture| node_bridge.insert_capture(scope, capture, idx))
                     .collect::<Vec<_>>()
                     .into()
             })
@@ -89,6 +91,7 @@ mod tests {
     use crate::analysis::ddsa_lib::v8_ds::MirroredVec;
     use crate::analysis::tree_sitter::{get_tree, QueryMatch, TSCaptureContent, TSQuery};
     use crate::model::common::Language;
+    use common::utils::position_utils::LineColumnIndex;
     use deno_core::JsRuntime;
 
     fn setup_bridge() -> (JsRuntime, QueryMatchBridge, TsNodeBridge) {
@@ -125,9 +128,10 @@ const ghi = 'hello' + ' world';
 ";
         let query = TSQuery::try_new(&tree.language(), query).unwrap();
         let matches = query.cursor().matches(tree.root_node(), text, None);
+        let idx = LineColumnIndex::new(text);
         assert!(query_match_bridge.is_empty());
         assert!(ts_node_bridge.is_empty());
-        query_match_bridge.set_data(scope, matches.clone(), &mut ts_node_bridge);
+        query_match_bridge.set_data(scope, matches.clone(), &mut ts_node_bridge, &idx);
         assert_eq!(query_match_bridge.len(), 3);
         assert_eq!(ts_node_bridge.len(), 3);
 
@@ -139,16 +143,17 @@ const ghi = 'hello' + ' world';
         }
 
         // The `QueryMatchBridge` doesn't clear nodes from `TsNodeBridge` when values change.
-        query_match_bridge.set_data(scope, &matches[0..2], &mut ts_node_bridge);
+        query_match_bridge.set_data(scope, &matches[0..2], &mut ts_node_bridge, &idx);
         assert_eq!(query_match_bridge.len(), 2);
         assert_eq!(ts_node_bridge.len(), 3);
-        let text = "\
+        let text2 = "\
 // Arbitrary JavaScript that contains `identifier` CST nodes
 const alpha = 'bravo';
 ";
-        let tree = get_tree(text, &Language::JavaScript).unwrap();
-        let matches = query.cursor().matches(tree.root_node(), text, None);
-        query_match_bridge.set_data(scope, matches, &mut ts_node_bridge);
+        let tree2 = get_tree(text2, &Language::JavaScript).unwrap();
+        let matches2 = query.cursor().matches(tree2.root_node(), text2, None);
+        let idx2 = LineColumnIndex::new(text2);
+        query_match_bridge.set_data(scope, matches2, &mut ts_node_bridge, &idx2);
         assert_eq!(get_node_id_at_idx(&query_match_bridge, 0), 3);
         assert_eq!(ts_node_bridge.len(), 4);
     }