From 2a03f5846d1e812b38cca2246f0e0f9616a38e1e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ralph=20K=C3=BCpper?=
Date: Wed, 10 Jun 2026 10:08:49 +0200
Subject: [PATCH 1/2] fix(regex): rewrite \p{Surrogate} to a never-matching class (#4884)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Rust `regex` crate matches over Unicode scalar values, which exclude
surrogate code points (U+D800..=U+DFFF), so it rejects `\p{Surrogate}` /
`\p{gc=Cs}` outright with `invalid pattern` instead of treating it as a
never-matching class. Every other property in the family compiles fine.

`string-width@7+` builds two module-top-level regexes that include
`\p{Surrogate}`, so importing it (→ wrap-ansi / cli-truncate / slice-ansi →
ink, #348) threw `SyntaxError` at module init before any user code ran — the
next ink wall after #4877 (Intl.Segmenter).

`js_regex_to_rust` now intercepts the `\p{...}` / `\P{...}` brace form,
loosely normalizes the property value (stripping `gc=` / `general_category=`
and `_`/spaces), and for the surrogate category rewrites:

- positive, outside a class → `[^\s\S]` (never matches)
- negated, outside a class → `[\s\S]` (any scalar value)
- positive, inside a class → dropped (a never-matching union member)
- negated, inside a class → `\s\S`

All other properties pass through to the crate unchanged. Since valid input
carries no surrogate scalar values, this is behavior-preserving and matches
Node byte-for-byte on the string-width width/zero-width predicates.

Unit test in regex.rs covers the rewrite spellings and constructs both
string-width patterns.
--- crates/perry-runtime/src/regex.rs | 40 +++++++++++ crates/perry-runtime/src/regex/grammar.rs | 84 +++++++++++++++++++++++ 2 files changed, 124 insertions(+) diff --git a/crates/perry-runtime/src/regex.rs b/crates/perry-runtime/src/regex.rs index 1f9ef4b658..14d79d2769 100644 --- a/crates/perry-runtime/src/regex.rs +++ b/crates/perry-runtime/src/regex.rs @@ -1964,4 +1964,44 @@ mod tests { let re = js_regexp_new(make_string(pat), flags); assert!(!re.is_null(), "ID_Start-shaped pattern failed to construct"); } + + #[test] + fn surrogate_property_rewrites_to_never_match() { + // #4884: the Rust `regex` crate matches Unicode scalar values, which + // exclude surrogate code points, so it rejects `\p{Surrogate}` outright. + // The positive form is rewritten to a never-matching class and the + // negation to "any scalar value". + assert_eq!(js_regex_to_rust(r"\p{Surrogate}"), r"[^\s\S]"); + assert_eq!(js_regex_to_rust(r"\P{Surrogate}"), r"[\s\S]"); + // The `gc=Cs` / `General_Category=Surrogate` spellings normalize the same. + assert_eq!(js_regex_to_rust(r"\p{gc=Cs}"), r"[^\s\S]"); + assert_eq!( + js_regex_to_rust(r"\p{General_Category=Surrogate}"), + r"[^\s\S]" + ); + // Inside a class the positive form drops (a never-matching member adds + // nothing to the union); the negation contributes "any scalar value". + assert_eq!( + js_regex_to_rust(r"[\p{Control}\p{Surrogate}]"), + r"[\p{Control}]" + ); + assert_eq!(js_regex_to_rust(r"[\P{Surrogate}]"), r"[\s\S]"); + // Every other property passes through to the crate unchanged. + assert_eq!(js_regex_to_rust(r"\p{Control}"), r"\p{Control}"); + assert_eq!(js_regex_to_rust(r"\p{Script=Greek}"), r"\p{Script=Greek}"); + assert_eq!(js_regex_to_rust(r"\pL"), r"\pL"); + + // The two `string-width@7+` module-top-level regexes (→ ink, #348) that + // threw `SyntaxError: invalid pattern` at import must now construct. 
+ for pat in [ + r"^(?:\p{Default_Ignorable_Code_Point}|\p{Control}|\p{Format}|\p{Mark}|\p{Surrogate})+$", + r"^[\p{Default_Ignorable_Code_Point}\p{Control}\p{Format}\p{Mark}\p{Surrogate}]+", + ] { + let re = js_regexp_new(make_string(pat), make_string("v")); + assert!( + !re.is_null(), + "string-width pattern failed to construct: {pat}" + ); + } + } } diff --git a/crates/perry-runtime/src/regex/grammar.rs b/crates/perry-runtime/src/regex/grammar.rs index feb9b3d9b9..24d5bfa749 100644 --- a/crates/perry-runtime/src/regex/grammar.rs +++ b/crates/perry-runtime/src/regex/grammar.rs @@ -372,6 +372,58 @@ fn fold_surrogate_pairs(pattern: &str) -> String { out } +/// Parse a `\p{...}` / `\P{...}` Unicode property escape starting at `chars[i]` +/// (which must be the backslash, with `chars[i+1]` a `p`/`P` and `chars[i+2]` a +/// `{`). Returns `(property_value, negated, end)` where `end` is the index just +/// past the closing `}` and `property_value` is the lowercased value with any +/// `gc=` / `general_category=` prefix stripped and `_`/spaces removed (loose +/// matching). Returns `None` if the brace form is malformed. +fn parse_unicode_property(chars: &[char], i: usize) -> Option<(String, bool, usize)> { + let negated = match chars.get(i + 1) { + Some('p') => false, + Some('P') => true, + _ => return None, + }; + if chars.get(i + 2) != Some(&'{') { + return None; + } + let mut k = i + 3; + let mut body = String::new(); + while let Some(&c) = chars.get(k) { + if c == '}' { + break; + } + body.push(c); + k += 1; + } + if chars.get(k) != Some(&'}') { + return None; + } + let lower = body.to_ascii_lowercase(); + // Strip a `gc=` / `general_category=` prefix; leave other `key=value` + // properties (e.g. `script=greek`) intact so they pass through unchanged. 
+ let value = match lower.split_once('=') { + Some((key, val)) + if matches!( + key.trim().replace(['_', ' '], "").as_str(), + "gc" | "generalcategory" + ) => + { + val.to_string() + } + _ => lower, + }; + Some((value.trim().replace(['_', ' '], ""), negated, k + 1)) +} + +/// `\p{Surrogate}` / `\p{gc=Cs}` — the only general category consisting entirely +/// of UTF-16 surrogate code points (U+D800..=U+DFFF). The Rust `regex` crate +/// matches over Unicode *scalar values*, which exclude surrogates, so it rejects +/// this property outright instead of treating it as never-matching. +fn is_surrogate_property(value: &str) -> bool { + value == "surrogate" || value == "cs" +} + /// Translate a JavaScript regex pattern to a Rust regex-crate compatible pattern. /// Handles JS-specific escape sequences not supported by the Rust regex crate. /// Also converts JS-style named groups `(?...)` to Rust-style `(?P...)`. @@ -419,6 +471,38 @@ pub(super) fn js_regex_to_rust(pattern: &str) -> String { i += 1 + digits; } } + 'p' | 'P' if chars.get(i + 2) == Some(&'{') => { + // `\p{Surrogate}` / `\p{gc=Cs}` (and the `\P{...}` negation) + // name the UTF-16 surrogate code points, which can't occur in + // the Unicode *scalar values* the Rust `regex` crate matches + // over — so the crate rejects them as `invalid pattern`. Treat + // the positive form as a never-matching class and the negated + // form as "any scalar value". `string-width@7+` builds two + // module-top-level regexes that include `\p{Surrogate}`, so + // without this rewrite importing it (→ ink) throws at init. + // All other properties pass through to the crate unchanged. + match parse_unicode_property(&chars, i) { + Some((value, negated, end)) if is_surrogate_property(&value) => { + if in_class { + // A never-matching member contributes nothing to a + // class union; the negation matches every scalar. 
+ if negated { + result.push_str("\\s\\S"); + } + } else if negated { + result.push_str("[\\s\\S]"); + } else { + result.push_str("[^\\s\\S]"); + } + i = end; + } + _ => { + result.push('\\'); + result.push(chars[i + 1]); + i += 2; + } + } + } ch if is_regex_identity_escape(ch) => { // Inside a character class an escaped hyphen `\-` is always a // literal hyphen, but the Rust `regex` crate reads a bare `-` From 06490d3ef6213abc48c9233c238829151b98495b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ralph=20K=C3=BCpper?= Date: Wed, 10 Jun 2026 10:23:16 +0200 Subject: [PATCH 2/2] test(regex): move #4884 surrogate test to grammar.rs (file-size gate) regex.rs hit the 2000-line lint gate (2007). Relocate the new surrogate-property test into grammar.rs alongside the js_regex_to_rust logic it exercises, validating compilation via regex::Regex::new directly instead of js_regexp_new. regex.rs back to 1967 lines. --- crates/perry-runtime/src/regex.rs | 40 -------------------- crates/perry-runtime/src/regex/grammar.rs | 46 +++++++++++++++++++++++ 2 files changed, 46 insertions(+), 40 deletions(-) diff --git a/crates/perry-runtime/src/regex.rs b/crates/perry-runtime/src/regex.rs index 14d79d2769..1f9ef4b658 100644 --- a/crates/perry-runtime/src/regex.rs +++ b/crates/perry-runtime/src/regex.rs @@ -1964,44 +1964,4 @@ mod tests { let re = js_regexp_new(make_string(pat), flags); assert!(!re.is_null(), "ID_Start-shaped pattern failed to construct"); } - - #[test] - fn surrogate_property_rewrites_to_never_match() { - // #4884: the Rust `regex` crate matches Unicode scalar values, which - // exclude surrogate code points, so it rejects `\p{Surrogate}` outright. - // The positive form is rewritten to a never-matching class and the - // negation to "any scalar value". - assert_eq!(js_regex_to_rust(r"\p{Surrogate}"), r"[^\s\S]"); - assert_eq!(js_regex_to_rust(r"\P{Surrogate}"), r"[\s\S]"); - // The `gc=Cs` / `General_Category=Surrogate` spellings normalize the same. 
- assert_eq!(js_regex_to_rust(r"\p{gc=Cs}"), r"[^\s\S]"); - assert_eq!( - js_regex_to_rust(r"\p{General_Category=Surrogate}"), - r"[^\s\S]" - ); - // Inside a class the positive form drops (a never-matching member adds - // nothing to the union); the negation contributes "any scalar value". - assert_eq!( - js_regex_to_rust(r"[\p{Control}\p{Surrogate}]"), - r"[\p{Control}]" - ); - assert_eq!(js_regex_to_rust(r"[\P{Surrogate}]"), r"[\s\S]"); - // Every other property passes through to the crate unchanged. - assert_eq!(js_regex_to_rust(r"\p{Control}"), r"\p{Control}"); - assert_eq!(js_regex_to_rust(r"\p{Script=Greek}"), r"\p{Script=Greek}"); - assert_eq!(js_regex_to_rust(r"\pL"), r"\pL"); - - // The two `string-width@7+` module-top-level regexes (→ ink, #348) that - // threw `SyntaxError: invalid pattern` at import must now construct. - for pat in [ - r"^(?:\p{Default_Ignorable_Code_Point}|\p{Control}|\p{Format}|\p{Mark}|\p{Surrogate})+$", - r"^[\p{Default_Ignorable_Code_Point}\p{Control}\p{Format}\p{Mark}\p{Surrogate}]+", - ] { - let re = js_regexp_new(make_string(pat), make_string("v")); - assert!( - !re.is_null(), - "string-width pattern failed to construct: {pat}" - ); - } - } } diff --git a/crates/perry-runtime/src/regex/grammar.rs b/crates/perry-runtime/src/regex/grammar.rs index 24d5bfa749..7f9a8f60b0 100644 --- a/crates/perry-runtime/src/regex/grammar.rs +++ b/crates/perry-runtime/src/regex/grammar.rs @@ -581,3 +581,49 @@ pub(super) fn js_regex_to_rust(pattern: &str) -> String { } result } + +#[cfg(test)] +mod tests { + use super::js_regex_to_rust; + + #[test] + fn surrogate_property_rewrites_to_never_match() { + // #4884: the Rust `regex` crate matches Unicode scalar values, which + // exclude surrogate code points, so it rejects `\p{Surrogate}` outright. + // The positive form is rewritten to a never-matching class and the + // negation to "any scalar value". 
+ assert_eq!(js_regex_to_rust(r"\p{Surrogate}"), r"[^\s\S]"); + assert_eq!(js_regex_to_rust(r"\P{Surrogate}"), r"[\s\S]"); + // The `gc=Cs` / `General_Category=Surrogate` spellings normalize the same. + assert_eq!(js_regex_to_rust(r"\p{gc=Cs}"), r"[^\s\S]"); + assert_eq!( + js_regex_to_rust(r"\p{General_Category=Surrogate}"), + r"[^\s\S]" + ); + // Inside a class the positive form drops (a never-matching member adds + // nothing to the union); the negation contributes "any scalar value". + assert_eq!( + js_regex_to_rust(r"[\p{Control}\p{Surrogate}]"), + r"[\p{Control}]" + ); + assert_eq!(js_regex_to_rust(r"[\P{Surrogate}]"), r"[\s\S]"); + // Every other property passes through to the crate unchanged. + assert_eq!(js_regex_to_rust(r"\p{Control}"), r"\p{Control}"); + assert_eq!(js_regex_to_rust(r"\p{Script=Greek}"), r"\p{Script=Greek}"); + assert_eq!(js_regex_to_rust(r"\pL"), r"\pL"); + + // The two `string-width@7+` module-top-level regexes (→ ink, #348) that + // threw `SyntaxError: invalid pattern` at import must now compile under + // the Rust `regex` crate. + for pat in [ + r"^(?:\p{Default_Ignorable_Code_Point}|\p{Control}|\p{Format}|\p{Mark}|\p{Surrogate})+$", + r"^[\p{Default_Ignorable_Code_Point}\p{Control}\p{Format}\p{Mark}\p{Surrogate}]+", + ] { + let translated = js_regex_to_rust(pat); + assert!( + regex::Regex::new(&translated).is_ok(), + "string-width pattern failed to compile: {pat} -> {translated}" + ); + } + } +}