PerryTS · proggeramlug · Jun 10, 2026 · Jun 10, 2026 · Jun 10, 2026
diff --git a/crates/perry-runtime/src/regex/grammar.rs b/crates/perry-runtime/src/regex/grammar.rs
@@ -372,6 +372,58 @@ fn fold_surrogate_pairs(pattern: &str) -> String {
     out
 }
 
+/// Parse a `\p{...}` / `\P{...}` Unicode property escape starting at `chars[i]`
+/// (which must be the backslash, with `chars[i+1]` a `p`/`P` and `chars[i+2]` a
+/// `{`). Returns `(property_value, negated, end)` where `end` is the index just
+/// past the closing `}` and `property_value` is the lowercased value with any
+/// `gc=` / `general_category=` prefix stripped and `_`/spaces removed (loose
+/// matching). Returns `None` if the brace form is malformed.
+fn parse_unicode_property(chars: &[char], i: usize) -> Option<(String, bool, usize)> {
+    let negated = match chars.get(i + 1) {
+        Some('p') => false,
+        Some('P') => true,
+        _ => return None,
+    };
+    if chars.get(i + 2) != Some(&'{') {
+        return None;
+    }
+    let mut k = i + 3;
+    let mut body = String::new();
+    while let Some(&c) = chars.get(k) {
+        if c == '}' {
+            break;
+        }
+        body.push(c);
+        k += 1;
+    }
+    if chars.get(k) != Some(&'}') {
+        return None;
+    }
+    let lower = body.to_ascii_lowercase();
+    // Strip a `gc=` / `general_category=` prefix; leave other `key=value`
+    // properties (e.g. `script=greek`) intact so they pass through unchanged.
+    let value = match lower.split_once('=') {
+        Some((key, val))
+            if matches!(
+                key.trim().replace(['_', ' '], "").as_str(),
+                "gc" | "generalcategory"
+            ) =>
+        {
+            val.to_string()
+        }
+        _ => lower,
+    };
+    Some((value.trim().replace(['_', ' '], ""), negated, k + 1))
+}
+
+/// `\p{Surrogate}` / `\p{gc=Cs}` — the only general category consisting entirely
+/// of UTF-16 surrogate code points (U+D800..=U+DFFF). The Rust `regex` crate
+/// matches over Unicode *scalar values*, which exclude surrogates, so it rejects
+/// this property outright instead of treating it as never-matching.
+fn is_surrogate_property(value: &str) -> bool {
+    value == "surrogate" || value == "cs"
+}
+
 /// Translate a JavaScript regex pattern to a Rust regex-crate compatible pattern.
 /// Handles JS-specific escape sequences not supported by the Rust regex crate.
 /// Also converts JS-style named groups `(?<name>...)` to Rust-style `(?P<name>...)`.
@@ -419,6 +471,38 @@ pub(super) fn js_regex_to_rust(pattern: &str) -> String {
                         i += 1 + digits;
                     }
                 }
+                'p' | 'P' if chars.get(i + 2) == Some(&'{') => {
+                    // `\p{Surrogate}` / `\p{gc=Cs}` (and the `\P{...}` negation)
+                    // name the UTF-16 surrogate code points, which can't occur in
+                    // the Unicode *scalar values* the Rust `regex` crate matches
+                    // over — so the crate rejects them as `invalid pattern`. Treat
+                    // the positive form as a never-matching class and the negated
+                    // form as "any scalar value". `string-width@7+` builds two
+                    // module-top-level regexes that include `\p{Surrogate}`, so
+                    // without this rewrite importing it (→ ink) throws at init.
+                    // All other properties pass through to the crate unchanged.
+                    match parse_unicode_property(&chars, i) {
+                        Some((value, negated, end)) if is_surrogate_property(&value) => {
+                            if in_class {
+                                // A never-matching member contributes nothing to a
+                                // class union; the negation matches every scalar.
+                                if negated {
+                                    result.push_str("\\s\\S");
+                                }
+                            } else if negated {
+                                result.push_str("[\\s\\S]");
+                            } else {
+                                result.push_str("[^\\s\\S]");
+                            }
+                            i = end;
+                        }
+                        _ => {
+                            result.push('\\');
+                            result.push(chars[i + 1]);
+                            i += 2;
+                        }
+                    }
+                }
                 ch if is_regex_identity_escape(ch) => {
                     // Inside a character class an escaped hyphen `\-` is always a
                     // literal hyphen, but the Rust `regex` crate reads a bare `-`
@@ -497,3 +581,49 @@ pub(super) fn js_regex_to_rust(pattern: &str) -> String {
     }
     result
 }
+
+#[cfg(test)]
+mod tests {
+    use super::js_regex_to_rust;
+
+    #[test]
+    fn surrogate_property_rewrites_to_never_match() {
+        // #4884: the Rust `regex` crate matches Unicode scalar values, which
+        // exclude surrogate code points, so it rejects `\p{Surrogate}` outright.
+        // The positive form is rewritten to a never-matching class and the
+        // negation to "any scalar value".
+        assert_eq!(js_regex_to_rust(r"\p{Surrogate}"), r"[^\s\S]");
+        assert_eq!(js_regex_to_rust(r"\P{Surrogate}"), r"[\s\S]");
+        // The `gc=Cs` / `General_Category=Surrogate` spellings normalize the same.
+        assert_eq!(js_regex_to_rust(r"\p{gc=Cs}"), r"[^\s\S]");
+        assert_eq!(
+            js_regex_to_rust(r"\p{General_Category=Surrogate}"),
+            r"[^\s\S]"
+        );
+        // Inside a class the positive form drops (a never-matching member adds
+        // nothing to the union); the negation contributes "any scalar value".
+        assert_eq!(
+            js_regex_to_rust(r"[\p{Control}\p{Surrogate}]"),
+            r"[\p{Control}]"
+        );
+        assert_eq!(js_regex_to_rust(r"[\P{Surrogate}]"), r"[\s\S]");
+        // Every other property passes through to the crate unchanged.
+        assert_eq!(js_regex_to_rust(r"\p{Control}"), r"\p{Control}");
+        assert_eq!(js_regex_to_rust(r"\p{Script=Greek}"), r"\p{Script=Greek}");
+        assert_eq!(js_regex_to_rust(r"\pL"), r"\pL");
+
+        // The two `string-width@7+` module-top-level regexes (→ ink, #348) that
+        // threw `SyntaxError: invalid pattern` at import must now compile under
+        // the Rust `regex` crate.
+        for pat in [
+            r"^(?:\p{Default_Ignorable_Code_Point}|\p{Control}|\p{Format}|\p{Mark}|\p{Surrogate})+$",
+            r"^[\p{Default_Ignorable_Code_Point}\p{Control}\p{Format}\p{Mark}\p{Surrogate}]+",
+        ] {
+            let translated = js_regex_to_rust(pat);
+            assert!(
+                regex::Regex::new(&translated).is_ok(),
+                "string-width pattern failed to compile: {pat} -> {translated}"
+            );
+        }
+    }
+}