Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
130 changes: 130 additions & 0 deletions crates/perry-runtime/src/regex/grammar.rs
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,58 @@ fn fold_surrogate_pairs(pattern: &str) -> String {
out
}

/// Parse a `\p{...}` / `\P{...}` Unicode property escape starting at `chars[i]`
/// (which must be the backslash, with `chars[i+1]` a `p`/`P` and `chars[i+2]` a
/// `{`). Returns `(property_value, negated, end)` where `end` is the index just
/// past the closing `}` and `property_value` is the lowercased value with any
/// `gc=` / `general_category=` prefix stripped and `_`/spaces removed (loose
/// matching). Returns `None` if the brace form is malformed.
fn parse_unicode_property(chars: &[char], i: usize) -> Option<(String, bool, usize)> {
let negated = match chars.get(i + 1) {
Some('p') => false,
Some('P') => true,
_ => return None,
};
if chars.get(i + 2) != Some(&'{') {
return None;
}
let mut k = i + 3;
let mut body = String::new();
while let Some(&c) = chars.get(k) {
if c == '}' {
break;
}
body.push(c);
k += 1;
}
if chars.get(k) != Some(&'}') {
return None;
}
let lower = body.to_ascii_lowercase();
// Strip a `gc=` / `general_category=` prefix; leave other `key=value`
// properties (e.g. `script=greek`) intact so they pass through unchanged.
let value = match lower.split_once('=') {
Some((key, val))
if matches!(
key.trim().replace(['_', ' '], "").as_str(),
"gc" | "generalcategory"
) =>
{
val.to_string()
}
_ => lower,
};
Some((value.trim().replace(['_', ' '], ""), negated, k + 1))
}

/// `\p{Surrogate}` / `\p{gc=Cs}` — the only general category consisting entirely
/// of UTF-16 surrogate code points (U+D800..=U+DFFF). The Rust `regex` crate
/// matches over Unicode *scalar values*, which exclude surrogates, so it rejects
/// this property outright instead of treating it as never-matching.
fn is_surrogate_property(value: &str) -> bool {
value == "surrogate" || value == "cs"
}

/// Translate a JavaScript regex pattern to a Rust regex-crate compatible pattern.
/// Handles JS-specific escape sequences not supported by the Rust regex crate.
/// Also converts JS-style named groups `(?<name>...)` to Rust-style `(?P<name>...)`.
Expand Down Expand Up @@ -419,6 +471,38 @@ pub(super) fn js_regex_to_rust(pattern: &str) -> String {
i += 1 + digits;
}
}
'p' | 'P' if chars.get(i + 2) == Some(&'{') => {
// `\p{Surrogate}` / `\p{gc=Cs}` (and the `\P{...}` negation)
// name the UTF-16 surrogate code points, which can't occur in
// the Unicode *scalar values* the Rust `regex` crate matches
// over — so the crate rejects them as `invalid pattern`. Treat
// the positive form as a never-matching class and the negated
// form as "any scalar value". `string-width@7+` builds two
// module-top-level regexes that include `\p{Surrogate}`, so
// without this rewrite importing it (→ ink) throws at init.
// All other properties pass through to the crate unchanged.
match parse_unicode_property(&chars, i) {
Some((value, negated, end)) if is_surrogate_property(&value) => {
if in_class {
// A never-matching member contributes nothing to a
// class union; the negation matches every scalar.
if negated {
result.push_str("\\s\\S");
}
} else if negated {
result.push_str("[\\s\\S]");
} else {
result.push_str("[^\\s\\S]");
}
i = end;
}
_ => {
result.push('\\');
result.push(chars[i + 1]);
i += 2;
}
}
}
ch if is_regex_identity_escape(ch) => {
// Inside a character class an escaped hyphen `\-` is always a
// literal hyphen, but the Rust `regex` crate reads a bare `-`
Expand Down Expand Up @@ -497,3 +581,49 @@ pub(super) fn js_regex_to_rust(pattern: &str) -> String {
}
result
}

#[cfg(test)]
mod tests {
use super::js_regex_to_rust;

#[test]
fn surrogate_property_rewrites_to_never_match() {
// #4884: the Rust `regex` crate matches Unicode scalar values, which
// exclude surrogate code points, so it rejects `\p{Surrogate}` outright.
// The positive form is rewritten to a never-matching class and the
// negation to "any scalar value".
assert_eq!(js_regex_to_rust(r"\p{Surrogate}"), r"[^\s\S]");
assert_eq!(js_regex_to_rust(r"\P{Surrogate}"), r"[\s\S]");
// The `gc=Cs` / `General_Category=Surrogate` spellings normalize the same.
assert_eq!(js_regex_to_rust(r"\p{gc=Cs}"), r"[^\s\S]");
assert_eq!(
js_regex_to_rust(r"\p{General_Category=Surrogate}"),
r"[^\s\S]"
);
// Inside a class the positive form drops (a never-matching member adds
// nothing to the union); the negation contributes "any scalar value".
assert_eq!(
js_regex_to_rust(r"[\p{Control}\p{Surrogate}]"),
r"[\p{Control}]"
);
assert_eq!(js_regex_to_rust(r"[\P{Surrogate}]"), r"[\s\S]");
// Every other property passes through to the crate unchanged.
assert_eq!(js_regex_to_rust(r"\p{Control}"), r"\p{Control}");
assert_eq!(js_regex_to_rust(r"\p{Script=Greek}"), r"\p{Script=Greek}");
assert_eq!(js_regex_to_rust(r"\pL"), r"\pL");

// The two `string-width@7+` module-top-level regexes (→ ink, #348) that
// threw `SyntaxError: invalid pattern` at import must now compile under
// the Rust `regex` crate.
for pat in [
r"^(?:\p{Default_Ignorable_Code_Point}|\p{Control}|\p{Format}|\p{Mark}|\p{Surrogate})+$",
r"^[\p{Default_Ignorable_Code_Point}\p{Control}\p{Format}\p{Mark}\p{Surrogate}]+",
] {
let translated = js_regex_to_rust(pat);
assert!(
regex::Regex::new(&translated).is_ok(),
"string-width pattern failed to compile: {pat} -> {translated}"
);
}
}
}
Loading