From 35f6c2cb1a35c2a78dd742450e02ab7613e29d2e Mon Sep 17 00:00:00 2001
From: Micha Reiser
Date: Mon, 27 Apr 2026 15:21:53 +0200
Subject: [PATCH] ASCII identifier fast path

---
 crates/ruff_python_parser/src/lexer.rs        | 62 +++++++++++++------
 ...ests__unicode_identifier_continuation.snap | 30 +++++++++
 2 files changed, 72 insertions(+), 20 deletions(-)
 create mode 100644 crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__unicode_identifier_continuation.snap

diff --git a/crates/ruff_python_parser/src/lexer.rs b/crates/ruff_python_parser/src/lexer.rs
index 8eb9f672d01aec..28a8869b43cc3f 100644
--- a/crates/ruff_python_parser/src/lexer.rs
+++ b/crates/ruff_python_parser/src/lexer.rs
@@ -692,9 +692,7 @@ impl<'src> Lexer<'src> {
         // We need to therefore do the same in our lexer, but applying NFKC normalization
         // unconditionally is extremely expensive. If we know an identifier is ASCII-only,
         // (by far the most common case), we can skip NFKC normalization of the identifier.
-        let mut is_ascii = first.is_ascii();
-        self.cursor
-            .eat_while(|c| is_identifier_continuation(c, &mut is_ascii));
+        let is_ascii = self.eat_identifier_continuation(first.is_ascii());
 
         let text = self.token_text();
 
@@ -758,6 +756,37 @@ impl<'src> Lexer<'src> {
         }
     }
 
+    fn eat_identifier_continuation(&mut self, is_ascii: bool) -> bool {
+        self.eat_ascii_identifier_continuation();
+
+        if self.cursor.first().is_ascii() {
+            is_ascii
+        } else {
+            self.eat_unicode_identifier_continuation();
+            false
+        }
+    }
+
+    fn eat_ascii_identifier_continuation(&mut self) {
+        let bytes = self.cursor.rest().as_bytes();
+        let ascii_len = bytes
+            .iter()
+            .take_while(|&&byte| is_ascii_identifier_continuation_byte(byte))
+            .count();
+
+        if ascii_len > 0 {
+            self.cursor.skip_bytes(ascii_len);
+        }
+    }
+
+    #[cold]
+    fn eat_unicode_identifier_continuation(&mut self) {
+        while is_xid_continue(self.cursor.first()) {
+            self.cursor.bump();
+            self.eat_ascii_identifier_continuation();
+        }
+    }
+
     /// Try lexing the single character string prefix, updating the token flags accordingly.
     /// Returns `true` if it matches.
     fn try_single_char_prefix(&mut self, first: char) -> bool {
@@ -1834,29 +1863,16 @@ const fn is_ascii_identifier_start(c: char) -> bool {
     matches!(c, 'a'..='z' | 'A'..='Z' | '_')
 }
 
+const fn is_ascii_identifier_continuation_byte(byte: u8) -> bool {
+    matches!(byte, b'a'..=b'z' | b'A'..=b'Z' | b'_' | b'0'..=b'9')
+}
+
 // Checks if the character c is a valid starting character as described
 // in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
 fn is_unicode_identifier_start(c: char) -> bool {
     is_xid_start(c)
 }
 
-/// Checks if the character c is a valid continuation character as described
-/// in <https://docs.python.org/3/reference/lexical_analysis.html#identifiers>.
-///
-/// Additionally, this function also keeps track of whether or not the total
-/// identifier is ASCII-only or not by mutably altering a reference to a
-/// boolean value passed in.
-fn is_identifier_continuation(c: char, identifier_is_ascii_only: &mut bool) -> bool {
-    // Arrange things such that ASCII codepoints never
-    // result in the slower `is_xid_continue` getting called.
-    if c.is_ascii() {
-        matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9')
-    } else {
-        *identifier_is_ascii_only = false;
-        is_xid_continue(c)
-    }
-}
-
 enum LexedText<'a> {
     Source { source: &'a str, range: TextRange },
     Owned(String),
@@ -2430,6 +2446,12 @@ if first:
         assert_eq!(get_tokens_only(source1), get_tokens_only(source2));
     }
 
+    #[test]
+    fn test_unicode_identifier_continuation() {
+        let source = "a𝒞 = 500";
+        assert_snapshot!(lex_source(source));
+    }
+
     fn triple_quoted_eol(eol: &str) -> LexerOutput {
         let source = format!("\"\"\"{eol} test string{eol} \"\"\"");
         lex_source(&source)
diff --git a/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__unicode_identifier_continuation.snap b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__unicode_identifier_continuation.snap
new file mode 100644
index 00000000000000..7c8d057e391edb
--- /dev/null
+++ b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__unicode_identifier_continuation.snap
@@ -0,0 +1,30 @@
+---
+source: crates/ruff_python_parser/src/lexer.rs
+assertion_line: 2434
+expression: lex_source(source)
+---
+## Tokens
+```
+[
+    (
+        Name(
+            Name("aC"),
+        ),
+        0..5,
+    ),
+    (
+        Equal,
+        6..7,
+    ),
+    (
+        Int(
+            500,
+        ),
+        8..11,
+    ),
+    (
+        Newline,
+        11..11,
+    ),
+]
+```