From decffcb13528dd31f705f551b0177dee4829e1e1 Mon Sep 17 00:00:00 2001 From: Victor Hallberg Date: Thu, 18 Jun 2026 12:43:04 +0200 Subject: [PATCH 1/2] fix: make directive scanner handle quotes in JSX correctly --- .../src/comment_directive/source_scanner.rs | 389 ++++++++++++++++-- crates/lingui_macro/tests/lingui_directive.rs | 23 ++ ...ctive__jsx_trans_with_unclosed_quotes.snap | 36 ++ 3 files changed, 407 insertions(+), 41 deletions(-) create mode 100644 crates/lingui_macro/tests/snapshots/lingui_directive__jsx_trans_with_unclosed_quotes.snap diff --git a/crates/lingui_macro/src/comment_directive/source_scanner.rs b/crates/lingui_macro/src/comment_directive/source_scanner.rs index 94bd03f..9bafa32 100644 --- a/crates/lingui_macro/src/comment_directive/source_scanner.rs +++ b/crates/lingui_macro/src/comment_directive/source_scanner.rs @@ -9,66 +9,302 @@ pub enum CommentKind { Block, } +/// Lexer context. JSX must be tracked because a quote or backtick inside JSX +/// text (e.g. `

'

` or ``

`

``) is literal content, not a string or +/// template literal. Treating it as a literal opener would scan ahead to the +/// next matching delimiter and swallow any comments — including lingui +/// directives — in between. +enum Ctx { + /// Ordinary JS/TS code. `brace_depth` counts nested `{`; when this frame is + /// a JSX expression container (`jsx_expr`), the `}` at depth 0 returns to + /// the enclosing JSX children. + Code { brace_depth: u32, jsx_expr: bool }, + /// Inside a JSX tag, between `<` and its closing `>` / `/>`. + JsxTag, + /// Inside JSX element children (text). Quotes, backticks and comment + /// markers are all literal text here. + JsxChildren, +} + +/// Extracts every line and block comment from `source`, in source order, with +/// byte offsets relative to the start of `source`. +/// +/// # Why this is a hand-rolled scanner +/// +/// Comments are needed to locate `lingui-set` / `lingui-reset` directives and +/// associate each with the byte position it precedes. SWC has of course already +/// parsed the file and knows every comment — but the `Comments` trait (and the +/// `PluginCommentsProxy` the plugin actually receives in the WASM host) only +/// supports *lookup by position*, not *enumeration*. There is no API to ask +/// "give me all comments in order", so we cannot reuse SWC's comment table here +/// and must recover the comments from the raw source ourselves. +/// +/// # Why it can't be a naive `//` / `/*` search +/// +/// Comment markers also appear inside string literals (`"https://…"`), template +/// literals, and regular text — so the scanner has to understand enough lexical +/// structure to skip those. The tricky part is JSX: a quote or backtick in JSX +/// *text* (`

'

`, ``

`

``) is literal content, not a string/template +/// opener. A flat scanner that skips every `'`/`"`/`` ` `` as a literal would, +/// on hitting such a character, scan ahead to the next matching delimiter and +/// swallow any comments — including directives — in between. That is exactly the +/// bug this scanner exists to avoid, so it tracks JSX context (see [`Ctx`]). +/// +/// # Limitations +/// +/// This is a pragmatic lexer, not a full parser. It does not handle regular +/// expression literals (a `/regex/` is treated as division, which at worst +/// misreads its contents but never the surrounding directives), and JSX +/// detection is heuristic (see [`is_jsx_tag_start`]). These trade-offs are +/// acceptable because the only consumers are lingui directive comments, and a +/// misclassification can at most cause a directive to be missed — never applied +/// to the wrong code. pub fn scan_source_comments(source: &str) -> Vec> { let bytes = source.as_bytes(); let mut comments = Vec::new(); let mut index = 0usize; + let mut stack: Vec = vec![Ctx::Code { + brace_depth: 0, + jsx_expr: false, + }]; + // Last significant (non-whitespace, non-comment) code byte. Used to decide + // whether a `<` opens a JSX element (expression position) or is a + // comparison / TS generic (after a value). + let mut last_sig = 0u8; while index < bytes.len() { - match bytes[index] { - b'\'' | b'"' => { - index = skip_string_literal(bytes, index); - } - b'`' => { - index = skip_template_literal(bytes, index); - } - b'/' if bytes.get(index + 1) == Some(&b'/') => { - let comment_start = index; - let content_start = index + 2; - index = content_start; - - while index < bytes.len() && bytes[index] != b'\n' { - index += 1; + match stack.last_mut().expect("scanner stack is never empty") { + Ctx::Code { + brace_depth, + jsx_expr, + } => match bytes[index] { + b'\'' | b'"' => { + index = skip_string_literal(bytes, index); + last_sig = b'"'; } + b'`' => { + index = skip_template_literal(bytes, index); + last_sig = b'`'; + } + b'/' if bytes.get(index + 1) == Some(&b'/') => { + let comment_start = index; + let content_start = index + 2; + index = content_start; - comments.push(SourceComment { - byte_offset: comment_start, - content: &source[content_start..index], - kind: CommentKind::Line, - }); - } - b'/' if bytes.get(index + 1) == Some(&b'*') => { - let comment_start = index; - let content_start = index + 2; - index = content_start; - - while index < bytes.len() { - if bytes[index] == b'*' && bytes.get(index + 1) == Some(&b'/') { - break; + while index < bytes.len() && bytes[index] != b'\n' { + index += 1; } - index += 1; + + comments.push(SourceComment { + byte_offset: comment_start, + content: &source[content_start..index], + kind: CommentKind::Line, + }); } + b'/' if bytes.get(index + 1) == Some(&b'*') => { + let comment_start = index; + let content_start = index + 2; + index = content_start; + + while index < bytes.len() { + if bytes[index] == b'*' && bytes.get(index + 1) == Some(&b'/') { + break; + } + index += 1; + } + + let content_end = index; + if index < bytes.len() { + index += 2; + } - let content_end = index; - if index < bytes.len() { + comments.push(SourceComment { + byte_offset: comment_start, + content: &source[content_start..content_end], + kind: CommentKind::Block, + }); + } + b'{' => { + *brace_depth += 1; + last_sig = b'{'; + index += 1; + } + b'}' => { + if *brace_depth > 0 { + *brace_depth -= 1; + } else if *jsx_expr { + // Closes a JSX expression container; back to children. + stack.pop(); + } + last_sig = b'}'; + index += 1; + } + b'<' if is_jsx_tag_start(bytes, index, last_sig) => { + stack.push(Ctx::JsxTag); + index += 1; + } + b' ' | b'\t' | b'\r' | b'\n' => { + index += 1; + } + other => { + last_sig = other; + index += 1; + } + }, + Ctx::JsxTag => match bytes[index] { + b'\'' | b'"' => { + index = skip_string_literal(bytes, index); + } + b'{' => { + stack.push(Ctx::Code { + brace_depth: 0, + jsx_expr: true, + }); + index += 1; + } + b'/' if bytes.get(index + 1) == Some(&b'>') => { + // Self-closing tag: no children. + stack.pop(); index += 2; + last_sig = b']'; } - - comments.push(SourceComment { - byte_offset: comment_start, - content: &source[content_start..content_end], - kind: CommentKind::Block, - }); - } - _ => { - index += 1; - } + b'>' => { + stack.pop(); + stack.push(Ctx::JsxChildren); + index += 1; + } + b',' | b';' => { + // Not a real JSX tag (e.g. a TS generic such as ``): + // treat the `<` as an operator and resume scanning as code. + stack.pop(); + stack.push(Ctx::Code { + brace_depth: 0, + jsx_expr: false, + }); + } + _ => { + index += 1; + } + }, + Ctx::JsxChildren => match bytes[index] { + b'{' => { + stack.push(Ctx::Code { + brace_depth: 0, + jsx_expr: true, + }); + index += 1; + } + b'<' if bytes.get(index + 1) == Some(&b'/') => { + // Closing tag ``: skip to `>` and leave children. + index += 2; + while index < bytes.len() && bytes[index] != b'>' { + index += 1; + } + if index < bytes.len() { + index += 1; + } + stack.pop(); + last_sig = b']'; + } + b'<' => { + // Nested element open tag. + stack.push(Ctx::JsxTag); + index += 1; + } + _ => { + // Text content — quotes/backticks/`//` are literal here. + index += 1; + } + }, } } comments } +fn is_ident_byte(b: u8) -> bool { + b.is_ascii_alphanumeric() || b == b'_' || b == b'$' +} + +/// Keywords after which a `<` begins an expression, so it may open a JSX element +/// (`return
`, `export default `, `yield `, …). In a comparison or +/// generic the byte before `<` is an operand identifier, never one of these, so +/// allowing them does not reintroduce false positives. +fn is_expression_keyword(word: &[u8]) -> bool { + matches!( + word, + b"return" + | b"yield" + | b"await" + | b"void" + | b"typeof" + | b"delete" + | b"throw" + | b"do" + | b"else" + | b"case" + | b"default" + | b"in" + | b"of" + | b"instanceof" + ) +} + +/// A byte that ends a value/expression, after which a `<` is a comparison or TS +/// generic rather than the start of a JSX element. +fn is_value_ending(b: u8) -> bool { + is_ident_byte(b) || matches!(b, b')' | b']' | b'.' | b'"' | b'\'' | b'`') +} + +/// Whether the `<` at `index` (in code context) opens a JSX element. Requires an +/// expression position (per `last_sig`) and a following tag-name char or `>` +/// (fragment). This matches `.tsx` semantics, where a bare `` is JSX and a +/// generic arrow must be written `` (handled by the `,` bail in `JsxTag`). +/// +/// When the preceding byte is identifier-like it is normally a value (so `<` is +/// a comparison/generic), unless that identifier is an expression-starting +/// keyword such as `return` — handled by inspecting the full preceding word. +fn is_jsx_tag_start(bytes: &[u8], index: usize, last_sig: u8) -> bool { + let next_is_tag = match bytes.get(index + 1) { + Some(&b) => b.is_ascii_alphabetic() || b == b'_' || b == b'$' || b == b'>', + None => false, + }; + if !next_is_tag { + return false; + } + + if !is_value_ending(last_sig) { + return true; + } + + // The preceding byte ends a value. Only an identifier ending might actually + // be an expression-starting keyword (e.g. `return`); other value endings + // (`)`, `]`, `.`, quotes) never are. + if !is_ident_byte(last_sig) { + return false; + } + + is_expression_keyword(preceding_word(bytes, index)) +} + +/// The identifier immediately preceding `index`, skipping whitespace. Returns an +/// empty slice if the preceding token is not a bare identifier (e.g. a member +/// access like `obj.return`, which is a property name, not the keyword). +fn preceding_word(bytes: &[u8], index: usize) -> &[u8] { + let mut end = index; + while end > 0 && matches!(bytes[end - 1], b' ' | b'\t' | b'\r' | b'\n') { + end -= 1; + } + let mut start = end; + while start > 0 && is_ident_byte(bytes[start - 1]) { + start -= 1; + } + if start > 0 && bytes[start - 1] == b'.' { + return &[]; + } + &bytes[start..end] +} + fn skip_string_literal(bytes: &[u8], start: usize) -> usize { let delim = bytes[start]; let mut i = start + 1; @@ -77,6 +313,14 @@ fn skip_string_literal(bytes: &[u8], start: usize) -> usize { i = (i + 2).min(bytes.len()); } else if bytes[i] == delim { return i + 1; + } else if bytes[i] == b'\n' { + // A single/double-quoted JS string cannot contain an unescaped + // newline. Hitting one means the opening quote was not a string + // delimiter (e.g. an apostrophe in JSX text like `'`). + // Treat it as a plain character so scanning resumes — otherwise we + // would skip ahead to the next stray quote and swallow any comments + // (including lingui directives) in between. + return start + 1; } else { i += 1; } @@ -312,6 +556,69 @@ mod tests { assert_eq!(line_comments(source), vec![(39, " real")]); } + #[test] + fn quote_in_jsx_text_does_not_swallow_following_comment() { + // The apostrophe is JSX text, not a string opener. A real string can't + // span the newline, so the following comment must still be found. + let source = "const x = ';\n// real"; + assert_eq!(line_comments(source), vec![(28, " real")]); + } + + #[test] + fn quote_in_jsx_text_after_return_keyword_does_not_swallow_comment() { + // `return` is an expression-starting keyword, so the `<` opens JSX even + // though the byte before it is identifier-like. + let source = "function f() { return
'
; }\n// real"; + assert_eq!(line_comments(source), vec![(38, " real")]); + } + + #[test] + fn jsx_after_export_default_is_detected() { + let source = "export default ';\n// real"; + assert_eq!(line_comments(source), vec![(29, " real")]); + } + + #[test] + fn identifier_ending_in_keyword_is_not_treated_as_jsx() { + // `myreturn` is a value identifier, so `<` is a comparison (the lack of + // a space still forces the keyword check via `next_is_tag`), and the + // quoted text after it is a normal string literal (not JSX text). + let source = "const a = myreturn` is a generic, not JSX; a following comment and + // its quotes must scan normally. + let source = "const x: Record = {};\n// real"; + assert_eq!(line_comments(source), vec![(33, " real")]); + } + + #[test] + fn lone_quote_before_newline_is_not_a_string() { + // A lone quote followed by a newline is not a string literal, so a + // comment on a later line must still be discovered. + let source = "a = ';\nb = '; // c\n"; + assert_eq!(line_comments(source), vec![(14, " c")]); + } + #[test] fn string_containing_backslash_n_is_not_newline() { // The literal text \n in a string (escaped), not a real newline diff --git a/crates/lingui_macro/tests/lingui_directive.rs b/crates/lingui_macro/tests/lingui_directive.rs index 6865119..5aac9ea 100644 --- a/crates/lingui_macro/tests/lingui_directive.rs +++ b/crates/lingui_macro/tests/lingui_directive.rs @@ -253,6 +253,29 @@ to!( "# ); +to!( + jsx_trans_with_unclosed_quotes, + LinguiOptions { + id_prefix_leader: Some(".".into()), + ..Default::default() + }, + r#" + // lingui-set idPrefix="root" + import type { MessageDescriptor } from '@lingui/core' + import { msg, t } from '@lingui/core/macro' + import { Trans } from '@lingui/react/macro' + + const X = () =>

'

+ const Y = () =>

`

+ + // lingui-set idPrefix="different" + const different = { + a: msg({ id: '.a', message: `different a` }), + b: msg({ id: '.b', message: `different b` }), + } as const satisfies Record + "# +); + to!( jsx_trans_with_directive_context, r#" diff --git a/crates/lingui_macro/tests/snapshots/lingui_directive__jsx_trans_with_unclosed_quotes.snap b/crates/lingui_macro/tests/snapshots/lingui_directive__jsx_trans_with_unclosed_quotes.snap new file mode 100644 index 0000000..47d14ec --- /dev/null +++ b/crates/lingui_macro/tests/snapshots/lingui_directive__jsx_trans_with_unclosed_quotes.snap @@ -0,0 +1,36 @@ +--- +source: crates/lingui_macro/tests/lingui_directive.rs +info: + id_prefix_leader: "." +--- +// lingui-set idPrefix="root" +import type { MessageDescriptor } from '@lingui/core' +import { msg, t } from '@lingui/core/macro' +import { Trans } from '@lingui/react/macro' + +const X = () =>

'

+const Y = () =>

`

+ +// lingui-set idPrefix="different" +const different = { + a: msg({ id: '.a', message: `different a` }), + b: msg({ id: '.b', message: `different b` }), +} as const satisfies Record + +↓ ↓ ↓ ↓ ↓ ↓ + +// lingui-set idPrefix="root" +import type { MessageDescriptor } from '@lingui/core'; +const X = ()=>

'

; +const Y = ()=>

`

; +// lingui-set idPrefix="different" +const different = { + a: /*i18n*/ { + id: "different.a", + message: "different a" + }, + b: /*i18n*/ { + id: "different.b", + message: "different b" + } +} as const satisfies Record; From 7c39de07c0f45d9aa1291c7370035f90d75d29d9 Mon Sep 17 00:00:00 2001 From: Victor Hallberg Date: Thu, 18 Jun 2026 15:15:39 +0200 Subject: [PATCH 2/2] fix: address review comments --- .../src/comment_directive/source_scanner.rs | 104 ++++++++++++------ 1 file changed, 70 insertions(+), 34 deletions(-) diff --git a/crates/lingui_macro/src/comment_directive/source_scanner.rs b/crates/lingui_macro/src/comment_directive/source_scanner.rs index 9bafa32..a006a4b 100644 --- a/crates/lingui_macro/src/comment_directive/source_scanner.rs +++ b/crates/lingui_macro/src/comment_directive/source_scanner.rs @@ -71,6 +71,12 @@ pub fn scan_source_comments(source: &str) -> Vec> { // whether a `<` opens a JSX element (expression position) or is a // comparison / TS generic (after a value). let mut last_sig = 0u8; + // Whether the last significant identifier was an expression-starting keyword + // (e.g. `return`, `yield`). Tracked forward — rather than re-scanned + // backward from each `<` — so that comments between the keyword and the `<` + // (`return /* c */
`) are transparently skipped, since comment bytes + // never update this flag. Only consulted when `last_sig` is identifier-like. + let mut prev_is_expr_keyword = false; while index < bytes.len() { match stack.last_mut().expect("scanner stack is never empty") { @@ -139,15 +145,29 @@ pub fn scan_source_comments(source: &str) -> Vec> { last_sig = b'}'; index += 1; } - b'<' if is_jsx_tag_start(bytes, index, last_sig) => { + b'<' if is_jsx_tag_start(bytes, index, last_sig, prev_is_expr_keyword) => { stack.push(Ctx::JsxTag); index += 1; } + b if is_ident_byte(b) => { + // Consume the whole identifier/number run and record whether + // it is an expression-starting keyword. A keyword reached via + // member access (`obj.return`) is a property name, not the + // keyword, so it does not count. + let start = index; + while index < bytes.len() && is_ident_byte(bytes[index]) { + index += 1; + } + last_sig = bytes[index - 1]; + prev_is_expr_keyword = !preceded_by_dot(bytes, start) + && is_expression_keyword(&bytes[start..index]); + } b' ' | b'\t' | b'\r' | b'\n' => { index += 1; } other => { last_sig = other; + prev_is_expr_keyword = false; index += 1; } }, @@ -174,13 +194,12 @@ pub fn scan_source_comments(source: &str) -> Vec> { index += 1; } b',' | b';' => { - // Not a real JSX tag (e.g. a TS generic such as ``): - // treat the `<` as an operator and resume scanning as code. + // Not a real JSX tag (e.g. a TS generic such as ``). + // Pop only the tentative tag frame, returning to the actual + // surrounding context (the code frame that saw the `<`, + // which may itself be a JSX expression container with its own + // brace depth), and re-scan this byte there. stack.pop(); - stack.push(Ctx::Code { - brace_depth: 0, - jsx_expr: false, - }); } _ => { index += 1; @@ -257,14 +276,15 @@ fn is_value_ending(b: u8) -> bool { } /// Whether the `<` at `index` (in code context) opens a JSX element. Requires an -/// expression position (per `last_sig`) and a following tag-name char or `>` -/// (fragment). This matches `.tsx` semantics, where a bare `` is JSX and a -/// generic arrow must be written `` (handled by the `,` bail in `JsxTag`). +/// expression position and a following tag-name char or `>` (fragment). This +/// matches `.tsx` semantics, where a bare `` is JSX and a generic arrow must +/// be written `` (handled by the `,` bail in `JsxTag`). /// -/// When the preceding byte is identifier-like it is normally a value (so `<` is -/// a comparison/generic), unless that identifier is an expression-starting -/// keyword such as `return` — handled by inspecting the full preceding word. -fn is_jsx_tag_start(bytes: &[u8], index: usize, last_sig: u8) -> bool { +/// `last_sig` is the preceding significant byte; `prev_is_expr_keyword` is set by +/// the caller when that byte ends an expression-starting keyword (e.g. `return`). +/// When the preceding byte is identifier-like it is normally a value (so `<` is a +/// comparison/generic) — unless it was such a keyword. +fn is_jsx_tag_start(bytes: &[u8], index: usize, last_sig: u8, prev_is_expr_keyword: bool) -> bool { let next_is_tag = match bytes.get(index + 1) { Some(&b) => b.is_ascii_alphabetic() || b == b'_' || b == b'$' || b == b'>', None => false, @@ -280,29 +300,18 @@ fn is_jsx_tag_start(bytes: &[u8], index: usize, last_sig: u8) -> bool { // The preceding byte ends a value. Only an identifier ending might actually // be an expression-starting keyword (e.g. `return`); other value endings // (`)`, `]`, `.`, quotes) never are. - if !is_ident_byte(last_sig) { - return false; - } - - is_expression_keyword(preceding_word(bytes, index)) + is_ident_byte(last_sig) && prev_is_expr_keyword } -/// The identifier immediately preceding `index`, skipping whitespace. Returns an -/// empty slice if the preceding token is not a bare identifier (e.g. a member -/// access like `obj.return`, which is a property name, not the keyword). -fn preceding_word(bytes: &[u8], index: usize) -> &[u8] { - let mut end = index; - while end > 0 && matches!(bytes[end - 1], b' ' | b'\t' | b'\r' | b'\n') { - end -= 1; - } - let mut start = end; - while start > 0 && is_ident_byte(bytes[start - 1]) { - start -= 1; +/// Whether the identifier starting at `start` is reached via member access +/// (`obj.return`), in which case it is a property name, not a keyword. Skips +/// whitespace so `obj. return` is also recognised as a property access. +fn preceded_by_dot(bytes: &[u8], start: usize) -> bool { + let mut i = start; + while i > 0 && matches!(bytes[i - 1], b' ' | b'\t' | b'\r' | b'\n') { + i -= 1; } - if start > 0 && bytes[start - 1] == b'.' { - return &[]; - } - &bytes[start..end] + i > 0 && bytes[i - 1] == b'.' } fn skip_string_literal(bytes: &[u8], start: usize) -> usize { @@ -587,6 +596,33 @@ mod tests { assert_eq!(line_comments(source), vec![(38, " real")]); } + #[test] + fn keyword_property_access_is_not_treated_as_jsx() { + // `obj.return` is a property access (a value), so `<` is a comparison, + // not a JSX open tag. + let source = "const a = obj.return` inside a JSX expression container is a generic arrow, not a + // nested element. Bailing must return to *that container* so its closing + // `}` returns to JSX children. Otherwise the lone backtick in the JSX + // text below would be treated as a template opener in stray code and run + // away to EOF, swallowing the comment. + let source = "const x =

{((v)=>v)}`

;\n// real"; + assert_eq!(line_comments(source), vec![(34, " real")]); + } + #[test] fn backtick_in_jsx_text_does_not_swallow_following_comment() { // A backtick is JSX text, not a template-literal opener. Real templates