diff --git a/crates/lingui_macro/src/comment_directive/source_scanner.rs b/crates/lingui_macro/src/comment_directive/source_scanner.rs index 94bd03f..a006a4b 100644 --- a/crates/lingui_macro/src/comment_directive/source_scanner.rs +++ b/crates/lingui_macro/src/comment_directive/source_scanner.rs @@ -9,66 +9,311 @@ pub enum CommentKind { Block, } +/// Lexer context. JSX must be tracked because a quote or backtick inside JSX +/// text (e.g. `

'

` or ``

`

``) is literal content, not a string or +/// template literal. Treating it as a literal opener would scan ahead to the +/// next matching delimiter and swallow any comments — including lingui +/// directives — in between. +enum Ctx { + /// Ordinary JS/TS code. `brace_depth` counts nested `{`; when this frame is + /// a JSX expression container (`jsx_expr`), the `}` at depth 0 returns to + /// the enclosing JSX children. + Code { brace_depth: u32, jsx_expr: bool }, + /// Inside a JSX tag, between `<` and its closing `>` / `/>`. + JsxTag, + /// Inside JSX element children (text). Quotes, backticks and comment + /// markers are all literal text here. + JsxChildren, +} + +/// Extracts every line and block comment from `source`, in source order, with +/// byte offsets relative to the start of `source`. +/// +/// # Why this is a hand-rolled scanner +/// +/// Comments are needed to locate `lingui-set` / `lingui-reset` directives and +/// associate each with the byte position it precedes. SWC has of course already +/// parsed the file and knows every comment — but the `Comments` trait (and the +/// `PluginCommentsProxy` the plugin actually receives in the WASM host) only +/// supports *lookup by position*, not *enumeration*. There is no API to ask +/// "give me all comments in order", so we cannot reuse SWC's comment table here +/// and must recover the comments from the raw source ourselves. +/// +/// # Why it can't be a naive `//` / `/*` search +/// +/// Comment markers also appear inside string literals (`"https://…"`), template +/// literals, and regular text — so the scanner has to understand enough lexical +/// structure to skip those. The tricky part is JSX: a quote or backtick in JSX +/// *text* (`

'

`, ``

`

``) is literal content, not a string/template +/// opener. A flat scanner that skips every `'`/`"`/`` ` `` as a literal would, +/// on hitting such a character, scan ahead to the next matching delimiter and +/// swallow any comments — including directives — in between. That is exactly the +/// bug this scanner exists to avoid, so it tracks JSX context (see [`Ctx`]). +/// +/// # Limitations +/// +/// This is a pragmatic lexer, not a full parser. It does not handle regular +/// expression literals (a `/regex/` is treated as division, which at worst +/// misreads its contents but never the surrounding directives), and JSX +/// detection is heuristic (see [`is_jsx_tag_start`]). These trade-offs are +/// acceptable because the only consumers are lingui directive comments, and a +/// misclassification can at most cause a directive to be missed — never applied +/// to the wrong code. pub fn scan_source_comments(source: &str) -> Vec> { let bytes = source.as_bytes(); let mut comments = Vec::new(); let mut index = 0usize; + let mut stack: Vec = vec![Ctx::Code { + brace_depth: 0, + jsx_expr: false, + }]; + // Last significant (non-whitespace, non-comment) code byte. Used to decide + // whether a `<` opens a JSX element (expression position) or is a + // comparison / TS generic (after a value). + let mut last_sig = 0u8; + // Whether the last significant identifier was an expression-starting keyword + // (e.g. `return`, `yield`). Tracked forward — rather than re-scanned + // backward from each `<` — so that comments between the keyword and the `<` + // (`return /* c */
`) are transparently skipped, since comment bytes + // never update this flag. Only consulted when `last_sig` is identifier-like. + let mut prev_is_expr_keyword = false; while index < bytes.len() { - match bytes[index] { - b'\'' | b'"' => { - index = skip_string_literal(bytes, index); - } - b'`' => { - index = skip_template_literal(bytes, index); - } - b'/' if bytes.get(index + 1) == Some(&b'/') => { - let comment_start = index; - let content_start = index + 2; - index = content_start; - - while index < bytes.len() && bytes[index] != b'\n' { - index += 1; + match stack.last_mut().expect("scanner stack is never empty") { + Ctx::Code { + brace_depth, + jsx_expr, + } => match bytes[index] { + b'\'' | b'"' => { + index = skip_string_literal(bytes, index); + last_sig = b'"'; } + b'`' => { + index = skip_template_literal(bytes, index); + last_sig = b'`'; + } + b'/' if bytes.get(index + 1) == Some(&b'/') => { + let comment_start = index; + let content_start = index + 2; + index = content_start; - comments.push(SourceComment { - byte_offset: comment_start, - content: &source[content_start..index], - kind: CommentKind::Line, - }); - } - b'/' if bytes.get(index + 1) == Some(&b'*') => { - let comment_start = index; - let content_start = index + 2; - index = content_start; - - while index < bytes.len() { - if bytes[index] == b'*' && bytes.get(index + 1) == Some(&b'/') { - break; + while index < bytes.len() && bytes[index] != b'\n' { + index += 1; } - index += 1; + + comments.push(SourceComment { + byte_offset: comment_start, + content: &source[content_start..index], + kind: CommentKind::Line, + }); } + b'/' if bytes.get(index + 1) == Some(&b'*') => { + let comment_start = index; + let content_start = index + 2; + index = content_start; + + while index < bytes.len() { + if bytes[index] == b'*' && bytes.get(index + 1) == Some(&b'/') { + break; + } + index += 1; + } + + let content_end = index; + if index < bytes.len() { + index += 2; + } - let content_end = index; - if index < bytes.len() { + comments.push(SourceComment { + byte_offset: comment_start, + content: &source[content_start..content_end], + kind: CommentKind::Block, + }); + } + b'{' => { + *brace_depth += 1; + last_sig = b'{'; + index += 1; + } + b'}' => { + if *brace_depth > 0 { + *brace_depth -= 1; + } else if *jsx_expr { + // Closes a JSX expression container; back to children. + stack.pop(); + } + last_sig = b'}'; + index += 1; + } + b'<' if is_jsx_tag_start(bytes, index, last_sig, prev_is_expr_keyword) => { + stack.push(Ctx::JsxTag); + index += 1; + } + b if is_ident_byte(b) => { + // Consume the whole identifier/number run and record whether + // it is an expression-starting keyword. A keyword reached via + // member access (`obj.return`) is a property name, not the + // keyword, so it does not count. + let start = index; + while index < bytes.len() && is_ident_byte(bytes[index]) { + index += 1; + } + last_sig = bytes[index - 1]; + prev_is_expr_keyword = !preceded_by_dot(bytes, start) + && is_expression_keyword(&bytes[start..index]); + } + b' ' | b'\t' | b'\r' | b'\n' => { + index += 1; + } + other => { + last_sig = other; + prev_is_expr_keyword = false; + index += 1; + } + }, + Ctx::JsxTag => match bytes[index] { + b'\'' | b'"' => { + index = skip_string_literal(bytes, index); + } + b'{' => { + stack.push(Ctx::Code { + brace_depth: 0, + jsx_expr: true, + }); + index += 1; + } + b'/' if bytes.get(index + 1) == Some(&b'>') => { + // Self-closing tag: no children. + stack.pop(); index += 2; + last_sig = b']'; } - - comments.push(SourceComment { - byte_offset: comment_start, - content: &source[content_start..content_end], - kind: CommentKind::Block, - }); - } - _ => { - index += 1; - } + b'>' => { + stack.pop(); + stack.push(Ctx::JsxChildren); + index += 1; + } + b',' | b';' => { + // Not a real JSX tag (e.g. a TS generic such as ``). + // Pop only the tentative tag frame, returning to the actual + // surrounding context (the code frame that saw the `<`, + // which may itself be a JSX expression container with its own + // brace depth), and re-scan this byte there. + stack.pop(); + } + _ => { + index += 1; + } + }, + Ctx::JsxChildren => match bytes[index] { + b'{' => { + stack.push(Ctx::Code { + brace_depth: 0, + jsx_expr: true, + }); + index += 1; + } + b'<' if bytes.get(index + 1) == Some(&b'/') => { + // Closing tag ``: skip to `>` and leave children. + index += 2; + while index < bytes.len() && bytes[index] != b'>' { + index += 1; + } + if index < bytes.len() { + index += 1; + } + stack.pop(); + last_sig = b']'; + } + b'<' => { + // Nested element open tag. + stack.push(Ctx::JsxTag); + index += 1; + } + _ => { + // Text content — quotes/backticks/`//` are literal here. + index += 1; + } + }, } } comments } +fn is_ident_byte(b: u8) -> bool { + b.is_ascii_alphanumeric() || b == b'_' || b == b'$' +} + +/// Keywords after which a `<` begins an expression, so it may open a JSX element +/// (`return
`, `export default `, `yield `, …). In a comparison or +/// generic the byte before `<` is an operand identifier, never one of these, so +/// allowing them does not reintroduce false positives. +fn is_expression_keyword(word: &[u8]) -> bool { + matches!( + word, + b"return" + | b"yield" + | b"await" + | b"void" + | b"typeof" + | b"delete" + | b"throw" + | b"do" + | b"else" + | b"case" + | b"default" + | b"in" + | b"of" + | b"instanceof" + ) +} + +/// A byte that ends a value/expression, after which a `<` is a comparison or TS +/// generic rather than the start of a JSX element. +fn is_value_ending(b: u8) -> bool { + is_ident_byte(b) || matches!(b, b')' | b']' | b'.' | b'"' | b'\'' | b'`') +} + +/// Whether the `<` at `index` (in code context) opens a JSX element. Requires an +/// expression position and a following tag-name char or `>` (fragment). This +/// matches `.tsx` semantics, where a bare `` is JSX and a generic arrow must +/// be written `` (handled by the `,` bail in `JsxTag`). +/// +/// `last_sig` is the preceding significant byte; `prev_is_expr_keyword` is set by +/// the caller when that byte ends an expression-starting keyword (e.g. `return`). +/// When the preceding byte is identifier-like it is normally a value (so `<` is a +/// comparison/generic) — unless it was such a keyword. +fn is_jsx_tag_start(bytes: &[u8], index: usize, last_sig: u8, prev_is_expr_keyword: bool) -> bool { + let next_is_tag = match bytes.get(index + 1) { + Some(&b) => b.is_ascii_alphabetic() || b == b'_' || b == b'$' || b == b'>', + None => false, + }; + if !next_is_tag { + return false; + } + + if !is_value_ending(last_sig) { + return true; + } + + // The preceding byte ends a value. Only an identifier ending might actually + // be an expression-starting keyword (e.g. `return`); other value endings + // (`)`, `]`, `.`, quotes) never are. + is_ident_byte(last_sig) && prev_is_expr_keyword +} + +/// Whether the identifier starting at `start` is reached via member access +/// (`obj.return`), in which case it is a property name, not a keyword. Skips +/// whitespace so `obj. return` is also recognised as a property access. +fn preceded_by_dot(bytes: &[u8], start: usize) -> bool { + let mut i = start; + while i > 0 && matches!(bytes[i - 1], b' ' | b'\t' | b'\r' | b'\n') { + i -= 1; + } + i > 0 && bytes[i - 1] == b'.' +} + fn skip_string_literal(bytes: &[u8], start: usize) -> usize { let delim = bytes[start]; let mut i = start + 1; @@ -77,6 +322,14 @@ fn skip_string_literal(bytes: &[u8], start: usize) -> usize { i = (i + 2).min(bytes.len()); } else if bytes[i] == delim { return i + 1; + } else if bytes[i] == b'\n' { + // A single/double-quoted JS string cannot contain an unescaped + // newline. Hitting one means the opening quote was not a string + // delimiter (e.g. an apostrophe in JSX text like `'`). + // Treat it as a plain character so scanning resumes — otherwise we + // would skip ahead to the next stray quote and swallow any comments + // (including lingui directives) in between. + return start + 1; } else { i += 1; } @@ -312,6 +565,96 @@ mod tests { assert_eq!(line_comments(source), vec![(39, " real")]); } + #[test] + fn quote_in_jsx_text_does_not_swallow_following_comment() { + // The apostrophe is JSX text, not a string opener. A real string can't + // span the newline, so the following comment must still be found. + let source = "const x = ';\n// real"; + assert_eq!(line_comments(source), vec![(28, " real")]); + } + + #[test] + fn quote_in_jsx_text_after_return_keyword_does_not_swallow_comment() { + // `return` is an expression-starting keyword, so the `<` opens JSX even + // though the byte before it is identifier-like. + let source = "function f() { return
'
; }\n// real"; + assert_eq!(line_comments(source), vec![(38, " real")]); + } + + #[test] + fn jsx_after_export_default_is_detected() { + let source = "export default ';\n// real"; + assert_eq!(line_comments(source), vec![(29, " real")]); + } + + #[test] + fn identifier_ending_in_keyword_is_not_treated_as_jsx() { + // `myreturn` is a value identifier, so `<` is a comparison (the lack of + // a space still forces the keyword check via `next_is_tag`), and the + // quoted text after it is a normal string literal (not JSX text). + let source = "const a = myreturn` inside a JSX expression container is a generic arrow, not a + // nested element. Bailing must return to *that container* so its closing + // `}` returns to JSX children. Otherwise the lone backtick in the JSX + // text below would be treated as a template opener in stray code and run + // away to EOF, swallowing the comment. + let source = "const x =

{((v)=>v)}`

;\n// real"; + assert_eq!(line_comments(source), vec![(34, " real")]); + } + + #[test] + fn backtick_in_jsx_text_does_not_swallow_following_comment() { + // A backtick is JSX text, not a template-literal opener. Real templates + // span newlines, so only JSX awareness (not a newline bail) saves us. + let source = "const x = () =>

`

;\n// real"; + assert_eq!(line_comments(source), vec![(26, " real")]); + } + + #[test] + fn template_literal_inside_jsx_expression_is_still_skipped() { + // Inside a JSX expression container `{...}` we are back in code, so a + // real template literal must still be skipped (its `//` is not a comment). + let source = "const x =

{`// not a comment`}

;\n// real"; + assert_eq!(line_comments(source), vec![(39, " real")]); + } + + #[test] + fn ts_generic_is_not_mistaken_for_jsx() { + // `Record` is a generic, not JSX; a following comment and + // its quotes must scan normally. + let source = "const x: Record = {};\n// real"; + assert_eq!(line_comments(source), vec![(33, " real")]); + } + + #[test] + fn lone_quote_before_newline_is_not_a_string() { + // A lone quote followed by a newline is not a string literal, so a + // comment on a later line must still be discovered. + let source = "a = ';\nb = '; // c\n"; + assert_eq!(line_comments(source), vec![(14, " c")]); + } + #[test] fn string_containing_backslash_n_is_not_newline() { // The literal text \n in a string (escaped), not a real newline diff --git a/crates/lingui_macro/tests/lingui_directive.rs b/crates/lingui_macro/tests/lingui_directive.rs index 6865119..5aac9ea 100644 --- a/crates/lingui_macro/tests/lingui_directive.rs +++ b/crates/lingui_macro/tests/lingui_directive.rs @@ -253,6 +253,29 @@ to!( "# ); +to!( + jsx_trans_with_unclosed_quotes, + LinguiOptions { + id_prefix_leader: Some(".".into()), + ..Default::default() + }, + r#" + // lingui-set idPrefix="root" + import type { MessageDescriptor } from '@lingui/core' + import { msg, t } from '@lingui/core/macro' + import { Trans } from '@lingui/react/macro' + + const X = () =>

'

+ const Y = () =>

`

+ + // lingui-set idPrefix="different" + const different = { + a: msg({ id: '.a', message: `different a` }), + b: msg({ id: '.b', message: `different b` }), + } as const satisfies Record + "# +); + to!( jsx_trans_with_directive_context, r#" diff --git a/crates/lingui_macro/tests/snapshots/lingui_directive__jsx_trans_with_unclosed_quotes.snap b/crates/lingui_macro/tests/snapshots/lingui_directive__jsx_trans_with_unclosed_quotes.snap new file mode 100644 index 0000000..47d14ec --- /dev/null +++ b/crates/lingui_macro/tests/snapshots/lingui_directive__jsx_trans_with_unclosed_quotes.snap @@ -0,0 +1,36 @@ +--- +source: crates/lingui_macro/tests/lingui_directive.rs +info: + id_prefix_leader: "." +--- +// lingui-set idPrefix="root" +import type { MessageDescriptor } from '@lingui/core' +import { msg, t } from '@lingui/core/macro' +import { Trans } from '@lingui/react/macro' + +const X = () =>

'

+const Y = () =>

`

+ +// lingui-set idPrefix="different" +const different = { + a: msg({ id: '.a', message: `different a` }), + b: msg({ id: '.b', message: `different b` }), +} as const satisfies Record + +↓ ↓ ↓ ↓ ↓ ↓ + +// lingui-set idPrefix="root" +import type { MessageDescriptor } from '@lingui/core'; +const X = ()=>

'

; +const Y = ()=>

`

; +// lingui-set idPrefix="different" +const different = { + a: /*i18n*/ { + id: "different.a", + message: "different a" + }, + b: /*i18n*/ { + id: "different.b", + message: "different b" + } +} as const satisfies Record;