diff --git a/crates/lib-dialects/src/ansi.rs b/crates/lib-dialects/src/ansi.rs index 3de1387e8..002c2a2c2 100644 --- a/crates/lib-dialects/src/ansi.rs +++ b/crates/lib-dialects/src/ansi.rs @@ -356,15 +356,18 @@ pub fn raw_dialect() -> Dialect { let pattern = reserved_keywords.iter().join("|"); let anti_template = format!("^({pattern})$"); - RegexParser::new("[A-Z0-9_]*[A-Z][A-Z0-9_]*", SyntaxKind::NakedIdentifier) - .anti_template(&anti_template) - .to_matchable() + RegexParser::new( + "[\\p{L}\\p{N}_]*[\\p{L}][\\p{L}\\p{N}_]*", + SyntaxKind::NakedIdentifier, + ) + .anti_template(&anti_template) + .to_matchable() }) .into(), ), ( "ParameterNameSegment".into(), - RegexParser::new(r#"\"?[A-Z][A-Z0-9_]*\"?"#, SyntaxKind::Parameter) + RegexParser::new(r#"\"?[\p{L}][\p{L}\p{N}_]*\"?"#, SyntaxKind::Parameter) .to_matchable() .into(), ), @@ -383,7 +386,7 @@ pub fn raw_dialect() -> Dialect { let anti_template = format!("^({})$", "NOT"); one_of(vec![ - RegexParser::new("[A-Z_][A-Z0-9_]*", SyntaxKind::DataTypeIdentifier) + RegexParser::new("[\\p{L}_][\\p{L}\\p{N}_]*", SyntaxKind::DataTypeIdentifier) .anti_template(&anti_template) .to_matchable(), Ref::new("SingleIdentifierGrammar") @@ -5370,7 +5373,7 @@ fn lexer_matchers() -> Vec { Matcher::string("end_curly_bracket", "}", SyntaxKind::EndCurlyBracket), Matcher::string("colon", ":", SyntaxKind::Colon), Matcher::string("semicolon", ";", SyntaxKind::Semicolon), - Matcher::regex("word", "[0-9a-zA-Z_]+", SyntaxKind::Word), + Matcher::regex("word", "[\\p{L}\\p{N}_]+", SyntaxKind::Word), ] } diff --git a/crates/lib-dialects/src/postgres.rs b/crates/lib-dialects/src/postgres.rs index bbac58992..26cec7d19 100644 --- a/crates/lib-dialects/src/postgres.rs +++ b/crates/lib-dialects/src/postgres.rs @@ -105,7 +105,7 @@ pub fn raw_dialect() -> Dialect { SyntaxKind::SingleQuote, ), Matcher::regex("double_quote", r#"(?s)".+?""#, SyntaxKind::DoubleQuote), - Matcher::regex("word", r"[a-zA-Z_][0-9a-zA-Z_$]*", SyntaxKind::Word), + Matcher::regex("word", r"[\p{L}_][\p{L}\p{N}_$]*", SyntaxKind::Word), ]); let keywords = postgres_keywords(); @@ -402,7 +402,7 @@ pub fn raw_dialect() -> Dialect { let anti_template = format!("^({pattern})$"); RegexParser::new( - r"([A-Z_]+|[0-9]+[A-Z_$])[A-Z0-9_$]*", + r"([\p{L}_]+|[\p{N}]+[\p{L}_$])[\p{L}\p{N}_$]*", SyntaxKind::NakedIdentifier, ) .anti_template(&anti_template) @@ -412,15 +412,21 @@ pub fn raw_dialect() -> Dialect { ), ( "ParameterNameSegment".into(), - RegexParser::new(r#"[A-Z_][A-Z0-9_$]*|\"[^\"]*\""#, SyntaxKind::Parameter) - .to_matchable() - .into(), + RegexParser::new( + r#"[\p{L}_][\p{L}\p{N}_$]*|\"[^\"]*\""#, + SyntaxKind::Parameter, + ) + .to_matchable() + .into(), ), ( "FunctionNameIdentifierSegment".into(), - RegexParser::new(r"[A-Z_][A-Z0-9_$]*", SyntaxKind::FunctionNameIdentifier) - .to_matchable() - .into(), + RegexParser::new( + r"[\p{L}_][\p{L}\p{N}_$]*", + SyntaxKind::FunctionNameIdentifier, + ) + .to_matchable() + .into(), ), ( "FunctionContentsExpressionGrammar".into(), diff --git a/crates/lib-dialects/test/fixtures/dialects/ansi/sqruff/unicode_identifiers.sql b/crates/lib-dialects/test/fixtures/dialects/ansi/sqruff/unicode_identifiers.sql new file mode 100644 index 000000000..420957387 --- /dev/null +++ b/crates/lib-dialects/test/fixtures/dialects/ansi/sqruff/unicode_identifiers.sql @@ -0,0 +1,7 @@ +SELECT 日本語 FROM テーブル; + +SELECT 表1.列1 FROM 表1; + +SELECT café FROM naïve_table; + +SELECT über, straße FROM données; diff --git a/crates/lib-dialects/test/fixtures/dialects/ansi/sqruff/unicode_identifiers.yml b/crates/lib-dialects/test/fixtures/dialects/ansi/sqruff/unicode_identifiers.yml new file mode 100644 index 000000000..1ab6835c3 --- /dev/null +++ b/crates/lib-dialects/test/fixtures/dialects/ansi/sqruff/unicode_identifiers.yml @@ -0,0 +1,67 @@ +file: +- statement: + - select_statement: + - select_clause: + - keyword: SELECT + - select_clause_element: + - column_reference: + - naked_identifier: 日本語 + - from_clause: + - keyword: FROM + - from_expression: + - from_expression_element: + - table_expression: + - table_reference: + - naked_identifier: テーブル +- statement_terminator: ; +- statement: + - select_statement: + - select_clause: + - keyword: SELECT + - select_clause_element: + - column_reference: + - naked_identifier: 表1 + - dot: . + - naked_identifier: 列1 + - from_clause: + - keyword: FROM + - from_expression: + - from_expression_element: + - table_expression: + - table_reference: + - naked_identifier: 表1 +- statement_terminator: ; +- statement: + - select_statement: + - select_clause: + - keyword: SELECT + - select_clause_element: + - column_reference: + - naked_identifier: café + - from_clause: + - keyword: FROM + - from_expression: + - from_expression_element: + - table_expression: + - table_reference: + - naked_identifier: naïve_table +- statement_terminator: ; +- statement: + - select_statement: + - select_clause: + - keyword: SELECT + - select_clause_element: + - column_reference: + - naked_identifier: über + - comma: ',' + - select_clause_element: + - column_reference: + - naked_identifier: straße + - from_clause: + - keyword: FROM + - from_expression: + - from_expression_element: + - table_expression: + - table_reference: + - naked_identifier: données +- statement_terminator: ; diff --git a/crates/lib-dialects/test/fixtures/dialects/duckdb/sqruff/unicode_identifiers.sql b/crates/lib-dialects/test/fixtures/dialects/duckdb/sqruff/unicode_identifiers.sql new file mode 100644 index 000000000..420957387 --- /dev/null +++ b/crates/lib-dialects/test/fixtures/dialects/duckdb/sqruff/unicode_identifiers.sql @@ -0,0 +1,7 @@ +SELECT 日本語 FROM テーブル; + +SELECT 表1.列1 FROM 表1; + +SELECT café FROM naïve_table; + +SELECT über, straße FROM données; diff --git a/crates/lib-dialects/test/fixtures/dialects/duckdb/sqruff/unicode_identifiers.yml b/crates/lib-dialects/test/fixtures/dialects/duckdb/sqruff/unicode_identifiers.yml new file mode 100644 index 000000000..1ab6835c3 --- /dev/null +++ b/crates/lib-dialects/test/fixtures/dialects/duckdb/sqruff/unicode_identifiers.yml @@ -0,0 +1,67 @@ +file: +- statement: + - select_statement: + - select_clause: + - keyword: SELECT + - select_clause_element: + - column_reference: + - naked_identifier: 日本語 + - from_clause: + - keyword: FROM + - from_expression: + - from_expression_element: + - table_expression: + - table_reference: + - naked_identifier: テーブル +- statement_terminator: ; +- statement: + - select_statement: + - select_clause: + - keyword: SELECT + - select_clause_element: + - column_reference: + - naked_identifier: 表1 + - dot: . + - naked_identifier: 列1 + - from_clause: + - keyword: FROM + - from_expression: + - from_expression_element: + - table_expression: + - table_reference: + - naked_identifier: 表1 +- statement_terminator: ; +- statement: + - select_statement: + - select_clause: + - keyword: SELECT + - select_clause_element: + - column_reference: + - naked_identifier: café + - from_clause: + - keyword: FROM + - from_expression: + - from_expression_element: + - table_expression: + - table_reference: + - naked_identifier: naïve_table +- statement_terminator: ; +- statement: + - select_statement: + - select_clause: + - keyword: SELECT + - select_clause_element: + - column_reference: + - naked_identifier: über + - comma: ',' + - select_clause_element: + - column_reference: + - naked_identifier: straße + - from_clause: + - keyword: FROM + - from_expression: + - from_expression_element: + - table_expression: + - table_reference: + - naked_identifier: données +- statement_terminator: ; diff --git a/crates/lib-dialects/test/fixtures/dialects/postgres/sqruff/unicode_identifiers.sql b/crates/lib-dialects/test/fixtures/dialects/postgres/sqruff/unicode_identifiers.sql new file mode 100644 index 000000000..420957387 --- /dev/null +++ b/crates/lib-dialects/test/fixtures/dialects/postgres/sqruff/unicode_identifiers.sql @@ -0,0 +1,7 @@ +SELECT 日本語 FROM テーブル; + +SELECT 表1.列1 FROM 表1; + +SELECT café FROM naïve_table; + +SELECT über, straße FROM données; diff --git a/crates/lib-dialects/test/fixtures/dialects/postgres/sqruff/unicode_identifiers.yml b/crates/lib-dialects/test/fixtures/dialects/postgres/sqruff/unicode_identifiers.yml new file mode 100644 index 000000000..1ab6835c3 --- /dev/null +++ b/crates/lib-dialects/test/fixtures/dialects/postgres/sqruff/unicode_identifiers.yml @@ -0,0 +1,67 @@ +file: +- statement: + - select_statement: + - select_clause: + - keyword: SELECT + - select_clause_element: + - column_reference: + - naked_identifier: 日本語 + - from_clause: + - keyword: FROM + - from_expression: + - from_expression_element: + - table_expression: + - table_reference: + - naked_identifier: テーブル +- statement_terminator: ; +- statement: + - select_statement: + - select_clause: + - keyword: SELECT + - select_clause_element: + - column_reference: + - naked_identifier: 表1 + - dot: . + - naked_identifier: 列1 + - from_clause: + - keyword: FROM + - from_expression: + - from_expression_element: + - table_expression: + - table_reference: + - naked_identifier: 表1 +- statement_terminator: ; +- statement: + - select_statement: + - select_clause: + - keyword: SELECT + - select_clause_element: + - column_reference: + - naked_identifier: café + - from_clause: + - keyword: FROM + - from_expression: + - from_expression_element: + - table_expression: + - table_reference: + - naked_identifier: naïve_table +- statement_terminator: ; +- statement: + - select_statement: + - select_clause: + - keyword: SELECT + - select_clause_element: + - column_reference: + - naked_identifier: über + - comma: ',' + - select_clause_element: + - column_reference: + - naked_identifier: straße + - from_clause: + - keyword: FROM + - from_expression: + - from_expression_element: + - table_expression: + - table_reference: + - naked_identifier: données +- statement_terminator: ;