diff --git a/docs/grammar.md b/docs/grammar.md index e69de29..0a51c9c 100644 --- a/docs/grammar.md +++ b/docs/grammar.md @@ -0,0 +1,465 @@ +# SQL Grammar and Syntax Specification + +This document provides a technical explanation of the PenguinDB SQL grammar. The complete grammar definitions can be obtained in the following formats: + +## Case Insensitivity + +All SQL keywords and unquoted identifiers are case-insensitive. For example, keywords such as `SELECT`, `select`, and `SeLeCt` are evaluated identically. Similarly, unquoted table, column, and database names are resolved case-insensitively. String literals enclosed in single quotes preserve their exact character casing. + +## BNF Grammar + +```bnf + ::= | + ::= + + ::= | | + + ::= | | + ::= | | + ::= | | | + + ::= | + ::= + ::= | + + ::= + | + ::= | + + ::= + ::= | | | + ::= | + ::= | + ::= + ::= | + ::= + + ::= | + + ::= | + + ::= + | + | + | + | + | + | + + ::= + | + | + | + | + + ::= + | + | + + ::= + + ::= | + ::= | + ::= + ::= + + ::= | | + + ::= + | + | + | + + ::= | + + ::= + | + | + | + | + | + | + + ::= + | + | + | + | + + ::= + | + | + | + | + + ::= + | + | + + ::= | + ::= | | + ::= | + + ::= | + ::= | + ::= | | + ::= | + ::= + | + | + ::= + | | + | | + | | + + ::= + | + + ::= | + ::= | + ::= + ::= | + + ::= + | + + ::= | + ::= + + ::= + | + + ::= + ::= + ::= | + ::= | + ::= | + ::= | + ::= | | | | + + ::= + ::= + | + ::= + | + ::= + | + ::= + | + + ::= + | + | + | + | + | + + ::= + ::= | + ::= + + ::= + ::= | + ::= | | + + ::= + | + + ::= | | + ::= | | | + ::= | | + | + | | + + ::= + | + ::= + | | + | | + ::= | + + ::= | + + + ::= + | + | + | + | + | + + ::= | + ::= | | + | | | + + ::= | | | + ::= + ::= | + ::= | + ::= + ::= | | + ::= | + ::= + | + ::= | + ::= | + ::= (* any character from the source character set except *) + + ::= | + ::= "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" | "j" | "k" | "l" | "m" + | "n" | "o" | "p" | "q" | "r" | "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z" + ::= 
"A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" | "J" | "K" | "L" | "M" + | "N" | "O" | "P" | "Q" | "R" | "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z" + ::= "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" + + ::= "CREATE" + ::= "DATABASE" + ::= "USE" + ::= "DROP" + ::= "IF" + ::= "EXISTS" + + ::= "TABLE" + ::= "ALTER" + ::= "ADD" + ::= "COLUMN" + ::= "MODIFY" + ::= "RENAME" + ::= "TO" + + ::= "SELECT" + ::= "DISTINCT" + ::= "ALL" + ::= "FROM" + ::= "WHERE" + ::= "AS" + ::= "INSERT" + ::= "INTO" + ::= "VALUES" + ::= "UPDATE" + ::= "SET" + ::= "DELETE" + + ::= "JOIN" + ::= "INNER" + ::= "LEFT" + ::= "RIGHT" + ::= "FULL" + ::= "OUTER" + ::= "CROSS" + ::= "ON" + + ::= "GROUP" + ::= "HAVING" + ::= "ORDER" + ::= "BY" + ::= "ASC" + ::= "DESC" + ::= "LIMIT" + ::= "OFFSET" + + ::= "PRIMARY" + ::= "KEY" + ::= "NOT" + ::= "NULL" + ::= "DEFAULT" + ::= "UNIQUE" + ::= "REFERENCES" + + ::= "AND" + ::= "OR" + ::= "TRUE" + ::= "FALSE" + ::= "LIKE" + ::= "IS" + ::= "IN" + ::= "BETWEEN" + + ::= "INT" + ::= "BIGINT" + ::= "VARCHAR" + ::= "BOOLEAN" + ::= "TEXT" + ::= "TIMESTAMP" + + ::= "(" + ::= ")" + ::= "," + ::= "." + ::= ";" + ::= "'" + ::= "\"" + + ::= "=" + ::= "!=" | "<>" + ::= "<" + ::= ">" + ::= "<=" + ::= ">=" + + ::= "+" + ::= "-" + ::= "*" + ::= "/" + ::= "%" + + ::= "_" + + ::= ";" +``` + +## EBNF Form + +A more readable EBNF form of the grammar is given below: + +```ebnf +Program ::= Statement+ +Statement ::= ManipulationStatement ';' + +ManipulationStatement ::= DbManipulationStatement + | TableManipulationStatement + | DataManipulationStatement + +DbManipulationStatement ::= 'CREATE' 'DATABASE' ( 'IF' 'NOT' 'EXISTS' )? Identifier + | 'DROP' 'DATABASE' ( 'IF' 'EXISTS' )? 
Identifier + | 'USE' Identifier +TableManipulationStatement ::= CreateTableStatement | AlterTableStatement | DropTableStatement +DataManipulationStatement ::= InsertStatement | SelectStatement | UpdateStatement | DeleteStatement + +CreateTableStatement ::= 'CREATE' 'TABLE' ( 'IF' 'NOT' 'EXISTS' )? Identifier '(' ColumnDefinition ( ',' ColumnDefinition )* ')' + +AlterTableStatement ::= 'ALTER' 'TABLE' Identifier AlterAction +AlterAction ::= ( 'ADD' | 'MODIFY' ) 'COLUMN'? ColumnDefinition + | 'RENAME' ( 'TO' Identifier | 'COLUMN' Identifier 'TO' Identifier ) + | 'DROP' 'COLUMN' Identifier + +DropTableStatement ::= 'DROP' 'TABLE' ( 'IF' 'EXISTS' )? Identifier + +ColumnDefinition ::= Identifier DataType ColumnConstraints? + +ColumnConstraints ::= KeyConstraint NullConstraint? DefaultConstraint? ForeignConstraint? + | NullConstraint DefaultConstraint? ForeignConstraint? + | DefaultConstraint ForeignConstraint? + | ForeignConstraint + +KeyConstraint ::= 'PRIMARY' 'KEY' | 'UNIQUE' +NullConstraint ::= 'NOT' 'NULL' | 'NULL' +DefaultConstraint ::= 'DEFAULT' SignedLiteral +ForeignConstraint ::= 'REFERENCES' Identifier '(' Identifier ')' + +SignedLiteral ::= Literal | ( '+' | '-' ) NumericLiteral + +SelectStatement ::= 'SELECT' ( 'DISTINCT' | 'ALL' )? SelectList + 'FROM' TableReference ( ',' TableReference )* + WhereClause? + GroupByClause? + HavingClause? + OrderByClause? + LimitClause? +SelectList ::= SelectColumn ( ',' SelectColumn )* +SelectColumn ::= '*' | SelectExpression ( 'AS' Identifier )? +SelectExpression ::= Expression | Condition + +InsertStatement ::= 'INSERT' 'INTO' Identifier + ( '(' Identifier ( ',' Identifier )* ')' )? + 'VALUES' ValueRow ( ',' ValueRow )* + +ValueRow ::= '(' Expression ( ',' Expression )* ')' + +UpdateStatement ::= 'UPDATE' Identifier 'SET' SetItem ( ',' SetItem )* WhereClause? +SetItem ::= QualifiedIdentifier '=' Expression + +DeleteStatement ::= 'DELETE' 'FROM' Identifier WhereClause? 
+ +TableReference ::= TablePrimary ( JoinClause )* +TablePrimary ::= Identifier ( ( 'AS' )? Identifier )? +JoinClause ::= JoinType? 'JOIN' TablePrimary 'ON' Condition +JoinType ::= 'INNER' | 'LEFT' 'OUTER'? | 'RIGHT' 'OUTER'? | 'FULL' 'OUTER'? | 'CROSS' + +WhereClause ::= 'WHERE' Condition +Condition ::= OrCondition +OrCondition ::= AndCondition ( 'OR' AndCondition )* +AndCondition ::= NotCondition ( 'AND' NotCondition )* +NotCondition ::= ConditionPrimary | 'NOT' NotCondition +ConditionPrimary ::= Predicate | '(' Condition ')' +Predicate ::= ComparisonPredicate + | LikePredicate + | NullPredicate + | InPredicate + | BetweenPredicate +ComparisonPredicate ::= Expression ComparisonOperator Expression +LikePredicate ::= Expression 'NOT'? 'LIKE' Expression +NullPredicate ::= Expression 'IS' 'NOT'? 'NULL' +InPredicate ::= Expression 'NOT'? 'IN' '(' Expression ( ',' Expression )* ')' +BetweenPredicate ::= Expression 'NOT'? 'BETWEEN' Expression 'AND' Expression +ComparisonOperator ::= '=' | '!=' | '<>' | '<' | '>' | '<=' | '>=' + +GroupByClause ::= 'GROUP' 'BY' QualifiedIdentifier ( ',' QualifiedIdentifier )* +HavingClause ::= 'HAVING' Condition +OrderByClause ::= 'ORDER' 'BY' OrderByItem ( ',' OrderByItem )* +OrderByItem ::= Expression ( 'ASC' | 'DESC' )? +LimitClause ::= 'LIMIT' IntegerLiteral ( 'OFFSET' IntegerLiteral )? + +Expression ::= Term ( ( '+' | '-' ) Term )* +Term ::= Factor ( ( '*' | '/' | '%' ) Factor )* +Factor ::= Literal + | QualifiedIdentifier + | FunctionCall + | '(' Expression ')' + | ( '+' | '-' ) Factor + +FunctionCall ::= Identifier '(' FunctionArgs? ')' +FunctionArgs ::= '*' | ( 'DISTINCT' )? Expression ( ',' Expression )* + +QualifiedIdentifier ::= Identifier ( '.' Identifier )? 
+ +DataType ::= 'INT' + | 'BIGINT' + | 'VARCHAR' '(' IntegerLiteral ')' + | 'BOOLEAN' + | 'TEXT' + | 'TIMESTAMP' + +Identifier ::= Letter ( Letter | Digit | '_' )* + +Literal ::= NumericLiteral | StringLiteral | BooleanLiteral | NullLiteral +NullLiteral ::= 'NULL' +BooleanLiteral ::= 'TRUE' | 'FALSE' +NumericLiteral ::= IntegerLiteral | FloatLiteral +IntegerLiteral ::= Digit+ +FloatLiteral ::= Digit+ '.' Digit+ | Digit+ '.' | '.' Digit+ +StringLiteral ::= "'" StringChar* "'" +StringChar ::= NonQuoteCharacter | "''" +NonQuoteCharacter ::= (* any character except single-quote *) + +Letter ::= LowercaseLetter | UppercaseLetter +LowercaseLetter ::= 'a' | 'b' | 'c' | 'd' | 'e' | 'f' | 'g' | 'h' | 'i' | 'j' | 'k' | 'l' | 'm' + | 'n' | 'o' | 'p' | 'q' | 'r' | 's' | 't' | 'u' | 'v' | 'w' | 'x' | 'y' | 'z' +UppercaseLetter ::= 'A' | 'B' | 'C' | 'D' | 'E' | 'F' | 'G' | 'H' | 'I' | 'J' | 'K' | 'L' | 'M' + | 'N' | 'O' | 'P' | 'Q' | 'R' | 'S' | 'T' | 'U' | 'V' | 'W' | 'X' | 'Y' | 'Z' +Digit ::= '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' + +``` + +### Notes + +- **Program**: The top-level rule. A program is one or more statements, each terminated by `;` (including the last one), which allows scripts containing multiple SQL statements. +- **IF EXISTS / IF NOT EXISTS**: `CREATE DATABASE` and `CREATE TABLE` accept an optional `IF NOT EXISTS` clause to suppress errors when the target already exists. `DROP DATABASE` and `DROP TABLE` accept an optional `IF EXISTS` clause to suppress errors when the target does not exist. +- **Identifier / QualifiedIdentifier**: `Identifier` governs database, table, and column names. An identifier must begin with a letter and may then contain letters, digits, and underscores. `QualifiedIdentifier` extends this to support dot-separated `table.column` references such as `users.id` or `orders.total`. Qualified identifiers are used in `Factor`, `SetItem`, and `GROUP BY`. +- **Literal**: Denotes a fixed data value: a numeric, string, boolean, or `NULL` literal. 
+- **NullLiteral / BooleanLiteral**: Captures the missing-data marker (`NULL`) and the SQL boolean values (`TRUE`/`FALSE`). +- **NumericLiteral / IntegerLiteral / FloatLiteral**: Governs integer and fractional numeric values. `FloatLiteral` accepts all three forms SQL allows: standard (`3.14`), leading-dot (`.14`), and trailing-dot (`10.`). Only `IntegerLiteral` is accepted by `LIMIT`, `OFFSET`, and `VARCHAR`. +- **StringLiteral / NonQuoteCharacter**: Describes single-quoted text values. An empty string `''` is valid. To embed a literal single quote inside a string, double it: `'it''s'` represents `it's`. Per the SQL standard, `NonQuoteCharacter` is any character from the source character set except the single-quote delimiter. In the grammar this is expressed via `StringChar ::= NonQuoteCharacter | "''"`, where `''` is treated as a single escaped-quote unit by the lexer using a greedy longest-match rule. +- **SignedLiteral**: Supports both unary `+` and `-` for numeric literals in `DEFAULT` values: `DEFAULT -1`, `DEFAULT +5`. +- **SelectStatement**: Supports an optional `DISTINCT` or `ALL` quantifier after `SELECT`. In the BNF form, the quantifier is absorbed directly into the four alternatives of the select-statement rule rather than expressed via a nullable rule. The optional clause tail is expressed through four non-nullable helper rules, each enumerating only the valid non-empty suffixes that may follow a given clause. Together they cover all 23 valid non-empty clause combinations while enforcing canonical ordering (`WHERE → GROUP BY → HAVING → ORDER BY → LIMIT`). `HAVING` is only reachable through the helper rule for the suffix that follows `GROUP BY`, so `GROUP BY` before `HAVING` is structurally guaranteed. No nullable rules are used anywhere in the BNF. +- **SelectList / SelectColumn / SelectExpression**: Each item in a select list is independently a `SelectColumn`, which can be a bare `*` or any `SelectExpression` with an optional `AS` alias. A `SelectExpression` may be an arithmetic `Expression` or a boolean `Condition`. 
+- **TableReference / JoinClause**: A `TableReference` is a `TablePrimary` (an identifier with an optional alias) followed by zero or more `JoinClause`s. Supported join types are: `INNER`, `LEFT [OUTER]`, `RIGHT [OUTER]`, `FULL [OUTER]`, and `CROSS`; the join type may also be omitted, giving a bare `JOIN`. All non-cross joins require an `ON` condition. Note that the EBNF `JoinClause` rule as written makes `ON` mandatory for every join, so `CROSS JOIN` currently requires an `ON` condition as well. +- **Predicate**: The grammar supports five predicate types: `ComparisonPredicate` (`=`, `!=`, `<>`, `<`, `>`, `<=`, `>=`), `LikePredicate` (`LIKE` / `NOT LIKE`), `NullPredicate` (`IS NULL` / `IS NOT NULL`), `InPredicate` (`IN` / `NOT IN`), and `BetweenPredicate` (`BETWEEN ... AND ...` / `NOT BETWEEN ... AND ...`). +- **GROUP BY / HAVING**: `GROUP BY` accepts a comma-separated list of qualified identifiers. `HAVING` filters groups using a condition and may only appear after `GROUP BY`. This restriction is enforced structurally by the BNF; the simplified EBNF `SelectStatement` rule lists `GroupByClause?` and `HavingClause?` independently and does not express it. +- **ORDER BY**: Accepts a comma-separated list of order items. Each item is an expression with an optional `ASC` (ascending, default) or `DESC` (descending) direction. +- **LIMIT / OFFSET**: `LIMIT` restricts the result set size. An optional `OFFSET` clause skips a specified number of rows before returning results. Both accept only `IntegerLiteral` values. +- **FunctionCall**: Supports general function call syntax: `identifier(args)`. Function arguments can be omitted entirely (`identifier()`), be a bare `*` (for `COUNT(*)`), or be one or more expressions optionally preceded by `DISTINCT` (for `COUNT(DISTINCT col)`). This covers all standard aggregate functions (`COUNT`, `SUM`, `AVG`, `MIN`, `MAX`) and any future scalar functions. +- **ColumnConstraints**: Supports four constraint types — key, null, default, and foreign — each of which may appear at most once per column. Constraints must be written in canonical order: `KeyConstraint` → `NullConstraint` → `DefaultConstraint` → `ForeignConstraint`. The grammar encodes all 15 valid non-empty subsets of these four types in that fixed order. 
**Parser note**: as written, the grammar structure alone rules out both misordered and duplicated constraints, so no extra check is needed. If the grammar is ever relaxed to accept constraints more permissively (for example, in any order), the parser must then verify at semantic analysis time that no constraint type is duplicated. +- **NullConstraint**: Accepts both `NOT NULL` and explicit `NULL`. While columns are nullable by default, explicitly stating `NULL` is valid SQL and commonly used in schema definitions. +- **ForeignConstraint**: Column-level referential constraint. Syntax: `REFERENCES table_name (column_name)`, pointing to exactly one column in the referenced table. +- **Letter / Digit**: Fundamental character classes. `Letter` and `Digit` build identifiers; `Digit` also builds numeric literals.