diff --git a/README.md b/README.md index bce6270..eaf22c3 100644 --- a/README.md +++ b/README.md @@ -14,15 +14,15 @@ This extension adds support for an EBNF-like syntax ([Extended Backus-Naur Form] - [Comments](#comments) - [Rules](#rules) - [Rule Names](#rule-names) - - [Expressions](#expressions) - - [Literals](#literals) - - [Special Cases](#special-cases) - - [Groups](#groups) - - [Ranges](#ranges) - - [Operators](#operators) - - [Concatenation](#concatenation) - - [Alternation](#alternation) - - [Exclusion](#exclusion) + - [Expressions](#expressions) + - [Literals](#literals) + - [Special Cases](#special-cases) + - [Groups](#groups) + - [Ranges](#ranges) + - [Concatenation](#concatenation) + - [Alternation](#alternation) + - [Exclusion](#exclusion) + - ["One or more"](#one-or-more) ## Features @@ -48,6 +48,8 @@ This extension adds support for an EBNF-like syntax ([Extended Backus-Naur Form] This extension implements a simple and strict-ish version of EBNF. The syntax is defined in itself in [ebnf.ebnf](./ebnf.ebnf). +The dialect implemented mostly follows the [ISO/IEC 14977](https://www.iso.org/standard/81671.html) standard, with some extensions for clarity and convenience. + ## Comments Comments are defined using the `(*` and `*)` delimiters. @@ -60,15 +62,15 @@ Rules are defined using the assignment operator `=`. The left-hand side is the r Rule names can start with any letter, number, or an underscore. They can also contain a hyphen, but not at the beginning. Rule names are case-sensitive. -### Expressions +## Expressions Expressions are made up of _terms_ and _operators_. Terms are either literals, references to other rules (by name), special cases, groups, or ranges. Operators are used to combine terms into more complex expressions. -#### Literals +### Literals Literals are enclosed in single quotes or double quotes. They can contain any character except for the quote character used to enclose them. 
No escaping is considered, so you can't use a single quote inside a single-quoted literal, or a double quote inside a double-quoted literal. How to interpret sequences like `\n` is up to the reader. Both literals and special cases can be multiline. -#### Special Cases +### Special Cases Special cases are used to describe content that cannot be easily expressed using the other terms. They are enclosed in question marks `?`, and can have multiple lines. @@ -77,7 +79,7 @@ Special cases are used to describe content that cannot be easily expressed using ? valid UTF-8 ? ``` -#### Groups +### Groups There are three different types of groups: @@ -85,15 +87,13 @@ There are three different types of groups: - Brackets (_optional_) indicate that the content inside is optional, i.e. it can appear zero or one times. - Braces (_repetition_) indicate that the content inside can appear zero or more times. -#### Ranges +### Ranges Ranges are used to define a set a contiguous characters. They are composed of two strings joined by two dots `..`. Ranges have no specific definition of what a range "is". It should be obvious what the range should represent. For example, a range of `"A".."Z"` is probably a set of uppercase letters, while a range of `"0".."9"` is probably a set of digits. -### Operators - -#### Concatenation +### Concatenation Concatenation can be defined using the comma `,` operator between terms or by juxtaposition of terms. @@ -104,7 +104,7 @@ It does not define what whitespace is allowed between terms; it is assumed that "fn" name "()" (* probably "fn foo()" *) ``` -#### Alternation +### Alternation The alternation operator is the pipe `|`. It is used to define a set of possible choices for a term. @@ -113,7 +113,7 @@ The alternation operator is the pipe `|`. It is used to define a set of possible "A", ( "B" | "C" ) (* "AB" or "AC" *) ``` -#### Exclusion +### Exclusion The exclusion operator is the caret `-`. 
It is used to define a set of possible choices for a term, but excludes one or more of them. @@ -121,3 +121,28 @@ The exclusion operator is the caret `-`. It is used to define a set of possible letter = "A".."Z" ; not_z = letter - "Z" ; (* "A".."Y" *) ``` + +### "One or more" + +The postfix operators `+` and `-` modify the preceding term to indicate that it occurs "one or more" times. The following forms are equivalent: + +```ebnf +many-as = { "a" }+ ; (* "a", "aa", "aaa", ... but not "" *) +many-as = { "a" }- ; +many-as = { "a" } - '' ; +``` + +> [!NOTE] +> +> The `-` operator is also valid as an infix operator (see +> [Exclusion](#exclusion)). Thus, when another term follows a unary `-`, it will +> be interpreted as an exclusion instead of a concatenation. Adding a comma +> directly after a unary `-` can be used to disambiguate this case, but can +> be confusing and error-prone: +> +> ```ebnf +> ooof = { "o" }-, "f" ; +> ``` +> +> Usage of `-` as a postfix operator is therefore discouraged. Using `+`, although +> not part of ISO/IEC 14977, is recommended instead. diff --git a/assets/screenshot.png b/assets/screenshot.png index f117c8b..8c8bb64 100644 Binary files a/assets/screenshot.png and b/assets/screenshot.png differ diff --git a/ebnf.ebnf b/ebnf.ebnf index f5dab72..f39b530 100644 --- a/ebnf.ebnf +++ b/ebnf.ebnf @@ -15,7 +15,8 @@ digit = "0".."9" ; expression = name | literal | special | group | repetition | optional - | alternation | concatenation | range | except ; + | alternation | concatenation | range | except + | one-or-more ; literal = ( '"', ? any string ? - '"', '"' ) | ( "'", ? any string ? - "'", "'" ) ; special = "?", ? any string ? - "?", "?" 
; @@ -27,3 +28,4 @@ alternation = expression, "|", expression ; concatenation = expression, [","], expression ; range = expression, "..", expression ; except = expression, "-", expression ; +one-or-more = ( expression, "+" ) | ( expression, "-" ) ; diff --git a/language/ast.ts b/language/ast.ts index 538473e..be45c0d 100644 --- a/language/ast.ts +++ b/language/ast.ts @@ -79,6 +79,21 @@ export class Optional extends Expr { } } +export class UnaryExpr extends Expr { + constructor( + pos: number, + public readonly expr: Expr, + public readonly op: Token, + public readonly opPos: number + ) { + super(pos, expr.end) + } + + get children(): Expr[] { + return [this.expr] + } +} + export class BinaryExpr extends Expr { constructor( pos: number, diff --git a/language/parser.ts b/language/parser.ts index 5ef047c..99c2546 100644 --- a/language/parser.ts +++ b/language/parser.ts @@ -3,10 +3,12 @@ import { File, FileSet } from './file.js' import { Scanner } from './scanner.js' import { Token, + canStartExpression, + infixPrecedence, isLiteral, isOperator, lowestPrecedence, - precedenceOf, + postfixPrecedence, toString, } from './token.js' @@ -149,49 +151,56 @@ export class Parser { } @trace('Expr') - parseExpr(): ast.Expr { - return this.parseBinaryExpr(lowestPrecedence + 1) - } - - @trace('BinaryExpr') - parseBinaryExpr(precedence: number, x?: ast.Expr): ast.Expr { - if (!x) { - x = this.parseOperand() + parseExpr( + minPrecedence: number = lowestPrecedence + 1, + lhs?: ast.Expr + ): ast.Expr { + if (!lhs) { + lhs = this.parseOperand() } while (true) { let operator = this.tok let opPos: number | undefined - let y: ast.Expr - if (!isOperator(operator)) { - switch (operator) { - case Token.Ident: - case Token.String: - case Token.Special: - case Token.LParen: - case Token.LBrace: - case Token.LBracket: - // potential start of a new expression - // try parsing as implicit concatenation - operator = Token.Concatenate - opPos = -1 - break - default: - // fallback to regular binary 
expression - break - } + let rhs: ast.Expr + + if (canStartExpression(operator)) { + // potential start of a new expression + // try parsing as implicit concatenation + operator = Token.Concatenate + opPos = -1 } - const opPrecedence = precedenceOf(operator) - if (opPrecedence < precedence) { - return x + const postfixPrec = postfixPrecedence(operator) + if (postfixPrec >= minPrecedence) { + if (!opPos) { + // consume the operator + opPos = this.expect(operator)[0] + } + + // there is a following expression, we'll treat it as infix + if (operator == Token.Except && canStartExpression(this.tok)) { + // fall through to next case + } else { + lhs = new ast.UnaryExpr(lhs.pos, lhs, operator, opPos) + continue + } } - if (!opPos) { - ;[opPos] = this.expect(operator) + + const infixPrec = infixPrecedence(operator) + if (infixPrec >= minPrecedence) { + if (!opPos) { + // consume the operator + opPos = this.expect(operator)[0] + } + + rhs = this.parseExpr(infixPrec + 1) + lhs = new ast.BinaryExpr(lhs.pos, lhs, operator, opPos, rhs) + continue } - y = this.parseBinaryExpr(opPrecedence + 1) - x = new ast.BinaryExpr(x.pos, x, operator, opPos, y) + // nothing left to do + return lhs } } diff --git a/language/scanner.ts b/language/scanner.ts index 3bb9f9b..f5002a1 100644 --- a/language/scanner.ts +++ b/language/scanner.ts @@ -30,6 +30,7 @@ const tokenTree = tokens({ '=': Token.Assign, '|': Token.Alternate, '-': Token.Except, + '+': Token.OneOrMore, '.': tokens({ '.': Token.Range, }), diff --git a/language/token.ts b/language/token.ts index f86fe85..52ca1d7 100644 --- a/language/token.ts +++ b/language/token.ts @@ -1,11 +1,20 @@ export const enum Token { Illegal, EOF, + Comment, + expressionStart, Ident, - Comment, String, Special, + LBrace, + LParen, + LBracket, + expressionEnd, + + RBrace, + RParen, + RBracket, operatorStart, Semi, @@ -14,14 +23,8 @@ export const enum Token { Except, Range, Concatenate, + OneOrMore, operatorEnd, - - LBrace, - RBrace, - LParen, - RParen, - 
LBracket, - RBracket, } export const toString = (token: Token): string => { @@ -38,6 +41,7 @@ export const toString = (token: Token): string => { [Token.Except]: '-', [Token.Range]: '..', [Token.Concatenate]: ',', + [Token.OneOrMore]: '+', [Token.LBrace]: '{', [Token.RBrace]: '}', [Token.LParen]: '(', @@ -49,15 +53,19 @@ export const toString = (token: Token): string => { return tokenNames[token] || `token(${token})` } -export const isLiteral = (token: Token): boolean => { +export function isLiteral(token: Token): boolean { return token === Token.String } -export const isOperator = (token: Token): boolean => { +export function isOperator(token: Token): boolean { return token >= Token.operatorStart && token <= Token.operatorEnd } -export const precedenceOf = (token: Token): number => { +export function canStartExpression(token: Token): boolean { + return token >= Token.expressionStart && token <= Token.expressionEnd +} + +export function infixPrecedence(token: Token): number { switch (token) { case Token.Alternate: return 1 @@ -65,7 +73,18 @@ export const precedenceOf = (token: Token): number => { return 2 case Token.Except: return 3 + // postfix OneOrMore 4 case Token.Range: + return 5 + default: + return lowestPrecedence + } +} + +export function postfixPrecedence(token: Token): number { + switch (token) { + case Token.Except: // postfix as one or more + case Token.OneOrMore: return 4 default: return lowestPrecedence @@ -73,4 +92,3 @@ export const precedenceOf = (token: Token): number => { } export const lowestPrecedence = 0 -export const highestPrecedence = 4 diff --git a/test/ebnf-grammar.ts b/test/ebnf-grammar.ts index 3e17273..90c569a 100644 --- a/test/ebnf-grammar.ts +++ b/test/ebnf-grammar.ts @@ -69,6 +69,7 @@ export const ebnfGrammar = grammar([ ident('concatenation'), ident('range'), ident('except'), + ident('one-or-more'), ]) ), rule( @@ -143,4 +144,11 @@ export const ebnfGrammar = grammar([ ident('expression'), ]) ), + rule( + 'one-or-more', + 
alternation( + group(concatenation(ident('expression'), string('+'))), + group(concatenation(ident('expression'), string('-'))) + ) + ), ]) diff --git a/test/ebnf-no-commas.ebnf b/test/ebnf-no-commas.ebnf index b69cc4f..d5258a3 100644 --- a/test/ebnf-no-commas.ebnf +++ b/test/ebnf-no-commas.ebnf @@ -15,7 +15,8 @@ digit = "0".."9" ; expression = name | literal | special | group | repetition | optional - | alternation | concatenation | range | except ; + | alternation | concatenation | range | except + | one-or-more ; literal = ( '"', ? any string ? - '"', '"' ) | ( "'", ? any string ? - "'", "'" ) ; special = "?", ? any string ? - "?", "?" ; @@ -27,3 +28,4 @@ alternation = expression "|" expression ; concatenation = expression [ "," ] expression ; range = expression ".." expression ; except = expression "-" expression ; +one-or-more = ( expression "+" ) | ( expression "-" ) ; diff --git a/test/language.test.ts b/test/language.test.ts index d9374f6..f70ce43 100644 --- a/test/language.test.ts +++ b/test/language.test.ts @@ -4,7 +4,17 @@ import * as ast from '../language/ast' import { FileSet } from '../language/file' import { Parser } from '../language/parser' import { ebnfGrammar } from './ebnf-grammar' -import { type Matcher } from './matchers' +import { + concatenation, + except, + grammar, + ident, + oneOrMoreMinus, + oneOrMorePlus, + repetition, + rule, + type Matcher, +} from './matchers' const parseTests = [ ['ebnf.ebnf', readFileSync('ebnf.ebnf', 'utf8'), ebnfGrammar], @@ -13,6 +23,31 @@ const parseTests = [ readFileSync('test/ebnf-no-commas.ebnf', 'utf8'), ebnfGrammar, ], + [ + '', + 'foo = { bar } - baz ;', + grammar([rule('foo', except(repetition(ident('bar')), ident('baz')))]), + ], + [ + '', + 'foo = { bar }- ;', + grammar([rule('foo', oneOrMoreMinus(repetition(ident('bar'))))]), + ], + [ + '', + 'foo = { bar }-, baz ;', + grammar([ + rule( + 'foo', + concatenation(oneOrMoreMinus(repetition(ident('bar'))), ident('baz')) + ), + ]), + ], + [ + '', + 'foo = 
{bar}+ ;', + grammar([rule('foo', oneOrMorePlus(repetition(ident('bar'))))]), + ], ] as [string, string, Matcher][] function parse(input: string): ast.Grammar { diff --git a/test/matchers.ts b/test/matchers.ts index c6792e5..6a47a4f 100644 --- a/test/matchers.ts +++ b/test/matchers.ts @@ -228,3 +228,50 @@ export function except(leftMatcher: Matcher, rightMatcher: Matcher): Matcher { }) return f } + +export function unaryExpr(op: token.Token, exprMatcher: Matcher): Matcher { + const f = (expr: ast.Expr) => { + expect(expr).toBeInstanceOf(ast.UnaryExpr) + expect((expr as ast.UnaryExpr).op).toBe(op) + exprMatcher((expr as ast.UnaryExpr).expr) + } + Object.defineProperty(f, 'name', { + value: `matchUnaryExpr(op="${token.toString(op)}")`, + writable: true, + }) + Object.defineProperty(f, internal, { + value: { + type: 'unaryExpr', + op, + expr: $serialize(exprMatcher), + }, + writable: true, + }) + return f +} + +export function oneOrMorePlus(exprMatcher: Matcher): Matcher { + const f = unaryExpr(token.Token.OneOrMore, exprMatcher) + Object.defineProperty(f, 'name', { value: 'matchOneOrMore(plus)' }) + Object.defineProperty(f, internal, { + value: { + type: 'oneOrMore', + expr: $serialize(exprMatcher), + }, + writable: false, + }) + return f +} + +export function oneOrMoreMinus(exprMatcher: Matcher): Matcher { + const f = unaryExpr(token.Token.Except, exprMatcher) + Object.defineProperty(f, 'name', { value: 'matchOneOrMore(minus)' }) + Object.defineProperty(f, internal, { + value: { + type: 'oneOrMore', + expr: $serialize(exprMatcher), + }, + writable: false, + }) + return f +}