From 68e707eb4f3b478e50b9a85cb1e1086cd893e6c4 Mon Sep 17 00:00:00 2001 From: Amaan Qureshi Date: Mon, 6 Jan 2025 01:54:38 -0500 Subject: [PATCH] feat: support passing in a Rust regex in the grammar dsl --- cli/generate/src/dsl.js | 12 ++++++++++++ cli/npm/dsl.d.ts | 10 +++++++++- docs/src/creating-parsers/2-the-grammar-dsl.md | 15 +++++++++++++-- .../test_grammars/readme_grammar/grammar.js | 2 +- 4 files changed, 35 insertions(+), 4 deletions(-) diff --git a/cli/generate/src/dsl.js b/cli/generate/src/dsl.js index 77f0366f..dd59efa6 100644 --- a/cli/generate/src/dsl.js +++ b/cli/generate/src/dsl.js @@ -211,6 +211,11 @@ function normalize(value) { type: 'PATTERN', value: value.source }; + case RustRegex: + return { + type: 'PATTERN', + value: value.value + }; case ReferenceError: throw value default: @@ -483,6 +488,12 @@ function grammar(baseGrammar, options) { }; } +class RustRegex { + constructor(value) { + this.value = value; + } +} + function checkArguments(args, ruleCount, caller, callerName, suffix = '', argType = 'rule') { // Allow for .map() usage where additional arguments are index and the entire array. 
const isMapCall = ruleCount === 3 && typeof args[1] === 'number' && Array.isArray(args[2]); @@ -524,6 +535,7 @@ globalThis.sym = sym; globalThis.token = token; globalThis.grammar = grammar; globalThis.field = field; +globalThis.RustRegex = RustRegex; const result = await import(getEnv("TREE_SITTER_GRAMMAR_PATH")); const object = { diff --git a/cli/npm/dsl.d.ts b/cli/npm/dsl.d.ts index 3deda087..9ad40905 100644 --- a/cli/npm/dsl.d.ts +++ b/cli/npm/dsl.d.ts @@ -33,7 +33,15 @@ type Rule = | SymbolRule | TokenRule; -type RuleOrLiteral = Rule | RegExp | string; +/** A regex pattern written in Rust regex syntax; accepted wherever a RegExp literal is accepted. */ +declare class RustRegex { + /** The raw pattern string given to the constructor. */ + value: string; + + constructor(pattern: string); +} + +type RuleOrLiteral = Rule | RegExp | RustRegex | string; type GrammarSymbols = { [name in RuleName]: SymbolRule; diff --git a/docs/src/creating-parsers/2-the-grammar-dsl.md b/docs/src/creating-parsers/2-the-grammar-dsl.md index 095a425c..bc11c670 100644 --- a/docs/src/creating-parsers/2-the-grammar-dsl.md +++ b/docs/src/creating-parsers/2-the-grammar-dsl.md @@ -8,8 +8,18 @@ called `$`. The syntax `$.identifier` is how you refer to another grammar symbol or `$.UNEXPECTED` should be avoided as they have special meaning for the `tree-sitter test` command. - **String and Regex literals** — The terminal symbols in a grammar are described using JavaScript strings and regular expressions. Of course during parsing, Tree-sitter does not actually use JavaScript's regex engine to evaluate these regexes; -it generates its own regex-matching logic as part of each parser. Regex literals are just used as a convenient way of writing -regular expressions in your grammar. +it generates its own regex-matching logic based on the Rust regex syntax as part of each parser. Regex literals are just +used as a convenient way of writing regular expressions in your grammar. You can use Rust regular expressions in your grammar +DSL through the `RustRegex` class.
Simply pass your regex pattern as a string: + +```js +new RustRegex('(?i)[a-z_][a-z0-9_]*') // matches a simple identifier +``` + +Unlike JavaScript's builtin `RegExp` class, which takes a pattern and flags as separate arguments, `RustRegex` only +accepts a single pattern string. While it doesn't support separate flags, you can use inline flags within the pattern itself. +For more details about Rust's regex syntax and capabilities, check out the [Rust regex documentation][rust regex]. + - **Regex Limitations** — Only a subset of the Regex engine is actually supported. This is due to certain features like lookahead and lookaround assertions not feasible to use in an LR(1) grammar, as well as certain flags being unnecessary @@ -128,5 +138,6 @@ object that coreesponds an empty array, signifying *no* keywords are reserved. [keyword-extraction]: ./3-writing-the-grammar.md#keyword-extraction [lr-conflict]: https://en.wikipedia.org/wiki/LR_parser#Conflicts_in_the_constructed_tables [named-vs-anonymous-nodes]: ../using-parsers/2-basic-parsing.md#named-vs-anonymous-nodes +[rust regex]: https://docs.rs/regex/1.1.8/regex/#grouping-and-flags [static-node-types]: ../using-parsers/6-static-node-types.md [yacc-prec]: https://docs.oracle.com/cd/E19504-01/802-5880/6i9k05dh3/index.html diff --git a/test/fixtures/test_grammars/readme_grammar/grammar.js b/test/fixtures/test_grammars/readme_grammar/grammar.js index 9f3ce6dd..a24878df 100644 --- a/test/fixtures/test_grammars/readme_grammar/grammar.js +++ b/test/fixtures/test_grammars/readme_grammar/grammar.js @@ -31,6 +31,6 @@ module.exports = grammar({ comment: _ => /#.*/, - variable: _ => /[a-zA-Z]\w*/, + variable: _ => new RustRegex('(?i:[a-z])\\w*'), }, });