feat: support passing in a Rust regex in the grammar dsl

This commit is contained in:
Amaan Qureshi 2025-01-06 01:54:38 -05:00
parent 4170f71dbc
commit 68e707eb4f
4 changed files with 35 additions and 4 deletions

View file

@ -211,6 +211,11 @@ function normalize(value) {
type: 'PATTERN',
value: value.source
};
case RustRegex:
return {
type: 'PATTERN',
value: value.value
};
case ReferenceError:
throw value
default:
@ -483,6 +488,12 @@ function grammar(baseGrammar, options) {
};
}
/**
 * Wrapper marking a pattern string as a Rust-syntax regular expression.
 *
 * Instances carry the pattern unchanged in `value`; the constructor performs
 * no parsing or validation of its argument.
 */
class RustRegex {
  /** The pattern exactly as it was handed to the constructor. */
  value;

  /**
   * @param value - the regex pattern to wrap
   */
  constructor(value) {
    this.value = value;
  }
}
function checkArguments(args, ruleCount, caller, callerName, suffix = '', argType = 'rule') {
// Allow for .map() usage where additional arguments are index and the entire array.
const isMapCall = ruleCount === 3 && typeof args[1] === 'number' && Array.isArray(args[2]);
@ -524,6 +535,7 @@ globalThis.sym = sym;
globalThis.token = token;
globalThis.grammar = grammar;
globalThis.field = field;
globalThis.RustRegex = RustRegex;
const result = await import(getEnv("TREE_SITTER_GRAMMAR_PATH"));
const object = {

10
cli/npm/dsl.d.ts vendored
View file

@ -33,7 +33,15 @@ type Rule =
| SymbolRule<string>
| TokenRule;
type RuleOrLiteral = Rule | RegExp | string;
/**
 * A regular expression written in Rust regex syntax, usable anywhere a
 * rule or literal is accepted in the grammar DSL.
 *
 * Unlike `RegExp`, it takes a single pattern string and no separate flags
 * argument; flags are written inline in the pattern, e.g. `(?i)`.
 *
 * Declared as an ambient class: a declaration file may only describe the
 * shape of the class, so the constructor has no body here.
 */
declare class RustRegex {
  /** The pattern string passed to the constructor. */
  value: string;

  /**
   * @param pattern - the regex pattern, in Rust regex syntax
   */
  constructor(pattern: string);
}
type RuleOrLiteral = Rule | RegExp | RustRegex | string;
type GrammarSymbols<RuleName extends string> = {
[name in RuleName]: SymbolRule<name>;

View file

@ -8,8 +8,18 @@ called `$`. The syntax `$.identifier` is how you refer to another grammar symbol
or `$.UNEXPECTED` should be avoided as they have special meaning for the `tree-sitter test` command.
- **String and Regex literals** — The terminal symbols in a grammar are described using JavaScript strings and regular
expressions. Of course during parsing, Tree-sitter does not actually use JavaScript's regex engine to evaluate these regexes;
it generates its own regex-matching logic as part of each parser. Regex literals are just used as a convenient way of writing
regular expressions in your grammar.
it generates its own regex-matching logic based on the Rust regex syntax as part of each parser. Regex literals are just
used as a convenient way of writing regular expressions in your grammar. You can use Rust regular expressions in your grammar
DSL through the `RustRegex` class. Simply pass your regex pattern as a string:
```js
new RustRegex('(?i)[a-z_][a-z0-9_]*') // matches a simple identifier
```
Unlike JavaScript's built-in `RegExp` class, which takes a pattern and flags as separate arguments, `RustRegex` only
accepts a single pattern string. While it doesn't support separate flags, you can use inline flags within the pattern itself.
For more details about Rust's regex syntax and capabilities, check out the [Rust regex documentation][rust regex].
- **Regex Limitations** — Only a subset of the Regex engine is actually
supported. This is due to certain features like lookahead and lookaround assertions
not being feasible to use in an LR(1) grammar, as well as certain flags being unnecessary
@ -128,5 +138,6 @@ object that corresponds to an empty array, signifying *no* keywords are reserved.
[keyword-extraction]: ./3-writing-the-grammar.md#keyword-extraction
[lr-conflict]: https://en.wikipedia.org/wiki/LR_parser#Conflicts_in_the_constructed_tables
[named-vs-anonymous-nodes]: ../using-parsers/2-basic-parsing.md#named-vs-anonymous-nodes
[rust regex]: https://docs.rs/regex/1.1.8/regex/#grouping-and-flags
[static-node-types]: ../using-parsers/6-static-node-types.md
[yacc-prec]: https://docs.oracle.com/cd/E19504-01/802-5880/6i9k05dh3/index.html

View file

@ -31,6 +31,6 @@ module.exports = grammar({
comment: _ => /#.*/,
variable: _ => /[a-zA-Z]\w*/,
variable: _ => new RustRegex('(?i:[a-z])\\w*'),
},
});