Merge pull request #188 from tree-sitter/token-precedence

Ensure that precedence is respected properly when used within tokens
This commit is contained in:
Max Brunsfeld 2018-07-31 12:52:14 -07:00 committed by GitHub
commit f1821bb04d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 162 additions and 5 deletions

View file

@ -0,0 +1,58 @@
==========================================
obvious tokens
==========================================
// hi
/* hi */
hi
/
"hi"
/hi/
---
(program
(comment)
(comment)
(identifier)
(slash)
(string)
(regex))
==========================================
strings starting with double slashes
==========================================
/*
The lexer matches the string content correctly even though
a comment could match all the way until the end of the line,
because the string content token has a higher precedence
than the comment token.
*/
"//one\n//two"
---
(program
(comment)
(string (escape_sequence)))
==========================================
comments that resemble regexes
==========================================
/*
The lexer matches this as a comment followed by an identifier
even though a regex token could match the entire thing, because
the comment token has a higher precedence than the regex token.
*/
/* hello */ui
---
(program
(comment)
(comment)
(identifier))

View file

@ -0,0 +1,100 @@
{
"name": "precedence_on_token",
"extras": [
{"type": "SYMBOL", "name": "comment"},
{"type": "PATTERN", "value": "\\s"}
],
"rules": {
"program": {
"type": "REPEAT",
"content": {
"type": "CHOICE",
"members": [
{
"type": "SYMBOL",
"name": "string"
},
{
"type": "SYMBOL",
"name": "regex"
},
{
"type": "SYMBOL",
"name": "identifier"
},
{
"type": "SYMBOL",
"name": "slash"
}
]
}
},
"comment": {
"type": "TOKEN",
"content": {
"type": "PREC",
"value": 1,
"content": {
"type": "PATTERN",
"value": "//.*|/\\*[^*]*\\*/"
}
}
},
"string": {
"type": "SEQ",
"members": [
{"type": "STRING", "value": "\""},
{
"type": "REPEAT",
"content": {
"type": "CHOICE",
"members": [
{
"type": "TOKEN",
"content": {
"type": "PREC",
"value": 2,
"content": {
"type": "PATTERN",
"value": "[^\"\n\\\\]+"
}
}
},
{
"type": "SYMBOL",
"name": "escape_sequence"
}
]
}
},
{"type": "STRING", "value": "\""}
]
},
"escape_sequence": {
"type": "PATTERN",
"value": "\\\\."
},
"regex": {
"type": "PATTERN",
"value": "/[^/\n]+/[a-z]*"
},
"identifier": {
"type": "PATTERN",
"value": "[a-z]\\w*"
},
"slash": {
"type": "STRING",
"value": "/"
}
}
}

View file

@ -0,0 +1 @@
This grammar shows the behavior of precedence used within a `TOKEN` rule. Tokens with higher precedence are preferred, even if they match a shorter string.

View file

@ -9,8 +9,6 @@
START_TEST
if (TREE_SITTER_SEED == -1) return;
string grammars_dir_path = join_path({"test", "fixtures", "test_grammars"});
vector<string> test_languages = list_directory(grammars_dir_path);

View file

@ -172,9 +172,9 @@ describe("Parser", [&]() {
// Spec group for input that reaches end-of-file while a token is still open:
// the expected root node in each case contains an (ERROR (UNEXPECTED EOF)) node.
// NOTE(review): this text comes from a diff hunk whose +/- markers were stripped,
// so the javascript and json cases below may be the old and new versions of the
// same check rather than two consecutive checks — confirm against the real file.
describe("when there is an unterminated error", [&]() {
it("maintains a consistent tree", [&]() {
// Case 1: a quote is opened after `a;` and never closed before the input ends.
ts_parser_set_language(parser, load_real_language("javascript"));
set_text("a; ' this string never ends");
assert_root_node("(program (expression_statement (identifier)) (ERROR (UNEXPECTED EOF)))");
// Case 2: the whole input is "nul" (presumably a truncated `null` literal — verify),
// so the expected root is nothing but the error node.
ts_parser_set_language(parser, load_real_language("json"));
set_text("nul");
assert_root_node("(ERROR (UNEXPECTED EOF))");
});
});