From 23e4596ec1d7de813fe88d05eca072d0f8249dfc Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 26 Jul 2018 17:06:09 -0700 Subject: [PATCH 1/3] Add test for handling of precedence within tokens --- .../precedence_on_token/corpus.txt | 22 +++++++ .../precedence_on_token/grammar.json | 61 +++++++++++++++++++ test/integration/test_grammars.cc | 2 - 3 files changed, 83 insertions(+), 2 deletions(-) create mode 100644 test/fixtures/test_grammars/precedence_on_token/corpus.txt create mode 100644 test/fixtures/test_grammars/precedence_on_token/grammar.json diff --git a/test/fixtures/test_grammars/precedence_on_token/corpus.txt b/test/fixtures/test_grammars/precedence_on_token/corpus.txt new file mode 100644 index 00000000..8c8fd541 --- /dev/null +++ b/test/fixtures/test_grammars/precedence_on_token/corpus.txt @@ -0,0 +1,22 @@ +========================================== +simple strings +========================================== + +"hi" + +--- + +(program (string)) + +========================================== +strings starting with double slashes +========================================== + +// comment +"//not \t a \t comment" + +--- + +(program + (comment) + (string (escape_sequence) (escape_sequence))) diff --git a/test/fixtures/test_grammars/precedence_on_token/grammar.json b/test/fixtures/test_grammars/precedence_on_token/grammar.json new file mode 100644 index 00000000..d9557add --- /dev/null +++ b/test/fixtures/test_grammars/precedence_on_token/grammar.json @@ -0,0 +1,61 @@ +{ + "name": "precedence_on_token", + + "extras": [ + {"type": "SYMBOL", "name": "comment"}, + {"type": "PATTERN", "value": "\\s"}, + ], + + "rules": { + "program": { + "type": "REPEAT", + "content": { + "type": "SYMBOL", + "name": "string" + } + }, + + "comment": { + "type": "PATTERN", + "value": "//.*" + }, + + "string": { + "type": "SEQ", + "members": [ + {"type": "STRING", "value": "\""}, + + { + "type": "REPEAT", + "content": { + "type": "CHOICE", + "members": [ + { + "type": "TOKEN", + "content": { + "type": "PREC", + "value": 1, + "content": { + "type": "PATTERN", + "value": "[^\"\n\\\\]+" + } + } + }, + { + "type": "SYMBOL", + "name": "escape_sequence" + } + ] + } + }, + + {"type": "STRING", "value": "\""} + ] + }, + + "escape_sequence": { + "type": "PATTERN", + "value": "\\\\." + } + } +} diff --git a/test/integration/test_grammars.cc b/test/integration/test_grammars.cc index 3741a3c9..d10523ae 100644 --- a/test/integration/test_grammars.cc +++ b/test/integration/test_grammars.cc @@ -9,8 +9,6 @@ START_TEST -if (TREE_SITTER_SEED == -1) return; - string grammars_dir_path = join_path({"test", "fixtures", "test_grammars"}); vector test_languages = list_directory(grammars_dir_path); From 6ebb9195b11e118e738e5529b4b894004a42d501 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 31 Jul 2018 10:18:49 -0700 Subject: [PATCH 2/3] Flesh out integration test for precedence within tokens --- .../precedence_on_token/corpus.txt | 62 +++++++++++++++---- .../precedence_on_token/grammar.json | 49 +++++++++++++-- .../precedence_on_token/readme.md | 1 + 3 files changed, 94 insertions(+), 18 deletions(-) create mode 100644 test/fixtures/test_grammars/precedence_on_token/readme.md diff --git a/test/fixtures/test_grammars/precedence_on_token/corpus.txt b/test/fixtures/test_grammars/precedence_on_token/corpus.txt index 8c8fd541..a37440bb 100644 --- a/test/fixtures/test_grammars/precedence_on_token/corpus.txt +++ b/test/fixtures/test_grammars/precedence_on_token/corpus.txt @@ -1,22 +1,58 @@ ========================================== -simple strings +obvious tokens ========================================== +// hi +/* hi */ +hi +/ "hi" - ---- - -(program (string)) - -========================================== -strings starting with double slashes -========================================== - -// comment -"//not \t a \t comment" +/hi/ --- (program (comment) - (string (escape_sequence) (escape_sequence))) + (comment) + (identifier) + (slash) + (string) + (regex)) + +========================================== +strings starting with double slashes +========================================== + +/* +The lexer matches the string content correctly even though +a comment could match all the way until the end of the line, +because the string content token has a higher precedence +than the comment token. +*/ + +"//one\n//two" + +--- + +(program + (comment) + (string (escape_sequence))) + +========================================== +comments that resemble regexes +========================================== + +/* +The lexer matches this as a comment followed by an identifier +even though a regex token could match the entire thing, because +the comment token has a higher precedence than the regex token +*/ + +/* hello */ui + +--- + +(program + (comment) + (comment) + (identifier)) diff --git a/test/fixtures/test_grammars/precedence_on_token/grammar.json b/test/fixtures/test_grammars/precedence_on_token/grammar.json index d9557add..1b1ef7ea 100644 --- a/test/fixtures/test_grammars/precedence_on_token/grammar.json +++ b/test/fixtures/test_grammars/precedence_on_token/grammar.json @@ -10,14 +10,38 @@ "program": { "type": "REPEAT", "content": { - "type": "SYMBOL", - "name": "string" + "type": "CHOICE", + "members": [ + { + "type": "SYMBOL", + "name": "string" + }, + { + "type": "SYMBOL", + "name": "regex" + }, + { + "type": "SYMBOL", + "name": "identifier" + }, + { + "type": "SYMBOL", + "name": "slash" + } + ] } }, "comment": { - "type": "PATTERN", - "value": "//.*" + "type": "TOKEN", + "content": { + "type": "PREC", + "value": 1, + "content": { + "type": "PATTERN", + "value": "//.*|/\\*[^*]*\\*/" + } + } }, "string": { @@ -34,7 +58,7 @@ "type": "TOKEN", "content": { "type": "PREC", - "value": 1, + "value": 2, "content": { "type": "PATTERN", "value": "[^\"\n\\\\]+" @@ -56,6 +80,21 @@ "escape_sequence": { "type": "PATTERN", "value": "\\\\." + }, + + "regex": { + "type": "PATTERN", + "value": "/[^/\n]+/[a-z]*" + }, + + "identifier": { + "type": "PATTERN", + "value": "[a-z]\\w*" + }, + + "slash": { + "type": "STRING", + "value": "/" } } } diff --git a/test/fixtures/test_grammars/precedence_on_token/readme.md b/test/fixtures/test_grammars/precedence_on_token/readme.md new file mode 100644 index 00000000..354d70e7 --- /dev/null +++ b/test/fixtures/test_grammars/precedence_on_token/readme.md @@ -0,0 +1 @@ +This grammar shows the behavior of precedence used within a `TOKEN` rule. Tokens with higher precedence are preferred, even if they match a shorter string. From 714fda917a00a16e50b17a85074114aace745070 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 31 Jul 2018 11:50:09 -0700 Subject: [PATCH 3/3] Update test now that JS strings are parsed differently --- test/runtime/parser_test.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/runtime/parser_test.cc b/test/runtime/parser_test.cc index 5bd3fb2d..9a354252 100644 --- a/test/runtime/parser_test.cc +++ b/test/runtime/parser_test.cc @@ -172,9 +172,9 @@ describe("Parser", [&]() { describe("when there is an unterminated error", [&]() { it("maintains a consistent tree", [&]() { - ts_parser_set_language(parser, load_real_language("javascript")); - set_text("a; ' this string never ends"); - assert_root_node("(program (expression_statement (identifier)) (ERROR (UNEXPECTED EOF)))"); + ts_parser_set_language(parser, load_real_language("json")); + set_text("nul"); + assert_root_node("(ERROR (UNEXPECTED EOF))"); }); });