Merge pull request #188 from tree-sitter/token-precedence

Ensure that precedence is respected properly when used within tokens
2018-07-31 12:52:14 -07:00 · 2018-07-31 12:52:14 -07:00 · f1821bb04d
commit f1821bb04d
parent 9795aa16c4 714fda917a
5 changed files with 162 additions and 5 deletions
--- a/test/fixtures/test_grammars/precedence_on_token/corpus.txt
+++ b/test/fixtures/test_grammars/precedence_on_token/corpus.txt
@@ -0,0 +1,58 @@
+==========================================
+obvious tokens
+==========================================
+
+// hi
+/* hi */
+hi
+/
+"hi"
+/hi/
+
+---
+
+(program
+  (comment)
+  (comment)
+  (identifier)
+  (slash)
+  (string)
+  (regex))
+
+==========================================
+strings starting with double slashes
+==========================================
+
+/*
+The lexer matches the string content correctly even though
+a comment could match all the way until the end of the line,
+because the string content token has a higher precedence
+than the comment token.
+*/
+
+"//one\n//two"
+
+---
+
+(program
+  (comment)
+  (string (escape_sequence)))
+
+==========================================
+comments that resemble regexes
+==========================================
+
+/*
+The lexer matches this as a comment followed by an identifier
+even though a regex token could match the entire thing, because
+the comment token has a higher precedence than the regex token
+*/
+
+/* hello */ui
+
+---
+
+(program
+  (comment)
+  (comment)
+  (identifier))
--- a/test/fixtures/test_grammars/precedence_on_token/grammar.json
+++ b/test/fixtures/test_grammars/precedence_on_token/grammar.json
@@ -0,0 +1,100 @@
+{
+  "name": "precedence_on_token",
+
+  "extras": [
+    {"type": "SYMBOL", "name": "comment"},
+    {"type": "PATTERN", "value": "\\s"},
+  ],
+
+  "rules": {
+    "program": {
+      "type": "REPEAT",
+      "content": {
+        "type": "CHOICE",
+        "members": [
+          {
+            "type": "SYMBOL",
+            "name": "string"
+          },
+          {
+            "type": "SYMBOL",
+            "name": "regex"
+          },
+          {
+            "type": "SYMBOL",
+            "name": "identifier"
+          },
+          {
+            "type": "SYMBOL",
+            "name": "slash"
+          }
+        ]
+      }
+    },
+
+    "comment": {
+      "type": "TOKEN",
+      "content": {
+        "type": "PREC",
+        "value": 1,
+        "content": {
+          "type": "PATTERN",
+          "value": "//.*|/\\*[^*]*\\*/"
+        }
+      }
+    },
+
+    "string": {
+      "type": "SEQ",
+      "members": [
+        {"type": "STRING", "value": "\""},
+
+        {
+          "type": "REPEAT",
+          "content": {
+            "type": "CHOICE",
+            "members": [
+              {
+                "type": "TOKEN",
+                "content": {
+                  "type": "PREC",
+                  "value": 2,
+                  "content": {
+                    "type": "PATTERN",
+                    "value": "[^\"\n\\\\]+"
+                  }
+                }
+              },
+              {
+                "type": "SYMBOL",
+                "name": "escape_sequence"
+              }
+            ]
+          }
+        },
+
+        {"type": "STRING", "value": "\""}
+      ]
+    },
+
+    "escape_sequence": {
+      "type": "PATTERN",
+      "value": "\\\\."
+    },
+
+    "regex": {
+      "type": "PATTERN",
+      "value": "/[^/\n]+/[a-z]*"
+    },
+
+    "identifier": {
+      "type": "PATTERN",
+      "value": "[a-z]\\w*"
+    },
+
+    "slash": {
+      "type": "STRING",
+      "value": "/"
+    }
+  }
+}
--- a/test/fixtures/test_grammars/precedence_on_token/readme.md
+++ b/test/fixtures/test_grammars/precedence_on_token/readme.md
@@ -0,0 +1 @@
+This grammar shows the behavior of precedence used within a `TOKEN` rule. Tokens with higher precedence are preferred, even if they match a shorter string.
--- a/test/integration/test_grammars.cc
+++ b/test/integration/test_grammars.cc
@@ -9,8 +9,6 @@

 START_TEST

-if (TREE_SITTER_SEED == -1) return;
-
 string grammars_dir_path = join_path({"test", "fixtures", "test_grammars"});
 vector<string> test_languages = list_directory(grammars_dir_path);

--- a/test/runtime/parser_test.cc
+++ b/test/runtime/parser_test.cc
@@ -172,9 +172,9 @@ describe("Parser", [&]() {

    describe("when there is an unterminated error", [&]() {
      it("maintains a consistent tree", [&]() {
-        ts_parser_set_language(parser, load_real_language("javascript"));
-        set_text("a; ' this string never ends");
-        assert_root_node("(program (expression_statement (identifier)) (ERROR (UNEXPECTED EOF)))");
+        ts_parser_set_language(parser, load_real_language("json"));
+        set_text("nul");
+        assert_root_node("(ERROR (UNEXPECTED EOF))");
      });
    });
				`@ -0,0 +1 @@`
				This grammar shows the behavior of precedence used within a `TOKEN` rule. Tokens with higher precedence are preferred, even if they match a shorter string.