Fix parsing of wildcard patterns at the ends of documents

- Remove special EOF handling from lexer
- Explicitly exclude the EOF character from all-inclusive character sets.
2014-09-11 13:10:23 -07:00 · 2014-09-11 13:10:23 -07:00 · 68d6e242ee
commit 68d6e242ee
parent a2b80098b2
8 changed files with 98 additions and 39 deletions
--- a/include/tree_sitter/parser.h
+++ b/include/tree_sitter/parser.h
@ -96,10 +96,7 @@ struct TSLanguage {
 #define ADVANCE(state_index)                                \
  {                                                         \
    DEBUG_LEX("ADVANCE %d", state_index);                   \
-    if (!ts_lexer_advance(lexer)) {                         \
-      DEBUG_LEX("END");                                     \
-      return ts_lexer_accept(lexer, ts_builtin_sym_end, 0); \
-    }                                                       \
+    ts_lexer_advance(lexer);                                \
    lex_state = state_index;                                \
    goto next_state;                                        \
  }
--- a/spec/fixtures/grammars/arithmetic.cc
+++ b/spec/fixtures/grammars/arithmetic.cc
@ -25,7 +25,7 @@ extern const Grammar arithmetic = Grammar({
    { "group", in_parens(err(sym("expression"))) },

    { "number", pattern("\\d+") },
-    { "variable", pattern("\\a[\\w_]*") },
+    { "variable", pattern("\\a[\\w]*") },

    { "comment", pattern("#.*") },
 }).ubiquitous_tokens({
--- a/spec/fixtures/parsers/arithmetic.c
+++ b/spec/fixtures/parsers/arithmetic.c
@ -77,7 +77,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                ADVANCE(5);
            LEX_ERROR();
        case 2:
-            if (!(lookahead == '\n'))
+            if (!((lookahead == 0) ||
+                (lookahead == '\n')))
                ADVANCE(2);
            ACCEPT_TOKEN(ts_sym_comment);
        case 3:
--- a/spec/fixtures/parsers/golang.c
+++ b/spec/fixtures/parsers/golang.c
@ -261,7 +261,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                ADVANCE(4);
            LEX_ERROR();
        case 4:
-            if (!(lookahead == '\n'))
+            if (!((lookahead == 0) ||
+                (lookahead == '\n')))
                ADVANCE(4);
            ACCEPT_TOKEN(ts_sym_comment);
        case 5:
@ -446,7 +447,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                ADVANCE(37);
            if (lookahead == '\\')
                ADVANCE(38);
-            if (!((lookahead == '\"') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '\"') ||
                (lookahead == '\\')))
                ADVANCE(36);
            LEX_ERROR();
@ -457,7 +459,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                ADVANCE(39);
            if (lookahead == '\\')
                ADVANCE(38);
-            if (!((lookahead == '\"') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '\"') ||
                (lookahead == '\\')))
                ADVANCE(36);
            LEX_ERROR();
@ -466,7 +469,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                ADVANCE(37);
            if (lookahead == '\\')
                ADVANCE(38);
-            if (!((lookahead == '\"') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '\"') ||
                (lookahead == '\\')))
                ADVANCE(36);
            ACCEPT_TOKEN(ts_sym_string);
--- a/spec/fixtures/parsers/javascript.c
+++ b/spec/fixtures/parsers/javascript.c
@ -373,7 +373,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                ADVANCE(6);
            if (lookahead == '\\')
                ADVANCE(7);
-            if (!((lookahead == '\"') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '\"') ||
                (lookahead == '\\')))
                ADVANCE(5);
            LEX_ERROR();
@ -384,7 +385,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                ADVANCE(8);
            if (lookahead == '\\')
                ADVANCE(7);
-            if (!((lookahead == '\"') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '\"') ||
                (lookahead == '\\')))
                ADVANCE(5);
            LEX_ERROR();
@ -393,7 +395,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                ADVANCE(6);
            if (lookahead == '\\')
                ADVANCE(7);
-            if (!((lookahead == '\"') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '\"') ||
                (lookahead == '\\')))
                ADVANCE(5);
            ACCEPT_TOKEN(ts_sym_string);
@ -410,7 +413,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                ADVANCE(6);
            if (lookahead == '\\')
                ADVANCE(11);
-            if (!((lookahead == '\'') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '\'') ||
                (lookahead == '\\')))
                ADVANCE(10);
            LEX_ERROR();
@ -419,7 +423,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                ADVANCE(12);
            if (lookahead == '\\')
                ADVANCE(11);
-            if (!((lookahead == '\'') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '\'') ||
                (lookahead == '\\')))
                ADVANCE(10);
            LEX_ERROR();
@ -428,7 +433,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                ADVANCE(6);
            if (lookahead == '\\')
                ADVANCE(11);
-            if (!((lookahead == '\'') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '\'') ||
                (lookahead == '\\')))
                ADVANCE(10);
            ACCEPT_TOKEN(ts_sym_string);
@ -453,7 +459,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                ADVANCE(31);
            if (lookahead == '\\')
                ADVANCE(34);
-            if (!((lookahead == '*') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '*') ||
                (lookahead == '/') ||
                (lookahead == '\\')))
                ADVANCE(38);
@ -465,7 +472,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                ADVANCE(25);
            if (lookahead == '\\')
                ADVANCE(23);
-            if (!((lookahead == '*') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '*') ||
                (lookahead == '/') ||
                (lookahead == '\\')))
                ADVANCE(19);
@ -475,7 +483,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                ADVANCE(21);
            if (lookahead == '\\')
                ADVANCE(23);
-            if (!((lookahead == '/') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '/') ||
                (lookahead == '\\')))
                ADVANCE(19);
            LEX_ERROR();
@ -492,7 +501,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                ADVANCE(24);
            if (lookahead == '\\')
                ADVANCE(23);
-            if (!((lookahead == '*') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '*') ||
                (lookahead == '/') ||
                (lookahead == '\\')))
                ADVANCE(19);
@ -506,7 +516,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                ADVANCE(23);
            if (lookahead == 'g')
                ADVANCE(30);
-            if (!((lookahead == '*') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '*') ||
                (lookahead == '/') ||
                (lookahead == '\\') ||
                (lookahead == 'g')))
@ -517,14 +528,16 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                ADVANCE(26);
            if (lookahead == 'g')
                ADVANCE(29);
-            if (!((lookahead == '*') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '*') ||
                (lookahead == 'g')))
                ADVANCE(28);
            ACCEPT_TOKEN(ts_sym_regex);
        case 26:
            if (lookahead == '/')
                ADVANCE(27);
-            if (!(lookahead == '/'))
+            if (!((lookahead == 0) ||
+                (lookahead == '/')))
                ADVANCE(28);
            LEX_ERROR();
        case 27:
@ -532,13 +545,15 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
        case 28:
            if (lookahead == '*')
                ADVANCE(26);
-            if (!(lookahead == '*'))
+            if (!((lookahead == 0) ||
+                (lookahead == '*')))
                ADVANCE(28);
            LEX_ERROR();
        case 29:
            if (lookahead == '*')
                ADVANCE(26);
-            if (!(lookahead == '*'))
+            if (!((lookahead == 0) ||
+                (lookahead == '*')))
                ADVANCE(28);
            ACCEPT_TOKEN(ts_sym_regex);
        case 30:
@ -548,7 +563,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                ADVANCE(25);
            if (lookahead == '\\')
                ADVANCE(23);
-            if (!((lookahead == '*') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '*') ||
                (lookahead == '/') ||
                (lookahead == '\\')))
                ADVANCE(19);
@ -556,16 +572,19 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
        case 31:
            if (lookahead == 'g')
                ADVANCE(32);
-            if (!((lookahead == '\n') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '\n') ||
                (lookahead == 'g')))
                ADVANCE(33);
            ACCEPT_TOKEN(ts_sym_comment);
        case 32:
-            if (!(lookahead == '\n'))
+            if (!((lookahead == 0) ||
+                (lookahead == '\n')))
                ADVANCE(33);
            ACCEPT_TOKEN(ts_sym_comment);
        case 33:
-            if (!(lookahead == '\n'))
+            if (!((lookahead == 0) ||
+                (lookahead == '\n')))
                ADVANCE(33);
            ACCEPT_TOKEN(ts_sym_comment);
        case 34:
@ -573,7 +592,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                ADVANCE(35);
            if (lookahead == '\\')
                ADVANCE(34);
-            if (!((lookahead == '/') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '/') ||
                (lookahead == '\\')))
                ADVANCE(38);
            LEX_ERROR();
@ -584,7 +604,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                ADVANCE(34);
            if (lookahead == 'g')
                ADVANCE(37);
-            if (!((lookahead == '/') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '/') ||
                (lookahead == '\\') ||
                (lookahead == 'g')))
                ADVANCE(38);
@ -598,7 +619,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                ADVANCE(36);
            if (lookahead == '\\')
                ADVANCE(34);
-            if (!((lookahead == '/') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '/') ||
                (lookahead == '\\')))
                ADVANCE(38);
            ACCEPT_TOKEN(ts_sym_regex);
@ -607,7 +629,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                ADVANCE(36);
            if (lookahead == '\\')
                ADVANCE(34);
-            if (!((lookahead == '/') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '/') ||
                (lookahead == '\\')))
                ADVANCE(38);
            LEX_ERROR();
@ -2672,7 +2695,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                ADVANCE(195);
            if (lookahead == '\\')
                ADVANCE(34);
-            if (!((lookahead == '*') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '*') ||
                (lookahead == '/') ||
                (lookahead == '=') ||
                (lookahead == '\\')))
@ -2683,7 +2707,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                ADVANCE(36);
            if (lookahead == '\\')
                ADVANCE(34);
-            if (!((lookahead == '/') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '/') ||
                (lookahead == '\\')))
                ADVANCE(38);
            ACCEPT_TOKEN(ts_aux_sym_33);
@ -3074,7 +3099,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                ADVANCE(31);
            if (lookahead == '\\')
                ADVANCE(34);
-            if (!((lookahead == '*') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '*') ||
                (lookahead == '/') ||
                (lookahead == '\\')))
                ADVANCE(38);
--- a/spec/fixtures/parsers/json.c
+++ b/spec/fixtures/parsers/json.c
@ -85,7 +85,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                ADVANCE(3);
            if (lookahead == '\\')
                ADVANCE(4);
-            if (!((lookahead == '\"') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '\"') ||
                (lookahead == '\\')))
                ADVANCE(2);
            LEX_ERROR();
@ -96,7 +97,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                ADVANCE(5);
            if (lookahead == '\\')
                ADVANCE(4);
-            if (!((lookahead == '\"') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '\"') ||
                (lookahead == '\\')))
                ADVANCE(2);
            LEX_ERROR();
@ -105,7 +107,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                ADVANCE(3);
            if (lookahead == '\\')
                ADVANCE(4);
-            if (!((lookahead == '\"') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '\"') ||
                (lookahead == '\\')))
                ADVANCE(2);
            ACCEPT_TOKEN(ts_sym_string);
--- a/spec/runtime/parser_spec.cc
+++ b/spec/runtime/parser_spec.cc
@ -320,6 +320,31 @@ describe("Parser", [&]() {
      });
    });
  });
+
+  describe("lexing", [&]() {
+    before_each([&]() {
+      ts_document_set_language(doc, ts_language_arithmetic());
+    });
+
+    describe("handling tokens containing wildcard patterns (e.g. comments)", [&]() {
+      it("terminates them at the end of the document", [&]() {
+        ts_document_set_language(doc, ts_language_arithmetic());
+
+        set_text("x # this is a comment");
+
+        AssertThat(ts_node_string(root), Equals("(DOCUMENT "
+            "(expression (variable) (comment)))"));
+
+        TSNode *expression = ts_node_child(root, 0);
+        TSNode *comment = ts_node_child(expression, 1);
+
+        AssertThat(ts_node_size(comment), Equals(strlen("# this is a comment")));
+
+        ts_node_release(expression);
+        ts_node_release(comment);
+      });
+    });
+  });
 });

 END_TEST
--- a/src/compiler/rules/character_set.cc
+++ b/src/compiler/rules/character_set.cc
@ -87,6 +87,7 @@ size_t CharacterSet::hash_code() const {
  result ^= hash<size_t>()(included_chars.size());
  for (auto &c : included_chars)
    result ^= hash<uint32_t>()(c);
+  result <<= 1;
  result ^= hash<size_t>()(excluded_chars.size());
  for (auto &c : excluded_chars)
    result ^= hash<uint32_t>()(c);
@ -118,6 +119,8 @@ string CharacterSet::to_string() const {

 CharacterSet &CharacterSet::include_all() {
  includes_all = true;
+  included_chars = {};
+  excluded_chars = { 0 };
  return *this;
 }