From 68d6e242ee25fd747bb0064a6317ff0018ae43ea Mon Sep 17 00:00:00 2001
From: Max Brunsfeld <maxbrunsfeld@gmail.com>
Date: Thu, 11 Sep 2014 13:10:23 -0700
Subject: [PATCH] Fix parsing of wildcard patterns at the ends of documents

- Remove the special EOF handling from the lexer's ADVANCE macro.
- Explicitly exclude the EOF character (0) from all-inclusive character sets.
---
 include/tree_sitter/parser.h         |  5 +-
 spec/fixtures/grammars/arithmetic.cc |  2 +-
 spec/fixtures/parsers/arithmetic.c   |  3 +-
 spec/fixtures/parsers/golang.c       | 12 +++--
 spec/fixtures/parsers/javascript.c   | 78 ++++++++++++++++++----------
 spec/fixtures/parsers/json.c         |  9 ++--
 spec/runtime/parser_spec.cc          | 25 +++++++++
 src/compiler/rules/character_set.cc  |  3 ++
 8 files changed, 98 insertions(+), 39 deletions(-)

diff --git a/include/tree_sitter/parser.h b/include/tree_sitter/parser.h
index 3060df93..e4da8b40 100644
--- a/include/tree_sitter/parser.h
+++ b/include/tree_sitter/parser.h
@@ -96,10 +96,7 @@ struct TSLanguage {
 #define ADVANCE(state_index)                                \
   {                                                         \
     DEBUG_LEX("ADVANCE %d", state_index);                   \
-    if (!ts_lexer_advance(lexer)) {                         \
-      DEBUG_LEX("END");                                     \
-      return ts_lexer_accept(lexer, ts_builtin_sym_end, 0); \
-    }                                                       \
+    ts_lexer_advance(lexer);                                \
     lex_state = state_index;                                \
     goto next_state;                                        \
   }
diff --git a/spec/fixtures/grammars/arithmetic.cc b/spec/fixtures/grammars/arithmetic.cc
index e8750ee1..99c7d47b 100644
--- a/spec/fixtures/grammars/arithmetic.cc
+++ b/spec/fixtures/grammars/arithmetic.cc
@@ -25,7 +25,7 @@ extern const Grammar arithmetic = Grammar({
     { "group", in_parens(err(sym("expression"))) },
 
     { "number", pattern("\\d+") },
-    { "variable", pattern("\\a[\\w_]*") },
+    { "variable", pattern("\\a[\\w]*") },
 
     { "comment", pattern("#.*") },
 }).ubiquitous_tokens({
diff --git a/spec/fixtures/parsers/arithmetic.c b/spec/fixtures/parsers/arithmetic.c
index 0b9adfd1..76188fc7 100644
--- a/spec/fixtures/parsers/arithmetic.c
+++ b/spec/fixtures/parsers/arithmetic.c
@@ -77,7 +77,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                 ADVANCE(5);
             LEX_ERROR();
         case 2:
-            if (!(lookahead == '\n'))
+            if (!((lookahead == 0) ||
+                (lookahead == '\n')))
                 ADVANCE(2);
             ACCEPT_TOKEN(ts_sym_comment);
         case 3:
diff --git a/spec/fixtures/parsers/golang.c b/spec/fixtures/parsers/golang.c
index 14b03815..61f79e5f 100644
--- a/spec/fixtures/parsers/golang.c
+++ b/spec/fixtures/parsers/golang.c
@@ -261,7 +261,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                 ADVANCE(4);
             LEX_ERROR();
         case 4:
-            if (!(lookahead == '\n'))
+            if (!((lookahead == 0) ||
+                (lookahead == '\n')))
                 ADVANCE(4);
             ACCEPT_TOKEN(ts_sym_comment);
         case 5:
@@ -446,7 +447,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                 ADVANCE(37);
             if (lookahead == '\\')
                 ADVANCE(38);
-            if (!((lookahead == '\"') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '\"') ||
                 (lookahead == '\\')))
                 ADVANCE(36);
             LEX_ERROR();
@@ -457,7 +459,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                 ADVANCE(39);
             if (lookahead == '\\')
                 ADVANCE(38);
-            if (!((lookahead == '\"') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '\"') ||
                 (lookahead == '\\')))
                 ADVANCE(36);
             LEX_ERROR();
@@ -466,7 +469,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                 ADVANCE(37);
             if (lookahead == '\\')
                 ADVANCE(38);
-            if (!((lookahead == '\"') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '\"') ||
                 (lookahead == '\\')))
                 ADVANCE(36);
             ACCEPT_TOKEN(ts_sym_string);
diff --git a/spec/fixtures/parsers/javascript.c b/spec/fixtures/parsers/javascript.c
index 7d8990b8..4a546ac0 100644
--- a/spec/fixtures/parsers/javascript.c
+++ b/spec/fixtures/parsers/javascript.c
@@ -373,7 +373,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                 ADVANCE(6);
             if (lookahead == '\\')
                 ADVANCE(7);
-            if (!((lookahead == '\"') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '\"') ||
                 (lookahead == '\\')))
                 ADVANCE(5);
             LEX_ERROR();
@@ -384,7 +385,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                 ADVANCE(8);
             if (lookahead == '\\')
                 ADVANCE(7);
-            if (!((lookahead == '\"') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '\"') ||
                 (lookahead == '\\')))
                 ADVANCE(5);
             LEX_ERROR();
@@ -393,7 +395,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                 ADVANCE(6);
             if (lookahead == '\\')
                 ADVANCE(7);
-            if (!((lookahead == '\"') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '\"') ||
                 (lookahead == '\\')))
                 ADVANCE(5);
             ACCEPT_TOKEN(ts_sym_string);
@@ -410,7 +413,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                 ADVANCE(6);
             if (lookahead == '\\')
                 ADVANCE(11);
-            if (!((lookahead == '\'') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '\'') ||
                 (lookahead == '\\')))
                 ADVANCE(10);
             LEX_ERROR();
@@ -419,7 +423,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                 ADVANCE(12);
             if (lookahead == '\\')
                 ADVANCE(11);
-            if (!((lookahead == '\'') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '\'') ||
                 (lookahead == '\\')))
                 ADVANCE(10);
             LEX_ERROR();
@@ -428,7 +433,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                 ADVANCE(6);
             if (lookahead == '\\')
                 ADVANCE(11);
-            if (!((lookahead == '\'') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '\'') ||
                 (lookahead == '\\')))
                 ADVANCE(10);
             ACCEPT_TOKEN(ts_sym_string);
@@ -453,7 +459,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                 ADVANCE(31);
             if (lookahead == '\\')
                 ADVANCE(34);
-            if (!((lookahead == '*') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '*') ||
                 (lookahead == '/') ||
                 (lookahead == '\\')))
                 ADVANCE(38);
@@ -465,7 +472,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                 ADVANCE(25);
             if (lookahead == '\\')
                 ADVANCE(23);
-            if (!((lookahead == '*') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '*') ||
                 (lookahead == '/') ||
                 (lookahead == '\\')))
                 ADVANCE(19);
@@ -475,7 +483,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                 ADVANCE(21);
             if (lookahead == '\\')
                 ADVANCE(23);
-            if (!((lookahead == '/') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '/') ||
                 (lookahead == '\\')))
                 ADVANCE(19);
             LEX_ERROR();
@@ -492,7 +501,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                 ADVANCE(24);
             if (lookahead == '\\')
                 ADVANCE(23);
-            if (!((lookahead == '*') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '*') ||
                 (lookahead == '/') ||
                 (lookahead == '\\')))
                 ADVANCE(19);
@@ -506,7 +516,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                 ADVANCE(23);
             if (lookahead == 'g')
                 ADVANCE(30);
-            if (!((lookahead == '*') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '*') ||
                 (lookahead == '/') ||
                 (lookahead == '\\') ||
                 (lookahead == 'g')))
@@ -517,14 +528,16 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                 ADVANCE(26);
             if (lookahead == 'g')
                 ADVANCE(29);
-            if (!((lookahead == '*') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '*') ||
                 (lookahead == 'g')))
                 ADVANCE(28);
             ACCEPT_TOKEN(ts_sym_regex);
         case 26:
             if (lookahead == '/')
                 ADVANCE(27);
-            if (!(lookahead == '/'))
+            if (!((lookahead == 0) ||
+                (lookahead == '/')))
                 ADVANCE(28);
             LEX_ERROR();
         case 27:
@@ -532,13 +545,15 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
         case 28:
             if (lookahead == '*')
                 ADVANCE(26);
-            if (!(lookahead == '*'))
+            if (!((lookahead == 0) ||
+                (lookahead == '*')))
                 ADVANCE(28);
             LEX_ERROR();
         case 29:
             if (lookahead == '*')
                 ADVANCE(26);
-            if (!(lookahead == '*'))
+            if (!((lookahead == 0) ||
+                (lookahead == '*')))
                 ADVANCE(28);
             ACCEPT_TOKEN(ts_sym_regex);
         case 30:
@@ -548,7 +563,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                 ADVANCE(25);
             if (lookahead == '\\')
                 ADVANCE(23);
-            if (!((lookahead == '*') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '*') ||
                 (lookahead == '/') ||
                 (lookahead == '\\')))
                 ADVANCE(19);
@@ -556,16 +572,19 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
         case 31:
             if (lookahead == 'g')
                 ADVANCE(32);
-            if (!((lookahead == '\n') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '\n') ||
                 (lookahead == 'g')))
                 ADVANCE(33);
             ACCEPT_TOKEN(ts_sym_comment);
         case 32:
-            if (!(lookahead == '\n'))
+            if (!((lookahead == 0) ||
+                (lookahead == '\n')))
                 ADVANCE(33);
             ACCEPT_TOKEN(ts_sym_comment);
         case 33:
-            if (!(lookahead == '\n'))
+            if (!((lookahead == 0) ||
+                (lookahead == '\n')))
                 ADVANCE(33);
             ACCEPT_TOKEN(ts_sym_comment);
         case 34:
@@ -573,7 +592,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                 ADVANCE(35);
             if (lookahead == '\\')
                 ADVANCE(34);
-            if (!((lookahead == '/') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '/') ||
                 (lookahead == '\\')))
                 ADVANCE(38);
             LEX_ERROR();
@@ -584,7 +604,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                 ADVANCE(34);
             if (lookahead == 'g')
                 ADVANCE(37);
-            if (!((lookahead == '/') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '/') ||
                 (lookahead == '\\') ||
                 (lookahead == 'g')))
                 ADVANCE(38);
@@ -598,7 +619,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                 ADVANCE(36);
             if (lookahead == '\\')
                 ADVANCE(34);
-            if (!((lookahead == '/') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '/') ||
                 (lookahead == '\\')))
                 ADVANCE(38);
             ACCEPT_TOKEN(ts_sym_regex);
@@ -607,7 +629,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                 ADVANCE(36);
             if (lookahead == '\\')
                 ADVANCE(34);
-            if (!((lookahead == '/') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '/') ||
                 (lookahead == '\\')))
                 ADVANCE(38);
             LEX_ERROR();
@@ -2672,7 +2695,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                 ADVANCE(195);
             if (lookahead == '\\')
                 ADVANCE(34);
-            if (!((lookahead == '*') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '*') ||
                 (lookahead == '/') ||
                 (lookahead == '=') ||
                 (lookahead == '\\')))
@@ -2683,7 +2707,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                 ADVANCE(36);
             if (lookahead == '\\')
                 ADVANCE(34);
-            if (!((lookahead == '/') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '/') ||
                 (lookahead == '\\')))
                 ADVANCE(38);
             ACCEPT_TOKEN(ts_aux_sym_33);
@@ -3074,7 +3099,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                 ADVANCE(31);
             if (lookahead == '\\')
                 ADVANCE(34);
-            if (!((lookahead == '*') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '*') ||
                 (lookahead == '/') ||
                 (lookahead == '\\')))
                 ADVANCE(38);
diff --git a/spec/fixtures/parsers/json.c b/spec/fixtures/parsers/json.c
index 043e9957..ffa0faf3 100644
--- a/spec/fixtures/parsers/json.c
+++ b/spec/fixtures/parsers/json.c
@@ -85,7 +85,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                 ADVANCE(3);
             if (lookahead == '\\')
                 ADVANCE(4);
-            if (!((lookahead == '\"') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '\"') ||
                 (lookahead == '\\')))
                 ADVANCE(2);
             LEX_ERROR();
@@ -96,7 +97,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                 ADVANCE(5);
             if (lookahead == '\\')
                 ADVANCE(4);
-            if (!((lookahead == '\"') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '\"') ||
                 (lookahead == '\\')))
                 ADVANCE(2);
             LEX_ERROR();
@@ -105,7 +107,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
                 ADVANCE(3);
             if (lookahead == '\\')
                 ADVANCE(4);
-            if (!((lookahead == '\"') ||
+            if (!((lookahead == 0) ||
+                (lookahead == '\"') ||
                 (lookahead == '\\')))
                 ADVANCE(2);
             ACCEPT_TOKEN(ts_sym_string);
diff --git a/spec/runtime/parser_spec.cc b/spec/runtime/parser_spec.cc
index d0ccdab8..05183b6a 100644
--- a/spec/runtime/parser_spec.cc
+++ b/spec/runtime/parser_spec.cc
@@ -320,6 +320,31 @@ describe("Parser", [&]() {
       });
     });
   });
+
+  describe("lexing", [&]() {
+    before_each([&]() {
+      ts_document_set_language(doc, ts_language_arithmetic());
+    });
+
+    describe("handling tokens containing wildcard patterns (e.g. comments)", [&]() {
+      it("terminates them at the end of the document", [&]() {
+        ts_document_set_language(doc, ts_language_arithmetic());
+
+        set_text("x # this is a comment");
+
+        AssertThat(ts_node_string(root), Equals("(DOCUMENT "
+            "(expression (variable) (comment)))"));
+
+        TSNode *expression = ts_node_child(root, 0);
+        TSNode *comment = ts_node_child(expression, 1);
+
+        AssertThat(ts_node_size(comment), Equals(strlen("# this is a comment")));
+
+        ts_node_release(expression);
+        ts_node_release(comment);
+      });
+    });
+  });
 });
 
 END_TEST
diff --git a/src/compiler/rules/character_set.cc b/src/compiler/rules/character_set.cc
index bbf6cbc4..c3c044a2 100644
--- a/src/compiler/rules/character_set.cc
+++ b/src/compiler/rules/character_set.cc
@@ -87,6 +87,7 @@ size_t CharacterSet::hash_code() const {
   result ^= hash<size_t>()(included_chars.size());
   for (auto &c : included_chars)
     result ^= hash<uint32_t>()(c);
+  result <<= 1;
   result ^= hash<size_t>()(excluded_chars.size());
   for (auto &c : excluded_chars)
     result ^= hash<uint32_t>()(c);
@@ -118,6 +119,8 @@ string CharacterSet::to_string() const {
 
 CharacterSet &CharacterSet::include_all() {
   includes_all = true;
+  included_chars = {};
+  excluded_chars = { 0 };
   return *this;
 }