From 68d6e242ee25fd747bb0064a6317ff0018ae43ea Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 11 Sep 2014 13:10:23 -0700 Subject: [PATCH] Fix parsing of wildcard patterns at the ends of documents - Remove special EOF handling from lexer - Explicitly exclude the EOF character from all-inclusive character sets. --- include/tree_sitter/parser.h | 5 +- spec/fixtures/grammars/arithmetic.cc | 2 +- spec/fixtures/parsers/arithmetic.c | 3 +- spec/fixtures/parsers/golang.c | 12 +++-- spec/fixtures/parsers/javascript.c | 78 ++++++++++++++++++---------- spec/fixtures/parsers/json.c | 9 ++-- spec/runtime/parser_spec.cc | 25 +++++++++ src/compiler/rules/character_set.cc | 3 ++ 8 files changed, 98 insertions(+), 39 deletions(-) diff --git a/include/tree_sitter/parser.h b/include/tree_sitter/parser.h index 3060df93..e4da8b40 100644 --- a/include/tree_sitter/parser.h +++ b/include/tree_sitter/parser.h @@ -96,10 +96,7 @@ struct TSLanguage { #define ADVANCE(state_index) \ { \ DEBUG_LEX("ADVANCE %d", state_index); \ - if (!ts_lexer_advance(lexer)) { \ - DEBUG_LEX("END"); \ - return ts_lexer_accept(lexer, ts_builtin_sym_end, 0); \ - } \ + ts_lexer_advance(lexer); \ lex_state = state_index; \ goto next_state; \ } diff --git a/spec/fixtures/grammars/arithmetic.cc b/spec/fixtures/grammars/arithmetic.cc index e8750ee1..99c7d47b 100644 --- a/spec/fixtures/grammars/arithmetic.cc +++ b/spec/fixtures/grammars/arithmetic.cc @@ -25,7 +25,7 @@ extern const Grammar arithmetic = Grammar({ { "group", in_parens(err(sym("expression"))) }, { "number", pattern("\\d+") }, - { "variable", pattern("\\a[\\w_]*") }, + { "variable", pattern("\\a[\\w]*") }, { "comment", pattern("#.*") }, }).ubiquitous_tokens({ diff --git a/spec/fixtures/parsers/arithmetic.c b/spec/fixtures/parsers/arithmetic.c index 0b9adfd1..76188fc7 100644 --- a/spec/fixtures/parsers/arithmetic.c +++ b/spec/fixtures/parsers/arithmetic.c @@ -77,7 +77,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) { ADVANCE(5); LEX_ERROR(); case 2: - if (!(lookahead == '\n')) + if (!((lookahead == 0) || + (lookahead == '\n'))) ADVANCE(2); ACCEPT_TOKEN(ts_sym_comment); case 3: diff --git a/spec/fixtures/parsers/golang.c b/spec/fixtures/parsers/golang.c index 14b03815..61f79e5f 100644 --- a/spec/fixtures/parsers/golang.c +++ b/spec/fixtures/parsers/golang.c @@ -261,7 +261,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) { ADVANCE(4); LEX_ERROR(); case 4: - if (!(lookahead == '\n')) + if (!((lookahead == 0) || + (lookahead == '\n'))) ADVANCE(4); ACCEPT_TOKEN(ts_sym_comment); case 5: @@ -446,7 +447,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) { ADVANCE(37); if (lookahead == '\\') ADVANCE(38); - if (!((lookahead == '\"') || + if (!((lookahead == 0) || + (lookahead == '\"') || (lookahead == '\\'))) ADVANCE(36); LEX_ERROR(); @@ -457,7 +459,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) { ADVANCE(39); if (lookahead == '\\') ADVANCE(38); - if (!((lookahead == '\"') || + if (!((lookahead == 0) || + (lookahead == '\"') || (lookahead == '\\'))) ADVANCE(36); LEX_ERROR(); @@ -466,7 +469,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) { ADVANCE(37); if (lookahead == '\\') ADVANCE(38); - if (!((lookahead == '\"') || + if (!((lookahead == 0) || + (lookahead == '\"') || (lookahead == '\\'))) ADVANCE(36); ACCEPT_TOKEN(ts_sym_string); diff --git a/spec/fixtures/parsers/javascript.c b/spec/fixtures/parsers/javascript.c index 7d8990b8..4a546ac0 100644 --- a/spec/fixtures/parsers/javascript.c +++ b/spec/fixtures/parsers/javascript.c @@ -373,7 +373,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) { ADVANCE(6); if (lookahead == '\\') ADVANCE(7); - if (!((lookahead == '\"') || + if (!((lookahead == 0) || + (lookahead == '\"') || (lookahead == '\\'))) ADVANCE(5); LEX_ERROR(); @@ -384,7 +385,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) { ADVANCE(8); if (lookahead == '\\') ADVANCE(7); - if (!((lookahead == '\"') || + if (!((lookahead == 0) || + (lookahead == '\"') || (lookahead == '\\'))) ADVANCE(5); LEX_ERROR(); @@ -393,7 +395,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) { ADVANCE(6); if (lookahead == '\\') ADVANCE(7); - if (!((lookahead == '\"') || + if (!((lookahead == 0) || + (lookahead == '\"') || (lookahead == '\\'))) ADVANCE(5); ACCEPT_TOKEN(ts_sym_string); @@ -410,7 +413,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) { ADVANCE(6); if (lookahead == '\\') ADVANCE(11); - if (!((lookahead == '\'') || + if (!((lookahead == 0) || + (lookahead == '\'') || (lookahead == '\\'))) ADVANCE(10); LEX_ERROR(); @@ -419,7 +423,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) { ADVANCE(12); if (lookahead == '\\') ADVANCE(11); - if (!((lookahead == '\'') || + if (!((lookahead == 0) || + (lookahead == '\'') || (lookahead == '\\'))) ADVANCE(10); LEX_ERROR(); @@ -428,7 +433,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) { ADVANCE(6); if (lookahead == '\\') ADVANCE(11); - if (!((lookahead == '\'') || + if (!((lookahead == 0) || + (lookahead == '\'') || (lookahead == '\\'))) ADVANCE(10); ACCEPT_TOKEN(ts_sym_string); @@ -453,7 +459,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) { ADVANCE(31); if (lookahead == '\\') ADVANCE(34); - if (!((lookahead == '*') || + if (!((lookahead == 0) || + (lookahead == '*') || (lookahead == '/') || (lookahead == '\\'))) ADVANCE(38); @@ -465,7 +472,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) { ADVANCE(25); if (lookahead == '\\') ADVANCE(23); - if (!((lookahead == '*') || + if (!((lookahead == 0) || + (lookahead == '*') || (lookahead == '/') || (lookahead == '\\'))) ADVANCE(19); @@ -475,7 +483,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) { ADVANCE(21); if (lookahead == '\\') ADVANCE(23); - if (!((lookahead == '/') || + if (!((lookahead == 0) || + (lookahead == '/') || (lookahead == '\\'))) ADVANCE(19); LEX_ERROR(); @@ -492,7 +501,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) { ADVANCE(24); if (lookahead == '\\') ADVANCE(23); - if (!((lookahead == '*') || + if (!((lookahead == 0) || + (lookahead == '*') || (lookahead == '/') || (lookahead == '\\'))) ADVANCE(19); @@ -506,7 +516,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) { ADVANCE(23); if (lookahead == 'g') ADVANCE(30); - if (!((lookahead == '*') || + if (!((lookahead == 0) || + (lookahead == '*') || (lookahead == '/') || (lookahead == '\\') || (lookahead == 'g'))) @@ -517,14 +528,16 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) { ADVANCE(26); if (lookahead == 'g') ADVANCE(29); - if (!((lookahead == '*') || + if (!((lookahead == 0) || + (lookahead == '*') || (lookahead == 'g'))) ADVANCE(28); ACCEPT_TOKEN(ts_sym_regex); case 26: if (lookahead == '/') ADVANCE(27); - if (!(lookahead == '/')) + if (!((lookahead == 0) || + (lookahead == '/'))) ADVANCE(28); LEX_ERROR(); case 27: @@ -532,13 +545,15 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) { case 28: if (lookahead == '*') ADVANCE(26); - if (!(lookahead == '*')) + if (!((lookahead == 0) || + (lookahead == '*'))) ADVANCE(28); LEX_ERROR(); case 29: if (lookahead == '*') ADVANCE(26); - if (!(lookahead == '*')) + if (!((lookahead == 0) || + (lookahead == '*'))) ADVANCE(28); ACCEPT_TOKEN(ts_sym_regex); case 30: @@ -548,7 +563,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) { ADVANCE(25); if (lookahead == '\\') ADVANCE(23); - if (!((lookahead == '*') || + if (!((lookahead == 0) || + (lookahead == '*') || (lookahead == '/') || (lookahead == '\\'))) ADVANCE(19); @@ -556,16 +572,19 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) { case 31: if (lookahead == 'g') ADVANCE(32); - if (!((lookahead == '\n') || + if (!((lookahead == 0) || + (lookahead == '\n') || (lookahead == 'g'))) ADVANCE(33); ACCEPT_TOKEN(ts_sym_comment); case 32: - if (!(lookahead == '\n')) + if (!((lookahead == 0) || + (lookahead == '\n'))) ADVANCE(33); ACCEPT_TOKEN(ts_sym_comment); case 33: - if (!(lookahead == '\n')) + if (!((lookahead == 0) || + (lookahead == '\n'))) ADVANCE(33); ACCEPT_TOKEN(ts_sym_comment); case 34: @@ -573,7 +592,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) { ADVANCE(35); if (lookahead == '\\') ADVANCE(34); - if (!((lookahead == '/') || + if (!((lookahead == 0) || + (lookahead == '/') || (lookahead == '\\'))) ADVANCE(38); LEX_ERROR(); @@ -584,7 +604,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) { ADVANCE(34); if (lookahead == 'g') ADVANCE(37); - if (!((lookahead == '/') || + if (!((lookahead == 0) || + (lookahead == '/') || (lookahead == '\\') || (lookahead == 'g'))) ADVANCE(38); @@ -598,7 +619,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) { ADVANCE(36); if (lookahead == '\\') ADVANCE(34); - if (!((lookahead == '/') || + if (!((lookahead == 0) || + (lookahead == '/') || (lookahead == '\\'))) ADVANCE(38); ACCEPT_TOKEN(ts_sym_regex); @@ -607,7 +629,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) { ADVANCE(36); if (lookahead == '\\') ADVANCE(34); - if (!((lookahead == '/') || + if (!((lookahead == 0) || + (lookahead == '/') || (lookahead == '\\'))) ADVANCE(38); LEX_ERROR(); @@ -2672,7 +2695,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) { ADVANCE(195); if (lookahead == '\\') ADVANCE(34); - if (!((lookahead == '*') || + if (!((lookahead == 0) || + (lookahead == '*') || (lookahead == '/') || (lookahead == '=') || (lookahead == '\\'))) @@ -2683,7 +2707,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) { ADVANCE(36); if (lookahead == '\\') ADVANCE(34); - if (!((lookahead == '/') || + if (!((lookahead == 0) || + (lookahead == '/') || (lookahead == '\\'))) ADVANCE(38); ACCEPT_TOKEN(ts_aux_sym_33); @@ -3074,7 +3099,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) { ADVANCE(31); if (lookahead == '\\') ADVANCE(34); - if (!((lookahead == '*') || + if (!((lookahead == 0) || + (lookahead == '*') || (lookahead == '/') || (lookahead == '\\'))) ADVANCE(38); diff --git a/spec/fixtures/parsers/json.c b/spec/fixtures/parsers/json.c index 043e9957..ffa0faf3 100644 --- a/spec/fixtures/parsers/json.c +++ b/spec/fixtures/parsers/json.c @@ -85,7 +85,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) { ADVANCE(3); if (lookahead == '\\') ADVANCE(4); - if (!((lookahead == '\"') || + if (!((lookahead == 0) || + (lookahead == '\"') || (lookahead == '\\'))) ADVANCE(2); LEX_ERROR(); @@ -96,7 +97,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) { ADVANCE(5); if (lookahead == '\\') ADVANCE(4); - if (!((lookahead == '\"') || + if (!((lookahead == 0) || + (lookahead == '\"') || (lookahead == '\\'))) ADVANCE(2); LEX_ERROR(); @@ -105,7 +107,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) { ADVANCE(3); if (lookahead == '\\') ADVANCE(4); - if (!((lookahead == '\"') || + if (!((lookahead == 0) || + (lookahead == '\"') || (lookahead == '\\'))) ADVANCE(2); ACCEPT_TOKEN(ts_sym_string); diff --git a/spec/runtime/parser_spec.cc b/spec/runtime/parser_spec.cc index d0ccdab8..05183b6a 100644 --- a/spec/runtime/parser_spec.cc +++ b/spec/runtime/parser_spec.cc @@ -320,6 +320,31 @@ describe("Parser", [&]() { }); }); }); + + describe("lexing", [&]() { + before_each([&]() { + ts_document_set_language(doc, ts_language_arithmetic()); + }); + + describe("handling tokens containing wildcard patterns (e.g. comments)", [&]() { + it("terminates them at the end of the document", [&]() { + ts_document_set_language(doc, ts_language_arithmetic()); + + set_text("x # this is a comment"); + + AssertThat(ts_node_string(root), Equals("(DOCUMENT " + "(expression (variable) (comment)))")); + + TSNode *expression = ts_node_child(root, 0); + TSNode *comment = ts_node_child(expression, 1); + + AssertThat(ts_node_size(comment), Equals(strlen("# this is a comment"))); + + ts_node_release(expression); + ts_node_release(comment); + }); + }); + }); }); END_TEST diff --git a/src/compiler/rules/character_set.cc b/src/compiler/rules/character_set.cc index bbf6cbc4..c3c044a2 100644 --- a/src/compiler/rules/character_set.cc +++ b/src/compiler/rules/character_set.cc @@ -87,6 +87,7 @@ size_t CharacterSet::hash_code() const { result ^= hash()(included_chars.size()); for (auto &c : included_chars) result ^= hash()(c); + result <<= 1; result ^= hash()(excluded_chars.size()); for (auto &c : excluded_chars) result ^= hash()(c); @@ -118,6 +119,8 @@ string CharacterSet::to_string() const { CharacterSet &CharacterSet::include_all() { includes_all = true; + included_chars = {}; + excluded_chars = { 0 }; return *this; }