Fix parsing of wildcard patterns at the ends of documents
- Remove special EOF handling from the lexer.
- Explicitly exclude the EOF character from all-inclusive character sets.
This commit is contained in:
parent
a2b80098b2
commit
68d6e242ee
8 changed files with 98 additions and 39 deletions
|
|
@ -96,10 +96,7 @@ struct TSLanguage {
|
|||
#define ADVANCE(state_index) \
|
||||
{ \
|
||||
DEBUG_LEX("ADVANCE %d", state_index); \
|
||||
if (!ts_lexer_advance(lexer)) { \
|
||||
DEBUG_LEX("END"); \
|
||||
return ts_lexer_accept(lexer, ts_builtin_sym_end, 0); \
|
||||
} \
|
||||
ts_lexer_advance(lexer); \
|
||||
lex_state = state_index; \
|
||||
goto next_state; \
|
||||
}
|
||||
|
|
|
|||
2
spec/fixtures/grammars/arithmetic.cc
vendored
2
spec/fixtures/grammars/arithmetic.cc
vendored
|
|
@ -25,7 +25,7 @@ extern const Grammar arithmetic = Grammar({
|
|||
{ "group", in_parens(err(sym("expression"))) },
|
||||
|
||||
{ "number", pattern("\\d+") },
|
||||
{ "variable", pattern("\\a[\\w_]*") },
|
||||
{ "variable", pattern("\\a[\\w]*") },
|
||||
|
||||
{ "comment", pattern("#.*") },
|
||||
}).ubiquitous_tokens({
|
||||
|
|
|
|||
3
spec/fixtures/parsers/arithmetic.c
vendored
3
spec/fixtures/parsers/arithmetic.c
vendored
|
|
@ -77,7 +77,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
|
|||
ADVANCE(5);
|
||||
LEX_ERROR();
|
||||
case 2:
|
||||
if (!(lookahead == '\n'))
|
||||
if (!((lookahead == 0) ||
|
||||
(lookahead == '\n')))
|
||||
ADVANCE(2);
|
||||
ACCEPT_TOKEN(ts_sym_comment);
|
||||
case 3:
|
||||
|
|
|
|||
12
spec/fixtures/parsers/golang.c
vendored
12
spec/fixtures/parsers/golang.c
vendored
|
|
@ -261,7 +261,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
|
|||
ADVANCE(4);
|
||||
LEX_ERROR();
|
||||
case 4:
|
||||
if (!(lookahead == '\n'))
|
||||
if (!((lookahead == 0) ||
|
||||
(lookahead == '\n')))
|
||||
ADVANCE(4);
|
||||
ACCEPT_TOKEN(ts_sym_comment);
|
||||
case 5:
|
||||
|
|
@ -446,7 +447,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
|
|||
ADVANCE(37);
|
||||
if (lookahead == '\\')
|
||||
ADVANCE(38);
|
||||
if (!((lookahead == '\"') ||
|
||||
if (!((lookahead == 0) ||
|
||||
(lookahead == '\"') ||
|
||||
(lookahead == '\\')))
|
||||
ADVANCE(36);
|
||||
LEX_ERROR();
|
||||
|
|
@ -457,7 +459,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
|
|||
ADVANCE(39);
|
||||
if (lookahead == '\\')
|
||||
ADVANCE(38);
|
||||
if (!((lookahead == '\"') ||
|
||||
if (!((lookahead == 0) ||
|
||||
(lookahead == '\"') ||
|
||||
(lookahead == '\\')))
|
||||
ADVANCE(36);
|
||||
LEX_ERROR();
|
||||
|
|
@ -466,7 +469,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
|
|||
ADVANCE(37);
|
||||
if (lookahead == '\\')
|
||||
ADVANCE(38);
|
||||
if (!((lookahead == '\"') ||
|
||||
if (!((lookahead == 0) ||
|
||||
(lookahead == '\"') ||
|
||||
(lookahead == '\\')))
|
||||
ADVANCE(36);
|
||||
ACCEPT_TOKEN(ts_sym_string);
|
||||
|
|
|
|||
78
spec/fixtures/parsers/javascript.c
vendored
78
spec/fixtures/parsers/javascript.c
vendored
|
|
@ -373,7 +373,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
|
|||
ADVANCE(6);
|
||||
if (lookahead == '\\')
|
||||
ADVANCE(7);
|
||||
if (!((lookahead == '\"') ||
|
||||
if (!((lookahead == 0) ||
|
||||
(lookahead == '\"') ||
|
||||
(lookahead == '\\')))
|
||||
ADVANCE(5);
|
||||
LEX_ERROR();
|
||||
|
|
@ -384,7 +385,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
|
|||
ADVANCE(8);
|
||||
if (lookahead == '\\')
|
||||
ADVANCE(7);
|
||||
if (!((lookahead == '\"') ||
|
||||
if (!((lookahead == 0) ||
|
||||
(lookahead == '\"') ||
|
||||
(lookahead == '\\')))
|
||||
ADVANCE(5);
|
||||
LEX_ERROR();
|
||||
|
|
@ -393,7 +395,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
|
|||
ADVANCE(6);
|
||||
if (lookahead == '\\')
|
||||
ADVANCE(7);
|
||||
if (!((lookahead == '\"') ||
|
||||
if (!((lookahead == 0) ||
|
||||
(lookahead == '\"') ||
|
||||
(lookahead == '\\')))
|
||||
ADVANCE(5);
|
||||
ACCEPT_TOKEN(ts_sym_string);
|
||||
|
|
@ -410,7 +413,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
|
|||
ADVANCE(6);
|
||||
if (lookahead == '\\')
|
||||
ADVANCE(11);
|
||||
if (!((lookahead == '\'') ||
|
||||
if (!((lookahead == 0) ||
|
||||
(lookahead == '\'') ||
|
||||
(lookahead == '\\')))
|
||||
ADVANCE(10);
|
||||
LEX_ERROR();
|
||||
|
|
@ -419,7 +423,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
|
|||
ADVANCE(12);
|
||||
if (lookahead == '\\')
|
||||
ADVANCE(11);
|
||||
if (!((lookahead == '\'') ||
|
||||
if (!((lookahead == 0) ||
|
||||
(lookahead == '\'') ||
|
||||
(lookahead == '\\')))
|
||||
ADVANCE(10);
|
||||
LEX_ERROR();
|
||||
|
|
@ -428,7 +433,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
|
|||
ADVANCE(6);
|
||||
if (lookahead == '\\')
|
||||
ADVANCE(11);
|
||||
if (!((lookahead == '\'') ||
|
||||
if (!((lookahead == 0) ||
|
||||
(lookahead == '\'') ||
|
||||
(lookahead == '\\')))
|
||||
ADVANCE(10);
|
||||
ACCEPT_TOKEN(ts_sym_string);
|
||||
|
|
@ -453,7 +459,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
|
|||
ADVANCE(31);
|
||||
if (lookahead == '\\')
|
||||
ADVANCE(34);
|
||||
if (!((lookahead == '*') ||
|
||||
if (!((lookahead == 0) ||
|
||||
(lookahead == '*') ||
|
||||
(lookahead == '/') ||
|
||||
(lookahead == '\\')))
|
||||
ADVANCE(38);
|
||||
|
|
@ -465,7 +472,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
|
|||
ADVANCE(25);
|
||||
if (lookahead == '\\')
|
||||
ADVANCE(23);
|
||||
if (!((lookahead == '*') ||
|
||||
if (!((lookahead == 0) ||
|
||||
(lookahead == '*') ||
|
||||
(lookahead == '/') ||
|
||||
(lookahead == '\\')))
|
||||
ADVANCE(19);
|
||||
|
|
@ -475,7 +483,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
|
|||
ADVANCE(21);
|
||||
if (lookahead == '\\')
|
||||
ADVANCE(23);
|
||||
if (!((lookahead == '/') ||
|
||||
if (!((lookahead == 0) ||
|
||||
(lookahead == '/') ||
|
||||
(lookahead == '\\')))
|
||||
ADVANCE(19);
|
||||
LEX_ERROR();
|
||||
|
|
@ -492,7 +501,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
|
|||
ADVANCE(24);
|
||||
if (lookahead == '\\')
|
||||
ADVANCE(23);
|
||||
if (!((lookahead == '*') ||
|
||||
if (!((lookahead == 0) ||
|
||||
(lookahead == '*') ||
|
||||
(lookahead == '/') ||
|
||||
(lookahead == '\\')))
|
||||
ADVANCE(19);
|
||||
|
|
@ -506,7 +516,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
|
|||
ADVANCE(23);
|
||||
if (lookahead == 'g')
|
||||
ADVANCE(30);
|
||||
if (!((lookahead == '*') ||
|
||||
if (!((lookahead == 0) ||
|
||||
(lookahead == '*') ||
|
||||
(lookahead == '/') ||
|
||||
(lookahead == '\\') ||
|
||||
(lookahead == 'g')))
|
||||
|
|
@ -517,14 +528,16 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
|
|||
ADVANCE(26);
|
||||
if (lookahead == 'g')
|
||||
ADVANCE(29);
|
||||
if (!((lookahead == '*') ||
|
||||
if (!((lookahead == 0) ||
|
||||
(lookahead == '*') ||
|
||||
(lookahead == 'g')))
|
||||
ADVANCE(28);
|
||||
ACCEPT_TOKEN(ts_sym_regex);
|
||||
case 26:
|
||||
if (lookahead == '/')
|
||||
ADVANCE(27);
|
||||
if (!(lookahead == '/'))
|
||||
if (!((lookahead == 0) ||
|
||||
(lookahead == '/')))
|
||||
ADVANCE(28);
|
||||
LEX_ERROR();
|
||||
case 27:
|
||||
|
|
@ -532,13 +545,15 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
|
|||
case 28:
|
||||
if (lookahead == '*')
|
||||
ADVANCE(26);
|
||||
if (!(lookahead == '*'))
|
||||
if (!((lookahead == 0) ||
|
||||
(lookahead == '*')))
|
||||
ADVANCE(28);
|
||||
LEX_ERROR();
|
||||
case 29:
|
||||
if (lookahead == '*')
|
||||
ADVANCE(26);
|
||||
if (!(lookahead == '*'))
|
||||
if (!((lookahead == 0) ||
|
||||
(lookahead == '*')))
|
||||
ADVANCE(28);
|
||||
ACCEPT_TOKEN(ts_sym_regex);
|
||||
case 30:
|
||||
|
|
@ -548,7 +563,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
|
|||
ADVANCE(25);
|
||||
if (lookahead == '\\')
|
||||
ADVANCE(23);
|
||||
if (!((lookahead == '*') ||
|
||||
if (!((lookahead == 0) ||
|
||||
(lookahead == '*') ||
|
||||
(lookahead == '/') ||
|
||||
(lookahead == '\\')))
|
||||
ADVANCE(19);
|
||||
|
|
@ -556,16 +572,19 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
|
|||
case 31:
|
||||
if (lookahead == 'g')
|
||||
ADVANCE(32);
|
||||
if (!((lookahead == '\n') ||
|
||||
if (!((lookahead == 0) ||
|
||||
(lookahead == '\n') ||
|
||||
(lookahead == 'g')))
|
||||
ADVANCE(33);
|
||||
ACCEPT_TOKEN(ts_sym_comment);
|
||||
case 32:
|
||||
if (!(lookahead == '\n'))
|
||||
if (!((lookahead == 0) ||
|
||||
(lookahead == '\n')))
|
||||
ADVANCE(33);
|
||||
ACCEPT_TOKEN(ts_sym_comment);
|
||||
case 33:
|
||||
if (!(lookahead == '\n'))
|
||||
if (!((lookahead == 0) ||
|
||||
(lookahead == '\n')))
|
||||
ADVANCE(33);
|
||||
ACCEPT_TOKEN(ts_sym_comment);
|
||||
case 34:
|
||||
|
|
@ -573,7 +592,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
|
|||
ADVANCE(35);
|
||||
if (lookahead == '\\')
|
||||
ADVANCE(34);
|
||||
if (!((lookahead == '/') ||
|
||||
if (!((lookahead == 0) ||
|
||||
(lookahead == '/') ||
|
||||
(lookahead == '\\')))
|
||||
ADVANCE(38);
|
||||
LEX_ERROR();
|
||||
|
|
@ -584,7 +604,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
|
|||
ADVANCE(34);
|
||||
if (lookahead == 'g')
|
||||
ADVANCE(37);
|
||||
if (!((lookahead == '/') ||
|
||||
if (!((lookahead == 0) ||
|
||||
(lookahead == '/') ||
|
||||
(lookahead == '\\') ||
|
||||
(lookahead == 'g')))
|
||||
ADVANCE(38);
|
||||
|
|
@ -598,7 +619,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
|
|||
ADVANCE(36);
|
||||
if (lookahead == '\\')
|
||||
ADVANCE(34);
|
||||
if (!((lookahead == '/') ||
|
||||
if (!((lookahead == 0) ||
|
||||
(lookahead == '/') ||
|
||||
(lookahead == '\\')))
|
||||
ADVANCE(38);
|
||||
ACCEPT_TOKEN(ts_sym_regex);
|
||||
|
|
@ -607,7 +629,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
|
|||
ADVANCE(36);
|
||||
if (lookahead == '\\')
|
||||
ADVANCE(34);
|
||||
if (!((lookahead == '/') ||
|
||||
if (!((lookahead == 0) ||
|
||||
(lookahead == '/') ||
|
||||
(lookahead == '\\')))
|
||||
ADVANCE(38);
|
||||
LEX_ERROR();
|
||||
|
|
@ -2672,7 +2695,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
|
|||
ADVANCE(195);
|
||||
if (lookahead == '\\')
|
||||
ADVANCE(34);
|
||||
if (!((lookahead == '*') ||
|
||||
if (!((lookahead == 0) ||
|
||||
(lookahead == '*') ||
|
||||
(lookahead == '/') ||
|
||||
(lookahead == '=') ||
|
||||
(lookahead == '\\')))
|
||||
|
|
@ -2683,7 +2707,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
|
|||
ADVANCE(36);
|
||||
if (lookahead == '\\')
|
||||
ADVANCE(34);
|
||||
if (!((lookahead == '/') ||
|
||||
if (!((lookahead == 0) ||
|
||||
(lookahead == '/') ||
|
||||
(lookahead == '\\')))
|
||||
ADVANCE(38);
|
||||
ACCEPT_TOKEN(ts_aux_sym_33);
|
||||
|
|
@ -3074,7 +3099,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
|
|||
ADVANCE(31);
|
||||
if (lookahead == '\\')
|
||||
ADVANCE(34);
|
||||
if (!((lookahead == '*') ||
|
||||
if (!((lookahead == 0) ||
|
||||
(lookahead == '*') ||
|
||||
(lookahead == '/') ||
|
||||
(lookahead == '\\')))
|
||||
ADVANCE(38);
|
||||
|
|
|
|||
9
spec/fixtures/parsers/json.c
vendored
9
spec/fixtures/parsers/json.c
vendored
|
|
@ -85,7 +85,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
|
|||
ADVANCE(3);
|
||||
if (lookahead == '\\')
|
||||
ADVANCE(4);
|
||||
if (!((lookahead == '\"') ||
|
||||
if (!((lookahead == 0) ||
|
||||
(lookahead == '\"') ||
|
||||
(lookahead == '\\')))
|
||||
ADVANCE(2);
|
||||
LEX_ERROR();
|
||||
|
|
@ -96,7 +97,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
|
|||
ADVANCE(5);
|
||||
if (lookahead == '\\')
|
||||
ADVANCE(4);
|
||||
if (!((lookahead == '\"') ||
|
||||
if (!((lookahead == 0) ||
|
||||
(lookahead == '\"') ||
|
||||
(lookahead == '\\')))
|
||||
ADVANCE(2);
|
||||
LEX_ERROR();
|
||||
|
|
@ -105,7 +107,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
|
|||
ADVANCE(3);
|
||||
if (lookahead == '\\')
|
||||
ADVANCE(4);
|
||||
if (!((lookahead == '\"') ||
|
||||
if (!((lookahead == 0) ||
|
||||
(lookahead == '\"') ||
|
||||
(lookahead == '\\')))
|
||||
ADVANCE(2);
|
||||
ACCEPT_TOKEN(ts_sym_string);
|
||||
|
|
|
|||
|
|
@ -320,6 +320,31 @@ describe("Parser", [&]() {
|
|||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe("lexing", [&]() {
|
||||
before_each([&]() {
|
||||
ts_document_set_language(doc, ts_language_arithmetic());
|
||||
});
|
||||
|
||||
describe("handling tokens containing wildcard patterns (e.g. comments)", [&]() {
|
||||
it("terminates them at the end of the document", [&]() {
|
||||
ts_document_set_language(doc, ts_language_arithmetic());
|
||||
|
||||
set_text("x # this is a comment");
|
||||
|
||||
AssertThat(ts_node_string(root), Equals("(DOCUMENT "
|
||||
"(expression (variable) (comment)))"));
|
||||
|
||||
TSNode *expression = ts_node_child(root, 0);
|
||||
TSNode *comment = ts_node_child(expression, 1);
|
||||
|
||||
AssertThat(ts_node_size(comment), Equals(strlen("# this is a comment")));
|
||||
|
||||
ts_node_release(expression);
|
||||
ts_node_release(comment);
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
END_TEST
|
||||
|
|
|
|||
|
|
@ -87,6 +87,7 @@ size_t CharacterSet::hash_code() const {
|
|||
result ^= hash<size_t>()(included_chars.size());
|
||||
for (auto &c : included_chars)
|
||||
result ^= hash<uint32_t>()(c);
|
||||
result <<= 1;
|
||||
result ^= hash<size_t>()(excluded_chars.size());
|
||||
for (auto &c : excluded_chars)
|
||||
result ^= hash<uint32_t>()(c);
|
||||
|
|
@ -118,6 +119,8 @@ string CharacterSet::to_string() const {
|
|||
|
||||
CharacterSet &CharacterSet::include_all() {
|
||||
includes_all = true;
|
||||
included_chars = {};
|
||||
excluded_chars = { 0 };
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue