Fix parsing of wildcard patterns at the ends of documents

- Remove special EOF handling from lexer
- Explicitly exclude the EOF character from all-inclusive character sets.
This commit is contained in:
Max Brunsfeld 2014-09-11 13:10:23 -07:00
parent a2b80098b2
commit 68d6e242ee
8 changed files with 98 additions and 39 deletions

View file

@ -96,10 +96,7 @@ struct TSLanguage {
#define ADVANCE(state_index) \
{ \
DEBUG_LEX("ADVANCE %d", state_index); \
if (!ts_lexer_advance(lexer)) { \
DEBUG_LEX("END"); \
return ts_lexer_accept(lexer, ts_builtin_sym_end, 0); \
} \
ts_lexer_advance(lexer); \
lex_state = state_index; \
goto next_state; \
}

View file

@ -25,7 +25,7 @@ extern const Grammar arithmetic = Grammar({
{ "group", in_parens(err(sym("expression"))) },
{ "number", pattern("\\d+") },
{ "variable", pattern("\\a[\\w_]*") },
{ "variable", pattern("\\a[\\w]*") },
{ "comment", pattern("#.*") },
}).ubiquitous_tokens({

View file

@ -77,7 +77,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
ADVANCE(5);
LEX_ERROR();
case 2:
if (!(lookahead == '\n'))
if (!((lookahead == 0) ||
(lookahead == '\n')))
ADVANCE(2);
ACCEPT_TOKEN(ts_sym_comment);
case 3:

View file

@ -261,7 +261,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
ADVANCE(4);
LEX_ERROR();
case 4:
if (!(lookahead == '\n'))
if (!((lookahead == 0) ||
(lookahead == '\n')))
ADVANCE(4);
ACCEPT_TOKEN(ts_sym_comment);
case 5:
@ -446,7 +447,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
ADVANCE(37);
if (lookahead == '\\')
ADVANCE(38);
if (!((lookahead == '\"') ||
if (!((lookahead == 0) ||
(lookahead == '\"') ||
(lookahead == '\\')))
ADVANCE(36);
LEX_ERROR();
@ -457,7 +459,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
ADVANCE(39);
if (lookahead == '\\')
ADVANCE(38);
if (!((lookahead == '\"') ||
if (!((lookahead == 0) ||
(lookahead == '\"') ||
(lookahead == '\\')))
ADVANCE(36);
LEX_ERROR();
@ -466,7 +469,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
ADVANCE(37);
if (lookahead == '\\')
ADVANCE(38);
if (!((lookahead == '\"') ||
if (!((lookahead == 0) ||
(lookahead == '\"') ||
(lookahead == '\\')))
ADVANCE(36);
ACCEPT_TOKEN(ts_sym_string);

View file

@ -373,7 +373,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
ADVANCE(6);
if (lookahead == '\\')
ADVANCE(7);
if (!((lookahead == '\"') ||
if (!((lookahead == 0) ||
(lookahead == '\"') ||
(lookahead == '\\')))
ADVANCE(5);
LEX_ERROR();
@ -384,7 +385,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
ADVANCE(8);
if (lookahead == '\\')
ADVANCE(7);
if (!((lookahead == '\"') ||
if (!((lookahead == 0) ||
(lookahead == '\"') ||
(lookahead == '\\')))
ADVANCE(5);
LEX_ERROR();
@ -393,7 +395,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
ADVANCE(6);
if (lookahead == '\\')
ADVANCE(7);
if (!((lookahead == '\"') ||
if (!((lookahead == 0) ||
(lookahead == '\"') ||
(lookahead == '\\')))
ADVANCE(5);
ACCEPT_TOKEN(ts_sym_string);
@ -410,7 +413,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
ADVANCE(6);
if (lookahead == '\\')
ADVANCE(11);
if (!((lookahead == '\'') ||
if (!((lookahead == 0) ||
(lookahead == '\'') ||
(lookahead == '\\')))
ADVANCE(10);
LEX_ERROR();
@ -419,7 +423,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
ADVANCE(12);
if (lookahead == '\\')
ADVANCE(11);
if (!((lookahead == '\'') ||
if (!((lookahead == 0) ||
(lookahead == '\'') ||
(lookahead == '\\')))
ADVANCE(10);
LEX_ERROR();
@ -428,7 +433,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
ADVANCE(6);
if (lookahead == '\\')
ADVANCE(11);
if (!((lookahead == '\'') ||
if (!((lookahead == 0) ||
(lookahead == '\'') ||
(lookahead == '\\')))
ADVANCE(10);
ACCEPT_TOKEN(ts_sym_string);
@ -453,7 +459,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
ADVANCE(31);
if (lookahead == '\\')
ADVANCE(34);
if (!((lookahead == '*') ||
if (!((lookahead == 0) ||
(lookahead == '*') ||
(lookahead == '/') ||
(lookahead == '\\')))
ADVANCE(38);
@ -465,7 +472,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
ADVANCE(25);
if (lookahead == '\\')
ADVANCE(23);
if (!((lookahead == '*') ||
if (!((lookahead == 0) ||
(lookahead == '*') ||
(lookahead == '/') ||
(lookahead == '\\')))
ADVANCE(19);
@ -475,7 +483,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
ADVANCE(21);
if (lookahead == '\\')
ADVANCE(23);
if (!((lookahead == '/') ||
if (!((lookahead == 0) ||
(lookahead == '/') ||
(lookahead == '\\')))
ADVANCE(19);
LEX_ERROR();
@ -492,7 +501,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
ADVANCE(24);
if (lookahead == '\\')
ADVANCE(23);
if (!((lookahead == '*') ||
if (!((lookahead == 0) ||
(lookahead == '*') ||
(lookahead == '/') ||
(lookahead == '\\')))
ADVANCE(19);
@ -506,7 +516,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
ADVANCE(23);
if (lookahead == 'g')
ADVANCE(30);
if (!((lookahead == '*') ||
if (!((lookahead == 0) ||
(lookahead == '*') ||
(lookahead == '/') ||
(lookahead == '\\') ||
(lookahead == 'g')))
@ -517,14 +528,16 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
ADVANCE(26);
if (lookahead == 'g')
ADVANCE(29);
if (!((lookahead == '*') ||
if (!((lookahead == 0) ||
(lookahead == '*') ||
(lookahead == 'g')))
ADVANCE(28);
ACCEPT_TOKEN(ts_sym_regex);
case 26:
if (lookahead == '/')
ADVANCE(27);
if (!(lookahead == '/'))
if (!((lookahead == 0) ||
(lookahead == '/')))
ADVANCE(28);
LEX_ERROR();
case 27:
@ -532,13 +545,15 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
case 28:
if (lookahead == '*')
ADVANCE(26);
if (!(lookahead == '*'))
if (!((lookahead == 0) ||
(lookahead == '*')))
ADVANCE(28);
LEX_ERROR();
case 29:
if (lookahead == '*')
ADVANCE(26);
if (!(lookahead == '*'))
if (!((lookahead == 0) ||
(lookahead == '*')))
ADVANCE(28);
ACCEPT_TOKEN(ts_sym_regex);
case 30:
@ -548,7 +563,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
ADVANCE(25);
if (lookahead == '\\')
ADVANCE(23);
if (!((lookahead == '*') ||
if (!((lookahead == 0) ||
(lookahead == '*') ||
(lookahead == '/') ||
(lookahead == '\\')))
ADVANCE(19);
@ -556,16 +572,19 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
case 31:
if (lookahead == 'g')
ADVANCE(32);
if (!((lookahead == '\n') ||
if (!((lookahead == 0) ||
(lookahead == '\n') ||
(lookahead == 'g')))
ADVANCE(33);
ACCEPT_TOKEN(ts_sym_comment);
case 32:
if (!(lookahead == '\n'))
if (!((lookahead == 0) ||
(lookahead == '\n')))
ADVANCE(33);
ACCEPT_TOKEN(ts_sym_comment);
case 33:
if (!(lookahead == '\n'))
if (!((lookahead == 0) ||
(lookahead == '\n')))
ADVANCE(33);
ACCEPT_TOKEN(ts_sym_comment);
case 34:
@ -573,7 +592,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
ADVANCE(35);
if (lookahead == '\\')
ADVANCE(34);
if (!((lookahead == '/') ||
if (!((lookahead == 0) ||
(lookahead == '/') ||
(lookahead == '\\')))
ADVANCE(38);
LEX_ERROR();
@ -584,7 +604,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
ADVANCE(34);
if (lookahead == 'g')
ADVANCE(37);
if (!((lookahead == '/') ||
if (!((lookahead == 0) ||
(lookahead == '/') ||
(lookahead == '\\') ||
(lookahead == 'g')))
ADVANCE(38);
@ -598,7 +619,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
ADVANCE(36);
if (lookahead == '\\')
ADVANCE(34);
if (!((lookahead == '/') ||
if (!((lookahead == 0) ||
(lookahead == '/') ||
(lookahead == '\\')))
ADVANCE(38);
ACCEPT_TOKEN(ts_sym_regex);
@ -607,7 +629,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
ADVANCE(36);
if (lookahead == '\\')
ADVANCE(34);
if (!((lookahead == '/') ||
if (!((lookahead == 0) ||
(lookahead == '/') ||
(lookahead == '\\')))
ADVANCE(38);
LEX_ERROR();
@ -2672,7 +2695,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
ADVANCE(195);
if (lookahead == '\\')
ADVANCE(34);
if (!((lookahead == '*') ||
if (!((lookahead == 0) ||
(lookahead == '*') ||
(lookahead == '/') ||
(lookahead == '=') ||
(lookahead == '\\')))
@ -2683,7 +2707,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
ADVANCE(36);
if (lookahead == '\\')
ADVANCE(34);
if (!((lookahead == '/') ||
if (!((lookahead == 0) ||
(lookahead == '/') ||
(lookahead == '\\')))
ADVANCE(38);
ACCEPT_TOKEN(ts_aux_sym_33);
@ -3074,7 +3099,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
ADVANCE(31);
if (lookahead == '\\')
ADVANCE(34);
if (!((lookahead == '*') ||
if (!((lookahead == 0) ||
(lookahead == '*') ||
(lookahead == '/') ||
(lookahead == '\\')))
ADVANCE(38);

View file

@ -85,7 +85,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
ADVANCE(3);
if (lookahead == '\\')
ADVANCE(4);
if (!((lookahead == '\"') ||
if (!((lookahead == 0) ||
(lookahead == '\"') ||
(lookahead == '\\')))
ADVANCE(2);
LEX_ERROR();
@ -96,7 +97,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
ADVANCE(5);
if (lookahead == '\\')
ADVANCE(4);
if (!((lookahead == '\"') ||
if (!((lookahead == 0) ||
(lookahead == '\"') ||
(lookahead == '\\')))
ADVANCE(2);
LEX_ERROR();
@ -105,7 +107,8 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {
ADVANCE(3);
if (lookahead == '\\')
ADVANCE(4);
if (!((lookahead == '\"') ||
if (!((lookahead == 0) ||
(lookahead == '\"') ||
(lookahead == '\\')))
ADVANCE(2);
ACCEPT_TOKEN(ts_sym_string);

View file

@ -320,6 +320,31 @@ describe("Parser", [&]() {
});
});
});
// Specs for lexer behavior, run against the arithmetic grammar.
describe("lexing", [&]() {
  before_each([&]() {
    ts_document_set_language(doc, ts_language_arithmetic());
  });

  describe("handling tokens containing wildcard patterns (e.g. comments)", [&]() {
    it("terminates them at the end of the document", [&]() {
      // The language is already selected by before_each, so it is not
      // set again here. The text ends inside a comment with no trailing
      // newline, so the comment token must be closed by end of input.
      set_text("x # this is a comment");

      AssertThat(ts_node_string(root), Equals("(DOCUMENT "
        "(expression (variable) (comment)))"));

      TSNode *expression = ts_node_child(root, 0);
      TSNode *comment = ts_node_child(expression, 1);

      // The comment must span exactly its own text, up to the end of the
      // document and no further.
      AssertThat(ts_node_size(comment), Equals(strlen("# this is a comment")));

      ts_node_release(expression);
      ts_node_release(comment);
    });
  });
});
});
END_TEST

View file

@ -87,6 +87,7 @@ size_t CharacterSet::hash_code() const {
result ^= hash<size_t>()(included_chars.size());
for (auto &c : included_chars)
result ^= hash<uint32_t>()(c);
result <<= 1;
result ^= hash<size_t>()(excluded_chars.size());
for (auto &c : excluded_chars)
result ^= hash<uint32_t>()(c);
@ -118,6 +119,8 @@ string CharacterSet::to_string() const {
CharacterSet &CharacterSet::include_all() {
  // Character 0 is what the generated lexers compare `lookahead` against
  // at end of input. Excluding it keeps an "everything" set from matching
  // past the end of the document.
  excluded_chars = { 0 };

  // Switch to "all characters" mode and drop any explicit inclusions.
  included_chars = {};
  includes_all = true;

  return *this;
}