From 4b04afac5e6c92549b3fc187403c4c5b2545bc3e Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 30 Dec 2015 09:37:40 -0800 Subject: [PATCH] Control lexer's error-mode via explicit boolean argument Previously, the lexer would operate in error-mode (ignoring any garbage input until it found a valid token) if it was invoked in the 'error' state. Now that the error state is deduped with other lexical states, the lexer might be invoked in that state even when error-mode is not intended. This adds a third argument to `ts_lex` that explicitly sets the error-mode. This bug was unlikely to occur in any real grammars, but it caused the node-tree-sitter-compiler test suite to fail for some grammars with only one rule. --- include/tree_sitter/parser.h | 37 ++--- spec/fixtures/parsers/anonymous_tokens.c | 36 ++--- spec/fixtures/parsers/arithmetic.c | 68 ++++---- spec/fixtures/parsers/c.c | 148 ++++++++--------- spec/fixtures/parsers/cpp.c | 156 +++++++++--------- spec/fixtures/parsers/golang.c | 172 ++++++++++---------- spec/fixtures/parsers/javascript.c | 194 +++++++++++------------ spec/fixtures/parsers/json.c | 68 ++++---- src/compiler/generate_code/c_code.cc | 11 +- src/runtime/parser.c | 4 +- 10 files changed, 442 insertions(+), 452 deletions(-) diff --git a/include/tree_sitter/parser.h b/include/tree_sitter/parser.h index 5ce5cd2e..e93d633e 100644 --- a/include/tree_sitter/parser.h +++ b/include/tree_sitter/parser.h @@ -87,7 +87,7 @@ struct TSLanguage { const unsigned short *parse_table; const TSParseActionEntry *parse_actions; const TSStateId *lex_states; - TSTree *(*lex_fn)(TSLexer *, TSStateId); + TSTree *(*lex_fn)(TSLexer *, TSStateId, bool); }; /* @@ -95,24 +95,23 @@ struct TSLanguage { */ #define START_LEXER() \ - const bool error_mode = (lex_state == ts_lex_state_error); \ - lexer->start_fn(lexer, lex_state); \ + lexer->start_fn(lexer, state); \ int32_t lookahead; \ next_state: \ lookahead = lexer->lookahead; #define START_TOKEN() lexer->start_token_fn(lexer); -#define GO_TO_STATE(state_index) \ +#define GO_TO_STATE(state_value) \ { \ - lex_state = state_index; \ + state = state_value; \ goto next_state; \ } -#define ADVANCE(state_index) \ +#define ADVANCE(state_value) \ { \ - lexer->advance_fn(lexer, state_index); \ - GO_TO_STATE(state_index); \ + lexer->advance_fn(lexer, state_value); \ + GO_TO_STATE(state_value); \ } #define ACCEPT_FRAGILE_TOKEN(symbol) \ @@ -123,27 +122,19 @@ struct TSLanguage { return lexer->accept_fn(lexer, symbol, ts_symbol_metadata[symbol], \ ts_symbol_names[symbol], false); -#define LEX_ERROR() \ - if (error_mode) { \ - if (lex_state == ts_lex_state_error) \ - ADVANCE(ts_lex_state_error) \ - else \ - GO_TO_STATE(ts_lex_state_error) \ - } else { \ - ACCEPT_TOKEN(ts_builtin_sym_error) \ +#define LEX_ERROR() \ + if (error_mode) { \ + if (state == ts_lex_state_error) \ + lexer->advance_fn(lexer, state); \ + GO_TO_STATE(ts_lex_state_error) \ + } else { \ + ACCEPT_TOKEN(ts_builtin_sym_error) \ } /* * Parse Table Macros */ -#define ACTIONS(...) \ - (TSParseAction[]) { \ - __VA_ARGS__, { \ - .type = 0 \ - } \ - } - enum { FRAGILE = 1, CAN_HIDE_SPLIT = 2, diff --git a/spec/fixtures/parsers/anonymous_tokens.c b/spec/fixtures/parsers/anonymous_tokens.c index 3cc44655..bc6ede0f 100644 --- a/spec/fixtures/parsers/anonymous_tokens.c +++ b/spec/fixtures/parsers/anonymous_tokens.c @@ -31,9 +31,25 @@ static const TSSymbolMetadata ts_symbol_metadata[SYMBOL_COUNT] = { [anon_sym_DQUOTEhello_DQUOTE] = {.visible = true, .named = false, .structural = true, .extra = false}, }; -static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) { +static TSTree *ts_lex(TSLexer *lexer, TSStateId state, bool error_mode) { START_LEXER(); - switch (lex_state) { + switch (state) { + case 0: + START_TOKEN(); + if (lookahead == 0) + ADVANCE(1); + if ((lookahead == '\t') || + (lookahead == ' ')) + ADVANCE(0); + if (lookahead == '\n') + ADVANCE(2); + if (lookahead == '\r') + ADVANCE(3); + if (lookahead == '\"') + ADVANCE(4); + if ('0' <= lookahead && lookahead <= '9') + ADVANCE(11); + LEX_ERROR(); case 1: ACCEPT_TOKEN(ts_builtin_sym_end); case 2: @@ -94,22 +110,6 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) { (lookahead == ' ')) ADVANCE(13); LEX_ERROR(); - case ts_lex_state_error: - START_TOKEN(); - if (lookahead == 0) - ADVANCE(1); - if ((lookahead == '\t') || - (lookahead == ' ')) - ADVANCE(0); - if (lookahead == '\n') - ADVANCE(2); - if (lookahead == '\r') - ADVANCE(3); - if (lookahead == '\"') - ADVANCE(4); - if ('0' <= lookahead && lookahead <= '9') - ADVANCE(11); - LEX_ERROR(); default: LEX_ERROR(); } diff --git a/spec/fixtures/parsers/arithmetic.c b/spec/fixtures/parsers/arithmetic.c index 02f7a1b4..a7544629 100644 --- a/spec/fixtures/parsers/arithmetic.c +++ b/spec/fixtures/parsers/arithmetic.c @@ -70,9 +70,41 @@ static const TSSymbolMetadata ts_symbol_metadata[SYMBOL_COUNT] = { [sym_comment] = {.visible = true, .named = true, .structural = false, .extra = true}, }; -static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) { +static TSTree *ts_lex(TSLexer *lexer, TSStateId state, bool error_mode) { START_LEXER(); - switch (lex_state) { + switch (state) { + case 0: + START_TOKEN(); + if (lookahead == 0) + ADVANCE(1); + if ((lookahead == '\t') || + (lookahead == '\n') || + (lookahead == '\r') || + (lookahead == ' ')) + ADVANCE(0); + if (lookahead == '#') + ADVANCE(2); + if (lookahead == '(') + ADVANCE(3); + if (lookahead == ')') + ADVANCE(4); + if (lookahead == '*') + ADVANCE(5); + if (lookahead == '+') + ADVANCE(6); + if (lookahead == '-') + ADVANCE(7); + if (lookahead == '/') + ADVANCE(8); + if ('0' <= lookahead && lookahead <= '9') + ADVANCE(9); + if (('A' <= lookahead && lookahead <= 'Z') || + ('a' <= lookahead && lookahead <= 'z') || + (945 <= lookahead && lookahead <= 969)) + ADVANCE(10); + if (lookahead == '^') + ADVANCE(11); + LEX_ERROR(); case 1: ACCEPT_TOKEN(ts_builtin_sym_end); case 2: @@ -192,38 +224,6 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) { if (lookahead == ')') ADVANCE(4); LEX_ERROR(); - case ts_lex_state_error: - START_TOKEN(); - if (lookahead == 0) - ADVANCE(1); - if ((lookahead == '\t') || - (lookahead == '\n') || - (lookahead == '\r') || - (lookahead == ' ')) - ADVANCE(0); - if (lookahead == '#') - ADVANCE(2); - if (lookahead == '(') - ADVANCE(3); - if (lookahead == ')') - ADVANCE(4); - if (lookahead == '*') - ADVANCE(5); - if (lookahead == '+') - ADVANCE(6); - if (lookahead == '-') - ADVANCE(7); - if (lookahead == '/') - ADVANCE(8); - if ('0' <= lookahead && lookahead <= '9') - ADVANCE(9); - if (('A' <= lookahead && lookahead <= 'Z') || - ('a' <= lookahead && lookahead <= 'z') || - (945 <= lookahead && lookahead <= 969)) - ADVANCE(10); - if (lookahead == '^') - ADVANCE(11); - LEX_ERROR(); default: LEX_ERROR(); } diff --git a/spec/fixtures/parsers/c.c b/spec/fixtures/parsers/c.c index eefa4ed1..0a927585 100644 --- a/spec/fixtures/parsers/c.c +++ b/spec/fixtures/parsers/c.c @@ -256,9 +256,81 @@ static const TSSymbolMetadata ts_symbol_metadata[SYMBOL_COUNT] = { [sym_comment] = {.visible = true, .named = true, .structural = false, .extra = true}, }; -static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) { +static TSTree *ts_lex(TSLexer *lexer, TSStateId state, bool error_mode) { START_LEXER(); - switch (lex_state) { + switch (state) { + case 0: + START_TOKEN(); + if (lookahead == 0) + ADVANCE(1); + if ((lookahead == '\t') || + (lookahead == '\r') || + (lookahead == ' ')) + ADVANCE(0); + if (lookahead == '\n') + ADVANCE(2); + if (lookahead == '\"') + ADVANCE(3); + if (lookahead == '#') + ADVANCE(7); + if (lookahead == '&') + ADVANCE(14); + if (lookahead == '(') + ADVANCE(15); + if (lookahead == ')') + ADVANCE(16); + if (lookahead == '*') + ADVANCE(17); + if (lookahead == '+') + ADVANCE(18); + if (lookahead == ',') + ADVANCE(19); + if (lookahead == '.') + ADVANCE(20); + if (lookahead == '/') + ADVANCE(21); + if ('0' <= lookahead && lookahead <= '9') + ADVANCE(26); + if (lookahead == ';') + ADVANCE(29); + if (lookahead == '=') + ADVANCE(30); + if (('A' <= lookahead && lookahead <= 'Z') || + (lookahead == 'b') || + (lookahead == 'd') || + ('g' <= lookahead && lookahead <= 'k') || + ('m' <= lookahead && lookahead <= 'q') || + ('w' <= lookahead && lookahead <= 'z')) + ADVANCE(31); + if (lookahead == '[') + ADVANCE(32); + if (lookahead == ']') + ADVANCE(33); + if (lookahead == 'a') + ADVANCE(34); + if (lookahead == 'c') + ADVANCE(38); + if (lookahead == 'e') + ADVANCE(43); + if (lookahead == 'f') + ADVANCE(49); + if (lookahead == 'l') + ADVANCE(52); + if (lookahead == 'r') + ADVANCE(56); + if (lookahead == 's') + ADVANCE(70); + if (lookahead == 't') + ADVANCE(89); + if (lookahead == 'u') + ADVANCE(96); + if (lookahead == 'v') + ADVANCE(104); + if (lookahead == '{') + ADVANCE(112); + if (lookahead == '}') + ADVANCE(113); + LEX_ERROR(); case 1: ACCEPT_TOKEN(ts_builtin_sym_end); case 2: @@ -2408,78 +2480,6 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) { if (lookahead == '{') ADVANCE(112); LEX_ERROR(); - case ts_lex_state_error: - START_TOKEN(); - if (lookahead == 0) - ADVANCE(1); - if ((lookahead == '\t') || - (lookahead == '\r') || - (lookahead == ' ')) - ADVANCE(0); - if (lookahead == '\n') - ADVANCE(2); - if (lookahead == '\"') - ADVANCE(3); - if (lookahead == '#') - ADVANCE(7); - if (lookahead == '&') - ADVANCE(14); - if (lookahead == '(') - ADVANCE(15); - if (lookahead == ')') - ADVANCE(16); - if (lookahead == '*') - ADVANCE(17); - if (lookahead == '+') - ADVANCE(18); - if (lookahead == ',') - ADVANCE(19); - if (lookahead == '.') - ADVANCE(20); - if (lookahead == '/') - ADVANCE(21); - if ('0' <= lookahead && lookahead <= '9') - ADVANCE(26); - if (lookahead == ';') - ADVANCE(29); - if (lookahead == '=') - ADVANCE(30); - if (('A' <= lookahead && lookahead <= 'Z') || - (lookahead == 'b') || - (lookahead == 'd') || - ('g' <= lookahead && lookahead <= 'k') || - ('m' <= lookahead && lookahead <= 'q') || - ('w' <= lookahead && lookahead <= 'z')) - ADVANCE(31); - if (lookahead == '[') - ADVANCE(32); - if (lookahead == ']') - ADVANCE(33); - if (lookahead == 'a') - ADVANCE(34); - if (lookahead == 'c') - ADVANCE(38); - if (lookahead == 'e') - ADVANCE(43); - if (lookahead == 'f') - ADVANCE(49); - if (lookahead == 'l') - ADVANCE(52); - if (lookahead == 'r') - ADVANCE(56); - if (lookahead == 's') - ADVANCE(70); - if (lookahead == 't') - ADVANCE(89); - if (lookahead == 'u') - ADVANCE(96); - if (lookahead == 'v') - ADVANCE(104); - if (lookahead == '{') - ADVANCE(112); - if (lookahead == '}') - ADVANCE(113); - LEX_ERROR(); default: LEX_ERROR(); } diff --git a/spec/fixtures/parsers/cpp.c b/spec/fixtures/parsers/cpp.c index 8383b2b4..57a76a3f 100644 --- a/spec/fixtures/parsers/cpp.c +++ b/spec/fixtures/parsers/cpp.c @@ -259,9 +259,85 @@ static const TSSymbolMetadata ts_symbol_metadata[SYMBOL_COUNT] = { [sym_comment] = {.visible = true, .named = true, .structural = false, .extra = true}, }; -static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) { +static TSTree *ts_lex(TSLexer *lexer, TSStateId state, bool error_mode) { START_LEXER(); - switch (lex_state) { + switch (state) { + case 0: + START_TOKEN(); + if (lookahead == 0) + ADVANCE(1); + if ((lookahead == '\t') || + (lookahead == '\n') || + (lookahead == '\r') || + (lookahead == ' ')) + ADVANCE(0); + if (lookahead == '!') + ADVANCE(2); + if (lookahead == '\"') + ADVANCE(4); + if (lookahead == '&') + ADVANCE(8); + if (lookahead == '(') + ADVANCE(10); + if (lookahead == ')') + ADVANCE(11); + if (lookahead == '*') + ADVANCE(12); + if (lookahead == ',') + ADVANCE(13); + if (lookahead == '.') + ADVANCE(14); + if (lookahead == '/') + ADVANCE(17); + if ('0' <= lookahead && lookahead <= '9') + ADVANCE(19); + if (lookahead == ':') + ADVANCE(22); + if (lookahead == ';') + ADVANCE(24); + if (lookahead == '<') + ADVANCE(25); + if (lookahead == '=') + ADVANCE(27); + if (lookahead == '>') + ADVANCE(29); + if (('A' <= lookahead && lookahead <= 'Z') || + (lookahead == 'a') || + (lookahead == 'b') || + (lookahead == 'g') || + (lookahead == 'h') || + ('j' <= lookahead && lookahead <= 'l') || + ('o' <= lookahead && lookahead <= 'q') || + (lookahead == 'u') || + ('w' <= lookahead && lookahead <= 'z')) + ADVANCE(31); + if (lookahead == 'c') + ADVANCE(32); + if (lookahead == 'd') + ADVANCE(41); + if (lookahead == 'e') + ADVANCE(52); + if (lookahead == 'f') + ADVANCE(64); + if (lookahead == 'i') + ADVANCE(70); + if (lookahead == 'm') + ADVANCE(90); + if (lookahead == 'n') + ADVANCE(97); + if (lookahead == 'r') + ADVANCE(106); + if (lookahead == 's') + ADVANCE(114); + if (lookahead == 't') + ADVANCE(120); + if (lookahead == 'v') + ADVANCE(138); + if (lookahead == '{') + ADVANCE(152); + if (lookahead == '}') + ADVANCE(153); + LEX_ERROR(); case 1: ACCEPT_TOKEN(ts_builtin_sym_end); case 2: @@ -3095,82 +3171,6 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) { LEX_ERROR(); case 259: ACCEPT_TOKEN(anon_sym_initializer_list); - case ts_lex_state_error: - START_TOKEN(); - if (lookahead == 0) - ADVANCE(1); - if ((lookahead == '\t') || - (lookahead == '\n') || - (lookahead == '\r') || - (lookahead == ' ')) - ADVANCE(0); - if (lookahead == '!') - ADVANCE(2); - if (lookahead == '\"') - ADVANCE(4); - if (lookahead == '&') - ADVANCE(8); - if (lookahead == '(') - ADVANCE(10); - if (lookahead == ')') - ADVANCE(11); - if (lookahead == '*') - ADVANCE(12); - if (lookahead == ',') - ADVANCE(13); - if (lookahead == '.') - ADVANCE(14); - if (lookahead == '/') - ADVANCE(17); - if ('0' <= lookahead && lookahead <= '9') - ADVANCE(19); - if (lookahead == ':') - ADVANCE(22); - if (lookahead == ';') - ADVANCE(24); - if (lookahead == '<') - ADVANCE(25); - if (lookahead == '=') - ADVANCE(27); - if (lookahead == '>') - ADVANCE(29); - if (('A' <= lookahead && lookahead <= 'Z') || - (lookahead == 'a') || - (lookahead == 'b') || - (lookahead == 'g') || - (lookahead == 'h') || - ('j' <= lookahead && lookahead <= 'l') || - ('o' <= lookahead && lookahead <= 'q') || - (lookahead == 'u') || - ('w' <= lookahead && lookahead <= 'z')) - ADVANCE(31); - if (lookahead == 'c') - ADVANCE(32); - if (lookahead == 'd') - ADVANCE(41); - if (lookahead == 'e') - ADVANCE(52); - if (lookahead == 'f') - ADVANCE(64); - if (lookahead == 'i') - ADVANCE(70); - if (lookahead == 'm') - ADVANCE(90); - if (lookahead == 'n') - ADVANCE(97); - if (lookahead == 'r') - ADVANCE(106); - if (lookahead == 's') - ADVANCE(114); - if (lookahead == 't') - ADVANCE(120); - if (lookahead == 'v') - ADVANCE(138); - if (lookahead == '{') - ADVANCE(152); - if (lookahead == '}') - ADVANCE(153); - LEX_ERROR(); default: LEX_ERROR(); } diff --git a/spec/fixtures/parsers/golang.c b/spec/fixtures/parsers/golang.c index 9f123abc..95ab52df 100644 --- a/spec/fixtures/parsers/golang.c +++ b/spec/fixtures/parsers/golang.c @@ -262,9 +262,93 @@ static const TSSymbolMetadata ts_symbol_metadata[SYMBOL_COUNT] = { [sym_comment] = {.visible = true, .named = true, .structural = false, .extra = true}, }; -static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) { +static TSTree *ts_lex(TSLexer *lexer, TSStateId state, bool error_mode) { START_LEXER(); - switch (lex_state) { + switch (state) { + case 0: + START_TOKEN(); + if (lookahead == 0) + ADVANCE(1); + if ((lookahead == '\t') || + (lookahead == '\r') || + (lookahead == ' ')) + ADVANCE(0); + if (lookahead == '\n') + ADVANCE(2); + if (lookahead == '!') + ADVANCE(3); + if (lookahead == '\"') + ADVANCE(4); + if (lookahead == '&') + ADVANCE(8); + if (lookahead == '(') + ADVANCE(10); + if (lookahead == ')') + ADVANCE(11); + if (lookahead == '*') + ADVANCE(12); + if (lookahead == '+') + ADVANCE(13); + if (lookahead == ',') + ADVANCE(14); + if (lookahead == '-') + ADVANCE(15); + if (lookahead == '.') + ADVANCE(16); + if (lookahead == '/') + ADVANCE(17); + if ('0' <= lookahead && lookahead <= '9') + ADVANCE(19); + if (lookahead == ':') + ADVANCE(22); + if (lookahead == ';') + ADVANCE(24); + if (lookahead == '<') + ADVANCE(25); + if (lookahead == '=') + ADVANCE(27); + if (lookahead == '>') + ADVANCE(29); + if (('A' <= lookahead && lookahead <= 'Z') || + ('a' <= lookahead && lookahead <= 'd') || + (lookahead == 'g') || + (lookahead == 'h') || + ('j' <= lookahead && lookahead <= 'l') || + (lookahead == 'n') || + (lookahead == 'o') || + (lookahead == 'q') || + (lookahead == 'u') || + ('w' <= lookahead && lookahead <= 'z')) + ADVANCE(31); + if (lookahead == '[') + ADVANCE(32); + if (lookahead == ']') + ADVANCE(33); + if (lookahead == 'e') + ADVANCE(34); + if (lookahead == 'f') + ADVANCE(38); + if (lookahead == 'i') + ADVANCE(44); + if (lookahead == 'm') + ADVANCE(59); + if (lookahead == 'p') + ADVANCE(62); + if (lookahead == 'r') + ADVANCE(69); + if (lookahead == 's') + ADVANCE(79); + if (lookahead == 't') + ADVANCE(85); + if (lookahead == 'v') + ADVANCE(89); + if (lookahead == '{') + ADVANCE(92); + if (lookahead == '|') + ADVANCE(93); + if (lookahead == '}') + ADVANCE(95); + LEX_ERROR(); case 1: ACCEPT_TOKEN(ts_builtin_sym_end); case 2: @@ -2159,90 +2243,6 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) { if (lookahead == '/') ADVANCE(97); LEX_ERROR(); - case ts_lex_state_error: - START_TOKEN(); - if (lookahead == 0) - ADVANCE(1); - if ((lookahead == '\t') || - (lookahead == '\r') || - (lookahead == ' ')) - ADVANCE(0); - if (lookahead == '\n') - ADVANCE(2); - if (lookahead == '!') - ADVANCE(3); - if (lookahead == '\"') - ADVANCE(4); - if (lookahead == '&') - ADVANCE(8); - if (lookahead == '(') - ADVANCE(10); - if (lookahead == ')') - ADVANCE(11); - if (lookahead == '*') - ADVANCE(12); - if (lookahead == '+') - ADVANCE(13); - if (lookahead == ',') - ADVANCE(14); - if (lookahead == '-') - ADVANCE(15); - if (lookahead == '.') - ADVANCE(16); - if (lookahead == '/') - ADVANCE(17); - if ('0' <= lookahead && lookahead <= '9') - ADVANCE(19); - if (lookahead == ':') - ADVANCE(22); - if (lookahead == ';') - ADVANCE(24); - if (lookahead == '<') - ADVANCE(25); - if (lookahead == '=') - ADVANCE(27); - if (lookahead == '>') - ADVANCE(29); - if (('A' <= lookahead && lookahead <= 'Z') || - ('a' <= lookahead && lookahead <= 'd') || - (lookahead == 'g') || - (lookahead == 'h') || - ('j' <= lookahead && lookahead <= 'l') || - (lookahead == 'n') || - (lookahead == 'o') || - (lookahead == 'q') || - (lookahead == 'u') || - ('w' <= lookahead && lookahead <= 'z')) - ADVANCE(31); - if (lookahead == '[') - ADVANCE(32); - if (lookahead == ']') - ADVANCE(33); - if (lookahead == 'e') - ADVANCE(34); - if (lookahead == 'f') - ADVANCE(38); - if (lookahead == 'i') - ADVANCE(44); - if (lookahead == 'm') - ADVANCE(59); - if (lookahead == 'p') - ADVANCE(62); - if (lookahead == 'r') - ADVANCE(69); - if (lookahead == 's') - ADVANCE(79); - if (lookahead == 't') - ADVANCE(85); - if (lookahead == 'v') - ADVANCE(89); - if (lookahead == '{') - ADVANCE(92); - if (lookahead == '|') - ADVANCE(93); - if (lookahead == '}') - ADVANCE(95); - LEX_ERROR(); default: LEX_ERROR(); } diff --git a/spec/fixtures/parsers/javascript.c b/spec/fixtures/parsers/javascript.c index afdf715c..e8e7f166 100644 --- a/spec/fixtures/parsers/javascript.c +++ b/spec/fixtures/parsers/javascript.c @@ -331,9 +331,104 @@ static const TSSymbolMetadata ts_symbol_metadata[SYMBOL_COUNT] = { [sym__line_break] = {.visible = false, .named = false, .structural = true, .extra = true}, }; -static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) { +static TSTree *ts_lex(TSLexer *lexer, TSStateId state, bool error_mode) { START_LEXER(); - switch (lex_state) { + switch (state) { + case 0: + START_TOKEN(); + if (lookahead == 0) + ADVANCE(1); + if ((lookahead == '\t') || + (lookahead == '\r') || + (lookahead == ' ')) + ADVANCE(0); + if (lookahead == '\n') + ADVANCE(2); + if (lookahead == '!') + ADVANCE(3); + if (lookahead == '\"') + ADVANCE(6); + if ((lookahead == '$') || + ('A' <= lookahead && lookahead <= 'Z') || + (lookahead == '_') || + (lookahead == 'a') || + (lookahead == 'g') || + (lookahead == 'h') || + ('j' <= lookahead && lookahead <= 'm') || + ('o' <= lookahead && lookahead <= 'q') || + ('x' <= lookahead && lookahead <= 'z')) + ADVANCE(10); + if (lookahead == '&') + ADVANCE(11); + if (lookahead == '\'') + ADVANCE(13); + if (lookahead == '(') + ADVANCE(16); + if (lookahead == ')') + ADVANCE(17); + if (lookahead == '*') + ADVANCE(18); + if (lookahead == '+') + ADVANCE(20); + if (lookahead == ',') + ADVANCE(23); + if (lookahead == '-') + ADVANCE(24); + if (lookahead == '.') + ADVANCE(27); + if (lookahead == '/') + ADVANCE(28); + if ('0' <= lookahead && lookahead <= '9') + ADVANCE(50); + if (lookahead == ':') + ADVANCE(53); + if (lookahead == ';') + ADVANCE(54); + if (lookahead == '<') + ADVANCE(55); + if (lookahead == '=') + ADVANCE(56); + if (lookahead == '>') + ADVANCE(59); + if (lookahead == '?') + ADVANCE(60); + if (lookahead == '[') + ADVANCE(61); + if (lookahead == ']') + ADVANCE(62); + if (lookahead == 'b') + ADVANCE(63); + if (lookahead == 'c') + ADVANCE(68); + if (lookahead == 'd') + ADVANCE(75); + if (lookahead == 'e') + ADVANCE(86); + if (lookahead == 'f') + ADVANCE(90); + if (lookahead == 'i') + ADVANCE(110); + if (lookahead == 'n') + ADVANCE(121); + if (lookahead == 'r') + ADVANCE(127); + if (lookahead == 's') + ADVANCE(133); + if (lookahead == 't') + ADVANCE(139); + if (lookahead == 'u') + ADVANCE(153); + if (lookahead == 'v') + ADVANCE(162); + if (lookahead == 'w') + ADVANCE(165); + if (lookahead == '{') + ADVANCE(170); + if (lookahead == '|') + ADVANCE(171); + if (lookahead == '}') + ADVANCE(173); + LEX_ERROR(); case 1: ACCEPT_TOKEN(ts_builtin_sym_end); case 2: @@ -6055,101 +6150,6 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) { if (lookahead == '{') ADVANCE(170); LEX_ERROR(); - case ts_lex_state_error: - START_TOKEN(); - if (lookahead == 0) - ADVANCE(1); - if ((lookahead == '\t') || - (lookahead == '\r') || - (lookahead == ' ')) - ADVANCE(0); - if (lookahead == '\n') - ADVANCE(2); - if (lookahead == '!') - ADVANCE(3); - if (lookahead == '\"') - ADVANCE(6); - if ((lookahead == '$') || - ('A' <= lookahead && lookahead <= 'Z') || - (lookahead == '_') || - (lookahead == 'a') || - (lookahead == 'g') || - (lookahead == 'h') || - ('j' <= lookahead && lookahead <= 'm') || - ('o' <= lookahead && lookahead <= 'q') || - ('x' <= lookahead && lookahead <= 'z')) - ADVANCE(10); - if (lookahead == '&') - ADVANCE(11); - if (lookahead == '\'') - ADVANCE(13); - if (lookahead == '(') - ADVANCE(16); - if (lookahead == ')') - ADVANCE(17); - if (lookahead == '*') - ADVANCE(18); - if (lookahead == '+') - ADVANCE(20); - if (lookahead == ',') - ADVANCE(23); - if (lookahead == '-') - ADVANCE(24); - if (lookahead == '.') - ADVANCE(27); - if (lookahead == '/') - ADVANCE(28); - if ('0' <= lookahead && lookahead <= '9') - ADVANCE(50); - if (lookahead == ':') - ADVANCE(53); - if (lookahead == ';') - ADVANCE(54); - if (lookahead == '<') - ADVANCE(55); - if (lookahead == '=') - ADVANCE(56); - if (lookahead == '>') - ADVANCE(59); - if (lookahead == '?') - ADVANCE(60); - if (lookahead == '[') - ADVANCE(61); - if (lookahead == ']') - ADVANCE(62); - if (lookahead == 'b') - ADVANCE(63); - if (lookahead == 'c') - ADVANCE(68); - if (lookahead == 'd') - ADVANCE(75); - if (lookahead == 'e') - ADVANCE(86); - if (lookahead == 'f') - ADVANCE(90); - if (lookahead == 'i') - ADVANCE(110); - if (lookahead == 'n') - ADVANCE(121); - if (lookahead == 'r') - ADVANCE(127); - if (lookahead == 's') - ADVANCE(133); - if (lookahead == 't') - ADVANCE(139); - if (lookahead == 'u') - ADVANCE(153); - if (lookahead == 'v') - ADVANCE(162); - if (lookahead == 'w') - ADVANCE(165); - if (lookahead == '{') - ADVANCE(170); - if (lookahead == '|') - ADVANCE(171); - if (lookahead == '}') - ADVANCE(173); - LEX_ERROR(); default: LEX_ERROR(); } diff --git a/spec/fixtures/parsers/json.c b/spec/fixtures/parsers/json.c index 27f1acca..2270fcc6 100644 --- a/spec/fixtures/parsers/json.c +++ b/spec/fixtures/parsers/json.c @@ -64,9 +64,41 @@ static const TSSymbolMetadata ts_symbol_metadata[SYMBOL_COUNT] = { [sym_false] = {.visible = true, .named = true, .structural = true, .extra = false}, }; -static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) { +static TSTree *ts_lex(TSLexer *lexer, TSStateId state, bool error_mode) { START_LEXER(); - switch (lex_state) { + switch (state) { + case 0: + START_TOKEN(); + if (lookahead == 0) + ADVANCE(1); + if ((lookahead == '\t') || + (lookahead == '\n') || + (lookahead == '\r') || + (lookahead == ' ')) + ADVANCE(0); + if (lookahead == '\"') + ADVANCE(2); + if (lookahead == ',') + ADVANCE(6); + if ('0' <= lookahead && lookahead <= '9') + ADVANCE(7); + if (lookahead == ':') + ADVANCE(10); + if (lookahead == '[') + ADVANCE(11); + if (lookahead == ']') + ADVANCE(12); + if (lookahead == 'f') + ADVANCE(13); + if (lookahead == 'n') + ADVANCE(18); + if (lookahead == 't') + ADVANCE(22); + if (lookahead == '{') + ADVANCE(26); + if (lookahead == '}') + ADVANCE(27); + LEX_ERROR(); case 1: ACCEPT_TOKEN(ts_builtin_sym_end); case 2: @@ -305,38 +337,6 @@ static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) { if (lookahead == '\"') ADVANCE(2); LEX_ERROR(); - case ts_lex_state_error: - START_TOKEN(); - if (lookahead == 0) - ADVANCE(1); - if ((lookahead == '\t') || - (lookahead == '\n') || - (lookahead == '\r') || - (lookahead == ' ')) - ADVANCE(0); - if (lookahead == '\"') - ADVANCE(2); - if (lookahead == ',') - ADVANCE(6); - if ('0' <= lookahead && lookahead <= '9') - ADVANCE(7); - if (lookahead == ':') - ADVANCE(10); - if (lookahead == '[') - ADVANCE(11); - if (lookahead == ']') - ADVANCE(12); - if (lookahead == 'f') - ADVANCE(13); - if (lookahead == 'n') - ADVANCE(18); - if (lookahead == 't') - ADVANCE(22); - if (lookahead == '{') - ADVANCE(26); - if (lookahead == '}') - ADVANCE(27); - LEX_ERROR(); default: LEX_ERROR(); } diff --git a/src/compiler/generate_code/c_code.cc b/src/compiler/generate_code/c_code.cc index 664ee40c..7e96cdcf 100644 --- a/src/compiler/generate_code/c_code.cc +++ b/src/compiler/generate_code/c_code.cc @@ -188,14 +188,13 @@ class CCodeGenerator { } void add_lex_function() { - line("static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {"); + line("static TSTree *ts_lex(TSLexer *lexer, TSStateId state, bool error_mode) {"); indent([&]() { line("START_LEXER();"); - _switch("lex_state", [&]() { - for (size_t i = 1; i < lex_table.states.size(); i++) - _case(to_string(i), [&]() { add_lex_state(lex_table.states[i]); }); - _case("ts_lex_state_error", - [&]() { add_lex_state(lex_table.states[0]); }); + _switch("state", [&]() { + size_t i = 0; + for (const LexState &state : lex_table.states) + _case(to_string(i++), [&]() { add_lex_state(state); }); _default([&]() { line("LEX_ERROR();"); }); }); }); diff --git a/src/runtime/parser.c b/src/runtime/parser.c index d2971a1a..92595225 100644 --- a/src/runtime/parser.c +++ b/src/runtime/parser.c @@ -203,7 +203,7 @@ static TSTree *ts_parser__get_next_lookahead(TSParser *self, int head) { TSStateId parse_state = ts_stack_top_state(self->stack, head); TSStateId lex_state = self->language->lex_states[parse_state]; LOG("lex state:%d", lex_state); - return self->language->lex_fn(&self->lexer, lex_state); + return self->language->lex_fn(&self->lexer, lex_state, false); } static int ts_parser__split(TSParser *self, int head) { @@ -464,7 +464,7 @@ static bool ts_parser__handle_error(TSParser *self, int head, TSTree *lookahead) LOG("skip token:%s", SYM_NAME(lookahead->symbol)); ts_parser__shift(self, head, ts_stack_top_state(self->stack, head), lookahead); - lookahead = self->language->lex_fn(&self->lexer, ts_lex_state_error); + lookahead = self->language->lex_fn(&self->lexer, 0, true); error_token_count++; /*