diff --git a/include/tree_sitter/parser.h b/include/tree_sitter/parser.h index 7b59e14f..5d89efc2 100644 --- a/include/tree_sitter/parser.h +++ b/include/tree_sitter/parser.h @@ -28,19 +28,11 @@ typedef struct { bool structural : 1; } TSSymbolMetadata; -typedef enum { - TSTransitionTypeMain, - TSTransitionTypeSeparator, - TSTransitionTypeError, -} TSTransitionType; - typedef struct TSLexer { - void (*advance)(struct TSLexer *, TSStateId, TSTransitionType); + void (*advance)(struct TSLexer *, TSStateId, bool); TSLength current_position; - TSLength token_end_position; TSLength token_start_position; - TSLength error_end_position; const char *chunk; size_t chunk_start; @@ -48,10 +40,7 @@ typedef struct TSLexer { size_t lookahead_size; int32_t lookahead; - TSStateId starting_state; TSSymbol result_symbol; - bool result_follows_error; - int32_t first_unexpected_character; TSInput input; TSDebugger debugger; @@ -94,7 +83,7 @@ struct TSLanguage { const unsigned short *parse_table; const TSParseActionEntry *parse_actions; const TSStateId *lex_states; - bool (*lex_fn)(TSLexer *, TSStateId, bool); + bool (*lex_fn)(TSLexer *, TSStateId); }; /* @@ -106,22 +95,18 @@ struct TSLanguage { next_state: \ lookahead = lexer->lookahead; -#define GO_TO_STATE(state_value) \ - { \ +#define ADVANCE(state_value) \ + { \ + lexer->advance(lexer, state_value, false); \ state = state_value; \ goto next_state; \ } -#define ADVANCE(state_value) \ - { \ - lexer->advance(lexer, state_value, TSTransitionTypeMain); \ - GO_TO_STATE(state_value); \ - } - #define SKIP(state_value) \ { \ - lexer->advance(lexer, state_value, TSTransitionTypeSeparator); \ - GO_TO_STATE(state_value); \ + lexer->advance(lexer, state_value, true); \ + state = state_value; \ + goto next_state; \ } #define ACCEPT_TOKEN(symbol_value) \ @@ -130,14 +115,7 @@ struct TSLanguage { return true; \ } -#define LEX_ERROR() \ - if (error_mode) { \ - if (state == TS_STATE_ERROR) \ - lexer->advance(lexer, state, TSTransitionTypeError); \ - GO_TO_STATE(TS_STATE_ERROR); \ - } else { \ - return false; \ - } +#define LEX_ERROR() return false /* * Parse Table Macros diff --git a/spec/runtime/parser_spec.cc b/spec/runtime/parser_spec.cc index ece011f3..837f3f4e 100644 --- a/spec/runtime/parser_spec.cc +++ b/spec/runtime/parser_spec.cc @@ -162,6 +162,15 @@ describe("Parser", [&]() { AssertThat(get_node_text(last), Equals("true")); }); }); + + describe("when there is an unterminated error", [&]() { + it("maintains a consistent tree", [&]() { + ts_document_set_language(doc, get_test_language("javascript")); + set_text("a; /* b"); + assert_root_node( + "(ERROR (program (expression_statement (identifier))) (UNEXPECTED EOF))"); + }); + }); }); describe("handling extra tokens", [&]() { diff --git a/src/compiler/generate_code/c_code.cc b/src/compiler/generate_code/c_code.cc index f0cfa129..7ee66d5a 100644 --- a/src/compiler/generate_code/c_code.cc +++ b/src/compiler/generate_code/c_code.cc @@ -184,7 +184,7 @@ class CCodeGenerator { void add_lex_function() { line( - "static bool ts_lex(TSLexer *lexer, TSStateId state, bool error_mode) {"); + "static bool ts_lex(TSLexer *lexer, TSStateId state) {"); indent([&]() { line("START_LEXER();"); _switch("state", [&]() { diff --git a/src/runtime/length.h b/src/runtime/length.h index fbf2313b..3c16a7d8 100644 --- a/src/runtime/length.h +++ b/src/runtime/length.h @@ -14,6 +14,10 @@ static inline void ts_length_set_unknown(TSLength *self) { self->columns = 0; } +static inline TSLength ts_length_min(TSLength len1, TSLength len2) { + return (len1.chars < len2.chars) ? len1 : len2; +} + static inline TSLength ts_length_add(TSLength len1, TSLength len2) { TSLength result; result.chars = len1.chars + len2.chars; diff --git a/src/runtime/lexer.c b/src/runtime/lexer.c index dd58cfc9..85b207b6 100644 --- a/src/runtime/lexer.c +++ b/src/runtime/lexer.c @@ -47,8 +47,7 @@ static void ts_lexer__get_lookahead(TSLexer *self) { LOG_LOOKAHEAD(); } -static void ts_lexer__advance(TSLexer *self, TSStateId state, - TSTransitionType transition_type) { +static void ts_lexer__advance(TSLexer *self, TSStateId state, bool skip) { if (self->chunk == empty_chunk) return; @@ -56,7 +55,6 @@ static void ts_lexer__advance(TSLexer *self, TSStateId state, if (self->lookahead_size) { self->current_position.bytes += self->lookahead_size; self->current_position.chars++; - if (self->lookahead == '\n') { self->current_position.rows++; self->current_position.columns = 0; @@ -65,25 +63,11 @@ static void ts_lexer__advance(TSLexer *self, TSStateId state, } } - switch (transition_type) { - case TSTransitionTypeSeparator: - if (self->result_follows_error) { - LOG("skip_error state:%d", state); - } else { - LOG("skip_separator state:%d", state); - self->token_start_position = self->current_position; - } - break; - case TSTransitionTypeError: - LOG("skip_error state:%d", state); - self->result_follows_error = true; - self->error_end_position = self->current_position; - if (!self->first_unexpected_character) - self->first_unexpected_character = self->lookahead; - break; - default: - LOG("advance state:%d", state); - break; + if (skip) { + LOG("skip_separator state:%d", state); + self->token_start_position = self->current_position; + } else { + LOG("advance state:%d", state); } if (self->current_position.bytes >= self->chunk_start + self->chunk_size) @@ -109,7 +93,6 @@ void ts_lexer_init(TSLexer *self) { static inline void ts_lexer__reset(TSLexer *self, TSLength position) { self->token_start_position = position; - self->token_end_position = position; self->current_position = position; self->chunk = 0; @@ -132,34 +115,12 @@ void ts_lexer_reset(TSLexer *self, TSLength position) { void ts_lexer_start(TSLexer *self, TSStateId lex_state) { LOG("start_lex state:%d, pos:%lu", lex_state, self->current_position.chars); - LOG_LOOKAHEAD(); - self->starting_state = lex_state; self->token_start_position = self->current_position; - self->result_follows_error = false; self->result_symbol = 0; - self->first_unexpected_character = 0; if (!self->chunk) ts_lexer__get_chunk(self); if (!self->lookahead_size) ts_lexer__get_lookahead(self); } - -void ts_lexer_finish(TSLexer *self, TSLexerResult *result) { - result->padding = - ts_length_sub(self->token_start_position, self->token_end_position); - - if (self->result_follows_error) { - result->symbol = ts_builtin_sym_error; - result->size = - ts_length_sub(self->error_end_position, self->token_start_position); - result->first_unexpected_character = self->first_unexpected_character; - ts_lexer_reset(self, self->error_end_position); - } else { - result->symbol = self->result_symbol; - result->size = - ts_length_sub(self->current_position, self->token_start_position); - self->token_end_position = self->current_position; - } -} diff --git a/src/runtime/lexer.h b/src/runtime/lexer.h index 75a03762..afecb19c 100644 --- a/src/runtime/lexer.h +++ b/src/runtime/lexer.h @@ -7,18 +7,10 @@ extern "C" { #include "tree_sitter/parser.h" -typedef struct { - TSSymbol symbol; - TSLength padding; - TSLength size; - int32_t first_unexpected_character; -} TSLexerResult; - void ts_lexer_init(TSLexer *); void ts_lexer_set_input(TSLexer *, TSInput); void ts_lexer_reset(TSLexer *, TSLength); void ts_lexer_start(TSLexer *, TSStateId); -void ts_lexer_finish(TSLexer *, TSLexerResult *); #ifdef __cplusplus } diff --git a/src/runtime/parser.c b/src/runtime/parser.c index 3a33b481..2df15c77 100644 --- a/src/runtime/parser.c +++ b/src/runtime/parser.c @@ -242,37 +242,67 @@ static bool parser__condense_stack(Parser *self) { return result; } -static TSTree *parser__lex(Parser *self, TSStateId parse_state, bool error_mode) { - TSStateId state = self->language->lex_states[parse_state]; - LOG("lex state:%d", state); +static TSTree *parser__lex(Parser *self, TSStateId parse_state) { + TSStateId start_state = self->language->lex_states[parse_state]; + TSStateId current_state = start_state; + TSLength start_position = self->lexer.current_position; + TSLength position = start_position; + LOG("lex state:%d", start_state); - TSLength position = self->lexer.current_position; + bool skipped_error = false; + int32_t first_error_character = 0; + TSLength error_start_position, error_end_position; - ts_lexer_start(&self->lexer, state); - if (!self->language->lex_fn(&self->lexer, state, error_mode)) { - ts_lexer_reset(&self->lexer, position); - ts_lexer_start(&self->lexer, state); - assert(self->language->lex_fn(&self->lexer, TS_STATE_ERROR, true)); + ts_lexer_start(&self->lexer, start_state); + + while (!self->language->lex_fn(&self->lexer, current_state)) { + if (current_state != TS_STATE_ERROR) { + LOG("retry_in_error_mode"); + ts_lexer_reset(&self->lexer, position); + ts_lexer_start(&self->lexer, start_state); + current_state = TS_STATE_ERROR; + continue; + } + + if (self->lexer.lookahead == 0) { + self->lexer.result_symbol = ts_builtin_sym_error; + break; + } + + if (self->lexer.current_position.chars == position.chars) { + if (!skipped_error) { + error_start_position = self->lexer.current_position; + first_error_character = self->lexer.lookahead; + } + skipped_error = true; + self->lexer.advance(&self->lexer, TS_STATE_ERROR, false); + error_end_position = self->lexer.current_position; + } + + position = self->lexer.current_position; } - TSLexerResult lex_result; - ts_lexer_finish(&self->lexer, &lex_result); - TSTree *result; - if (lex_result.symbol == ts_builtin_sym_error) { - result = ts_tree_make_error(lex_result.size, lex_result.padding, - lex_result.first_unexpected_character); + + if (skipped_error) { + error_start_position = ts_length_min(error_start_position, self->lexer.token_start_position); + TSLength padding = ts_length_sub(error_start_position, start_position); + TSLength size = ts_length_sub(error_end_position, error_start_position); + ts_lexer_reset(&self->lexer, error_end_position); + result = ts_tree_make_error(size, padding, first_error_character); } else { - result = ts_tree_make_leaf( - lex_result.symbol, lex_result.padding, lex_result.size, - ts_language_symbol_metadata(self->language, lex_result.symbol)); - if (!result) - return NULL; - result->parse_state = parse_state; + TSSymbol symbol = self->lexer.result_symbol; + TSLength padding = ts_length_sub(self->lexer.token_start_position, start_position); + TSLength size = ts_length_sub(self->lexer.current_position, self->lexer.token_start_position); + result = ts_tree_make_leaf(symbol, padding, size, + ts_language_symbol_metadata(self->language, symbol)); } - result->first_leaf.lex_state = state; + if (!result) + return NULL; + result->parse_state = parse_state; + result->first_leaf.lex_state = start_state; return result; } @@ -333,8 +363,7 @@ static TSTree *parser__get_lookahead(Parser *self, StackVersion version, ts_lexer_reset(&self->lexer, position); TSStateId parse_state = ts_stack_top_state(self->stack, version); - bool error_mode = parse_state == TS_STATE_ERROR; - return parser__lex(self, parse_state, error_mode); + return parser__lex(self, parse_state); error: return NULL; diff --git a/src/runtime/tree.c b/src/runtime/tree.c index 76e0b6a8..777649e3 100644 --- a/src/runtime/tree.c +++ b/src/runtime/tree.c @@ -369,6 +369,15 @@ void ts_tree_edit(TSTree *self, TSInputEdit edit) { } } +static size_t ts_tree__write_char_to_string(char *s, size_t n, int32_t c) { + if (c == 0) + return snprintf(s, n, "EOF"); + else if (c < 128) + return snprintf(s, n, "'%c'", c); + else + return snprintf(s, n, "%d", c); +} + static size_t ts_tree__write_to_string(const TSTree *self, const TSLanguage *language, char *string, size_t limit, bool is_root, @@ -386,8 +395,8 @@ static size_t ts_tree__write_to_string(const TSTree *self, if (visible) { if (self->symbol == ts_builtin_sym_error && self->child_count == 0 && self->size.chars > 0) { - cursor += - snprintf(*writer, limit, "(UNEXPECTED '%c'", self->lookahead_char); + cursor += snprintf(*writer, limit, "(UNEXPECTED "); + cursor += ts_tree__write_char_to_string(*writer, limit, self->lookahead_char); } else { cursor += snprintf(*writer, limit, "(%s", ts_language_symbol_name(language, self->symbol)); diff --git a/src/runtime/tree.h b/src/runtime/tree.h index d147b4ed..7121b2b5 100644 --- a/src/runtime/tree.h +++ b/src/runtime/tree.h @@ -25,7 +25,7 @@ typedef struct TSTree { size_t named_child_count; union { struct TSTree **children; - char lookahead_char; + int32_t lookahead_char; }; TSLength padding;