From 0fb864c1a0a5a53a951e3bf830fdbdc5edac385d Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 14 Feb 2022 22:39:52 -0800 Subject: [PATCH] Retain information about the lexer's lookahead for the token where an error was detected --- lib/src/parser.c | 193 ++++++++++++------------ test/fixtures/error_corpus/c_errors.txt | 4 +- 2 files changed, 101 insertions(+), 96 deletions(-) diff --git a/lib/src/parser.c b/lib/src/parser.c index 8dfe580e..20cf36ff 100644 --- a/lib/src/parser.c +++ b/lib/src/parser.c @@ -1060,88 +1060,6 @@ static bool ts_parser__do_all_potential_reductions( return can_shift_lookahead_symbol; } -static void ts_parser__handle_error( - TSParser *self, - StackVersion version, - Subtree lookahead -) { - uint32_t previous_version_count = ts_stack_version_count(self->stack); - - // Perform any reductions that can happen in this state, regardless of the lookahead. After - // skipping one or more invalid tokens, the parser might find a token that would have allowed - // a reduction to take place. - ts_parser__do_all_potential_reductions(self, version, 0); - uint32_t version_count = ts_stack_version_count(self->stack); - Length position = ts_stack_position(self->stack, version); - - // Push a discontinuity onto the stack. Merge all of the stack versions that - // were created in the previous step. 
- bool did_insert_missing_token = false; - for (StackVersion v = version; v < version_count;) { - if (!did_insert_missing_token) { - TSStateId state = ts_stack_state(self->stack, v); - for (TSSymbol missing_symbol = 1; - missing_symbol < self->language->token_count; - missing_symbol++) { - TSStateId state_after_missing_symbol = ts_language_next_state( - self->language, state, missing_symbol - ); - if (state_after_missing_symbol == 0 || state_after_missing_symbol == state) { - continue; - } - - if (ts_language_has_reduce_action( - self->language, - state_after_missing_symbol, - ts_subtree_leaf_symbol(lookahead) - )) { - // In case the parser is currently outside of any included range, the lexer will - // snap to the beginning of the next included range. The missing token's padding - // must be assigned to position it within the next included range. - ts_lexer_reset(&self->lexer, position); - ts_lexer_mark_end(&self->lexer); - Length padding = length_sub(self->lexer.token_end_position, position); - - StackVersion version_with_missing_tree = ts_stack_copy_version(self->stack, v); - Subtree missing_tree = ts_subtree_new_missing_leaf( - &self->tree_pool, missing_symbol, padding, self->language - ); - ts_stack_push( - self->stack, version_with_missing_tree, - missing_tree, false, - state_after_missing_symbol - ); - - if (ts_parser__do_all_potential_reductions( - self, version_with_missing_tree, - ts_subtree_leaf_symbol(lookahead) - )) { - LOG( - "recover_with_missing symbol:%s, state:%u", - SYM_NAME(missing_symbol), - ts_stack_state(self->stack, version_with_missing_tree) - ); - did_insert_missing_token = true; - break; - } - } - } - } - - ts_stack_push(self->stack, v, NULL_SUBTREE, false, ERROR_STATE); - v = (v == version) ? 
previous_version_count : v + 1; - } - - for (unsigned i = previous_version_count; i < version_count; i++) { - bool did_merge = ts_stack_merge(self->stack, version, previous_version_count); - assert(did_merge); - } - - ts_stack_record_summary(self->stack, version, MAX_SUMMARY_DEPTH); - ts_subtree_release(&self->tree_pool, lookahead); - LOG_STACK(); -} - static bool ts_parser__recover_to_state( TSParser *self, StackVersion version, @@ -1369,6 +1287,98 @@ static void ts_parser__recover( } } +static void ts_parser__handle_error( + TSParser *self, + StackVersion version, + Subtree lookahead +) { + uint32_t previous_version_count = ts_stack_version_count(self->stack); + + // Perform any reductions that can happen in this state, regardless of the lookahead. After + // skipping one or more invalid tokens, the parser might find a token that would have allowed + // a reduction to take place. + ts_parser__do_all_potential_reductions(self, version, 0); + uint32_t version_count = ts_stack_version_count(self->stack); + Length position = ts_stack_position(self->stack, version); + + // Push a discontinuity onto the stack. Merge all of the stack versions that + // were created in the previous step. + bool did_insert_missing_token = false; + for (StackVersion v = version; v < version_count;) { + if (!did_insert_missing_token) { + TSStateId state = ts_stack_state(self->stack, v); + for (TSSymbol missing_symbol = 1; + missing_symbol < self->language->token_count; + missing_symbol++) { + TSStateId state_after_missing_symbol = ts_language_next_state( + self->language, state, missing_symbol + ); + if (state_after_missing_symbol == 0 || state_after_missing_symbol == state) { + continue; + } + + if (ts_language_has_reduce_action( + self->language, + state_after_missing_symbol, + ts_subtree_leaf_symbol(lookahead) + )) { + // In case the parser is currently outside of any included range, the lexer will + // snap to the beginning of the next included range. 
The missing token's padding + // must be assigned to position it within the next included range. + ts_lexer_reset(&self->lexer, position); + ts_lexer_mark_end(&self->lexer); + Length padding = length_sub(self->lexer.token_end_position, position); + + StackVersion version_with_missing_tree = ts_stack_copy_version(self->stack, v); + Subtree missing_tree = ts_subtree_new_missing_leaf( + &self->tree_pool, missing_symbol, padding, self->language + ); + ts_stack_push( + self->stack, version_with_missing_tree, + missing_tree, false, + state_after_missing_symbol + ); + + if (ts_parser__do_all_potential_reductions( + self, version_with_missing_tree, + ts_subtree_leaf_symbol(lookahead) + )) { + LOG( + "recover_with_missing symbol:%s, state:%u", + SYM_NAME(missing_symbol), + ts_stack_state(self->stack, version_with_missing_tree) + ); + did_insert_missing_token = true; + break; + } + } + } + } + + ts_stack_push(self->stack, v, NULL_SUBTREE, false, ERROR_STATE); + v = (v == version) ? previous_version_count : v + 1; + } + + for (unsigned i = previous_version_count; i < version_count; i++) { + bool did_merge = ts_stack_merge(self->stack, version, previous_version_count); + assert(did_merge); + } + + ts_stack_record_summary(self->stack, version, MAX_SUMMARY_DEPTH); + + // Begin recovery with the current lookahead node, rather than waiting for the + // next turn of the parse loop. This ensures that the tree accounts for the + // current lookahead token's "lookahead bytes" value, which describes how far + // the lexer needed to look ahead beyond the content of the token in order to + // recognize it. + if (ts_subtree_child_count(lookahead) > 0) { + ts_parser__breakdown_lookahead(self, &lookahead, ERROR_STATE, &self->reusable_node); + } + ts_parser__recover(self, version, lookahead); + + LOG_STACK(); +} + static bool ts_parser__advance( TSParser *self, StackVersion version, @@ -1511,23 +1521,18 @@ static bool ts_parser__advance( // on the current parse state. 
if (!lookahead.ptr) { needs_lex = true; - continue; + } else { + ts_language_table_entry( + self->language, + state, + ts_subtree_leaf_symbol(lookahead), + &table_entry + ); } - ts_language_table_entry( - self->language, - state, - ts_subtree_leaf_symbol(lookahead), - &table_entry - ); continue; } - if (!lookahead.ptr) { - ts_stack_pause(self->stack, version, lookahead); - return true; - } - // If there were no parse actions for the current lookahead token, then // it is not valid in this state. If the current lookahead token is a // keyword, then switch to treating it as the normal word token if that diff --git a/test/fixtures/error_corpus/c_errors.txt b/test/fixtures/error_corpus/c_errors.txt index b8733245..97c75f0c 100644 --- a/test/fixtures/error_corpus/c_errors.txt +++ b/test/fixtures/error_corpus/c_errors.txt @@ -128,8 +128,8 @@ int main() { (declaration (primitive_type) (init_declarator (identifier) (parenthesized_expression - (number_literal) - (ERROR (number_literal)))))))) + (ERROR (number_literal)) + (number_literal))))))) ======================================== Extra identifiers in declarations