From 0bdd9b640cb517cc1e36d15edf5a179ed283f2b2 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 14 Feb 2022 21:32:14 -0800 Subject: [PATCH 1/3] Store the lookahead subtree of paused stack versions, not just the lookahead symbol --- lib/src/parser.c | 16 ++++++++-------- lib/src/stack.c | 21 ++++++++++++--------- lib/src/stack.h | 4 ++-- 3 files changed, 22 insertions(+), 19 deletions(-) diff --git a/lib/src/parser.c b/lib/src/parser.c index 9f17d0a7..8dfe580e 100644 --- a/lib/src/parser.c +++ b/lib/src/parser.c @@ -1063,7 +1063,7 @@ static bool ts_parser__do_all_potential_reductions( static void ts_parser__handle_error( TSParser *self, StackVersion version, - TSSymbol lookahead_symbol + Subtree lookahead ) { uint32_t previous_version_count = ts_stack_version_count(self->stack); @@ -1093,7 +1093,7 @@ static void ts_parser__handle_error( if (ts_language_has_reduce_action( self->language, state_after_missing_symbol, - lookahead_symbol + ts_subtree_leaf_symbol(lookahead) )) { // In case the parser is currently outside of any included range, the lexer will // snap to the beginning of the next included range. The missing token's padding @@ -1114,7 +1114,7 @@ static void ts_parser__handle_error( if (ts_parser__do_all_potential_reductions( self, version_with_missing_tree, - lookahead_symbol + ts_subtree_leaf_symbol(lookahead) )) { LOG( "recover_with_missing symbol:%s, state:%u", @@ -1138,6 +1138,7 @@ static void ts_parser__handle_error( } ts_stack_record_summary(self->stack, version, MAX_SUMMARY_DEPTH); + ts_subtree_release(&self->tree_pool, lookahead); LOG_STACK(); } @@ -1523,7 +1524,7 @@ static bool ts_parser__advance( } if (!lookahead.ptr) { - ts_stack_pause(self->stack, version, ts_builtin_sym_end); + ts_stack_pause(self->stack, version, lookahead); return true; } @@ -1576,8 +1577,7 @@ static bool ts_parser__advance( // version advances successfully, then this version can simply be removed. // But if all versions end up paused, then error recovery is needed. LOG("detect_error"); - ts_stack_pause(self->stack, version, ts_subtree_leaf_symbol(lookahead)); - ts_subtree_release(&self->tree_pool, lookahead); + ts_stack_pause(self->stack, version, lookahead); return true; } } @@ -1660,8 +1660,8 @@ static unsigned ts_parser__condense_stack(TSParser *self) { if (!has_unpaused_version && self->accept_count < MAX_VERSION_COUNT) { LOG("resume version:%u", i); min_error_cost = ts_stack_error_cost(self->stack, i); - TSSymbol lookahead_symbol = ts_stack_resume(self->stack, i); - ts_parser__handle_error(self, i, lookahead_symbol); + Subtree lookahead = ts_stack_resume(self->stack, i); + ts_parser__handle_error(self, i, lookahead); has_unpaused_version = true; } else { ts_stack_remove_version(self->stack, i); diff --git a/lib/src/stack.c b/lib/src/stack.c index 1dc6895f..2a11abd8 100644 --- a/lib/src/stack.c +++ b/lib/src/stack.c @@ -53,10 +53,10 @@ typedef enum { typedef struct { StackNode *node; - Subtree last_external_token; StackSummary *summary; unsigned node_count_at_last_error; - TSSymbol lookahead_when_paused; + Subtree last_external_token; + Subtree lookahead_when_paused; StackStatus status; } StackHead; @@ -256,6 +256,9 @@ static void stack_head_delete( if (self->last_external_token.ptr) { ts_subtree_release(subtree_pool, self->last_external_token); } + if (self->lookahead_when_paused.ptr) { + ts_subtree_release(subtree_pool, self->lookahead_when_paused); + } if (self->summary) { array_delete(self->summary); ts_free(self->summary); @@ -274,7 +277,7 @@ static StackVersion ts_stack__add_version( .node_count_at_last_error = self->heads.contents[original_version].node_count_at_last_error, .last_external_token = self->heads.contents[original_version].last_external_token, .status = StackStatusActive, - .lookahead_when_paused = 0, + .lookahead_when_paused = NULL_SUBTREE, }; array_push(&self->heads, head); stack_node_retain(node); @@ -703,7 +706,7 @@ void ts_stack_halt(Stack *self, StackVersion version) { array_get(&self->heads, version)->status = StackStatusHalted; } -void ts_stack_pause(Stack *self, StackVersion version, TSSymbol lookahead) { +void ts_stack_pause(Stack *self, StackVersion version, Subtree lookahead) { StackHead *head = array_get(&self->heads, version); head->status = StackStatusPaused; head->lookahead_when_paused = lookahead; @@ -722,12 +725,12 @@ bool ts_stack_is_paused(const Stack *self, StackVersion version) { return array_get(&self->heads, version)->status == StackStatusPaused; } -TSSymbol ts_stack_resume(Stack *self, StackVersion version) { +Subtree ts_stack_resume(Stack *self, StackVersion version) { StackHead *head = array_get(&self->heads, version); assert(head->status == StackStatusPaused); - TSSymbol result = head->lookahead_when_paused; + Subtree result = head->lookahead_when_paused; head->status = StackStatusActive; - head->lookahead_when_paused = 0; + head->lookahead_when_paused = NULL_SUBTREE; return result; } @@ -739,9 +742,9 @@ void ts_stack_clear(Stack *self) { array_clear(&self->heads); array_push(&self->heads, ((StackHead) { .node = self->base_node, - .last_external_token = NULL_SUBTREE, .status = StackStatusActive, - .lookahead_when_paused = 0, + .last_external_token = NULL_SUBTREE, + .lookahead_when_paused = NULL_SUBTREE, })); } diff --git a/lib/src/stack.h b/lib/src/stack.h index 10d7c24c..86abbc9d 100644 --- a/lib/src/stack.h +++ b/lib/src/stack.h @@ -99,9 +99,9 @@ bool ts_stack_merge(Stack *, StackVersion, StackVersion); // Determine whether the given two stack versions can be merged. bool ts_stack_can_merge(Stack *, StackVersion, StackVersion); -TSSymbol ts_stack_resume(Stack *, StackVersion); +Subtree ts_stack_resume(Stack *, StackVersion); -void ts_stack_pause(Stack *, StackVersion, TSSymbol); +void ts_stack_pause(Stack *, StackVersion, Subtree); void ts_stack_halt(Stack *, StackVersion); From 0fb864c1a0a5a53a951e3bf830fdbdc5edac385d Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 14 Feb 2022 22:39:52 -0800 Subject: [PATCH 2/3] Retain information about the lexer's lookahead for the token where an error was detected --- lib/src/parser.c | 193 ++++++++++++------------ test/fixtures/error_corpus/c_errors.txt | 4 +- 2 files changed, 101 insertions(+), 96 deletions(-) diff --git a/lib/src/parser.c b/lib/src/parser.c index 8dfe580e..20cf36ff 100644 --- a/lib/src/parser.c +++ b/lib/src/parser.c @@ -1060,88 +1060,6 @@ static bool ts_parser__do_all_potential_reductions( return can_shift_lookahead_symbol; } -static void ts_parser__handle_error( - TSParser *self, - StackVersion version, - Subtree lookahead -) { - uint32_t previous_version_count = ts_stack_version_count(self->stack); - - // Perform any reductions that can happen in this state, regardless of the lookahead. After - // skipping one or more invalid tokens, the parser might find a token that would have allowed - // a reduction to take place. - ts_parser__do_all_potential_reductions(self, version, 0); - uint32_t version_count = ts_stack_version_count(self->stack); - Length position = ts_stack_position(self->stack, version); - - // Push a discontinuity onto the stack. Merge all of the stack versions that - // were created in the previous step. - bool did_insert_missing_token = false; - for (StackVersion v = version; v < version_count;) { - if (!did_insert_missing_token) { - TSStateId state = ts_stack_state(self->stack, v); - for (TSSymbol missing_symbol = 1; - missing_symbol < self->language->token_count; - missing_symbol++) { - TSStateId state_after_missing_symbol = ts_language_next_state( - self->language, state, missing_symbol - ); - if (state_after_missing_symbol == 0 || state_after_missing_symbol == state) { - continue; - } - - if (ts_language_has_reduce_action( - self->language, - state_after_missing_symbol, - ts_subtree_leaf_symbol(lookahead) - )) { - // In case the parser is currently outside of any included range, the lexer will - // snap to the beginning of the next included range. The missing token's padding - // must be assigned to position it within the next included range. - ts_lexer_reset(&self->lexer, position); - ts_lexer_mark_end(&self->lexer); - Length padding = length_sub(self->lexer.token_end_position, position); - - StackVersion version_with_missing_tree = ts_stack_copy_version(self->stack, v); - Subtree missing_tree = ts_subtree_new_missing_leaf( - &self->tree_pool, missing_symbol, padding, self->language - ); - ts_stack_push( - self->stack, version_with_missing_tree, - missing_tree, false, - state_after_missing_symbol - ); - - if (ts_parser__do_all_potential_reductions( - self, version_with_missing_tree, - ts_subtree_leaf_symbol(lookahead) - )) { - LOG( - "recover_with_missing symbol:%s, state:%u", - SYM_NAME(missing_symbol), - ts_stack_state(self->stack, version_with_missing_tree) - ); - did_insert_missing_token = true; - break; - } - } - } - } - - ts_stack_push(self->stack, v, NULL_SUBTREE, false, ERROR_STATE); - v = (v == version) ? previous_version_count : v + 1; - } - - for (unsigned i = previous_version_count; i < version_count; i++) { - bool did_merge = ts_stack_merge(self->stack, version, previous_version_count); - assert(did_merge); - } - - ts_stack_record_summary(self->stack, version, MAX_SUMMARY_DEPTH); - ts_subtree_release(&self->tree_pool, lookahead); - LOG_STACK(); -} - static bool ts_parser__recover_to_state( TSParser *self, StackVersion version, @@ -1369,6 +1287,98 @@ static void ts_parser__recover( } } +static void ts_parser__handle_error( + TSParser *self, + StackVersion version, + Subtree lookahead +) { + uint32_t previous_version_count = ts_stack_version_count(self->stack); + + // Perform any reductions that can happen in this state, regardless of the lookahead. After + // skipping one or more invalid tokens, the parser might find a token that would have allowed + // a reduction to take place. + ts_parser__do_all_potential_reductions(self, version, 0); + uint32_t version_count = ts_stack_version_count(self->stack); + Length position = ts_stack_position(self->stack, version); + + // Push a discontinuity onto the stack. Merge all of the stack versions that + // were created in the previous step. + bool did_insert_missing_token = false; + for (StackVersion v = version; v < version_count;) { + if (!did_insert_missing_token) { + TSStateId state = ts_stack_state(self->stack, v); + for (TSSymbol missing_symbol = 1; + missing_symbol < self->language->token_count; + missing_symbol++) { + TSStateId state_after_missing_symbol = ts_language_next_state( + self->language, state, missing_symbol + ); + if (state_after_missing_symbol == 0 || state_after_missing_symbol == state) { + continue; + } + + if (ts_language_has_reduce_action( + self->language, + state_after_missing_symbol, + ts_subtree_leaf_symbol(lookahead) + )) { + // In case the parser is currently outside of any included range, the lexer will + // snap to the beginning of the next included range. The missing token's padding + // must be assigned to position it within the next included range. + ts_lexer_reset(&self->lexer, position); + ts_lexer_mark_end(&self->lexer); + Length padding = length_sub(self->lexer.token_end_position, position); + + StackVersion version_with_missing_tree = ts_stack_copy_version(self->stack, v); + Subtree missing_tree = ts_subtree_new_missing_leaf( + &self->tree_pool, missing_symbol, padding, self->language + ); + ts_stack_push( + self->stack, version_with_missing_tree, + missing_tree, false, + state_after_missing_symbol + ); + + if (ts_parser__do_all_potential_reductions( + self, version_with_missing_tree, + ts_subtree_leaf_symbol(lookahead) + )) { + LOG( + "recover_with_missing symbol:%s, state:%u", + SYM_NAME(missing_symbol), + ts_stack_state(self->stack, version_with_missing_tree) + ); + did_insert_missing_token = true; + break; + } + } + } + } + + ts_stack_push(self->stack, v, NULL_SUBTREE, false, ERROR_STATE); + v = (v == version) ? previous_version_count : v + 1; + } + + for (unsigned i = previous_version_count; i < version_count; i++) { + bool did_merge = ts_stack_merge(self->stack, version, previous_version_count); + assert(did_merge); + } + + ts_stack_record_summary(self->stack, version, MAX_SUMMARY_DEPTH); + + // Begin recovery with the current lookahead node, rather than waiting for the + // next turn of the parse loop. This ensures that the tree accounts for the the + // current lookahead token's "lookahead bytes" value, which describes how far + // the lexer needed to look ahead beyond the content of the token in order to + // recognize it. + if (ts_subtree_child_count(lookahead) > 0) { + ts_parser__breakdown_lookahead(self, &lookahead, ERROR_STATE, &self->reusable_node); + } + ts_parser__recover(self, version, lookahead); + + LOG_STACK(); +} + static bool ts_parser__advance( TSParser *self, StackVersion version, @@ -1511,23 +1521,18 @@ static bool ts_parser__advance( // on the current parse state. if (!lookahead.ptr) { needs_lex = true; - continue; + } else { + ts_language_table_entry( + self->language, + state, + ts_subtree_leaf_symbol(lookahead), + &table_entry + ); } - ts_language_table_entry( - self->language, - state, - ts_subtree_leaf_symbol(lookahead), - &table_entry - ); continue; } - if (!lookahead.ptr) { - ts_stack_pause(self->stack, version, lookahead); - return true; - } - // If there were no parse actions for the current lookahead token, then // it is not valid in this state. If the current lookahead token is a // keyword, then switch to treating it as the normal word token if that diff --git a/test/fixtures/error_corpus/c_errors.txt b/test/fixtures/error_corpus/c_errors.txt index b8733245..97c75f0c 100644 --- a/test/fixtures/error_corpus/c_errors.txt +++ b/test/fixtures/error_corpus/c_errors.txt @@ -128,8 +128,8 @@ int main() { (declaration (primitive_type) (init_declarator (identifier) (parenthesized_expression - (number_literal) - (ERROR (number_literal)))))))) + (ERROR (number_literal)) + (number_literal))))))) ======================================== Extra identifiers in declarations From c697ebfb278d4a7136328f25b27d0d5c458ae166 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sat, 26 Feb 2022 14:31:33 -0800 Subject: [PATCH 3/3] Add explicit unit test for error detection lookahead bug --- cli/src/tests/parser_test.rs | 51 +++++++++++++++++++++++++++++++++--- 1 file changed, 47 insertions(+), 4 deletions(-) diff --git a/cli/src/tests/parser_test.rs b/cli/src/tests/parser_test.rs index c02e7620..1a962e29 100644 --- a/cli/src/tests/parser_test.rs +++ b/cli/src/tests/parser_test.rs @@ -1,12 +1,17 @@ use super::helpers::{ allocations, + edits::invert_edit, edits::ReadRecorder, fixtures::{get_language, get_test_grammar, get_test_language}, }; -use crate::generate::generate_parser_for_grammar; -use crate::parse::{perform_edit, Edit}; -use std::sync::atomic::{AtomicUsize, Ordering}; -use std::{thread, time}; +use crate::{ + generate::generate_parser_for_grammar, + parse::{perform_edit, Edit}, +}; +use std::{ + sync::atomic::{AtomicUsize, Ordering}, + thread, time, +}; use tree_sitter::{IncludedRangesError, InputEdit, LogType, Parser, Point, Range}; #[test] @@ -491,6 +496,44 @@ h + i ); } +#[test] +fn test_parsing_after_detecting_error_in_the_middle_of_a_string_token() { + let mut parser = Parser::new(); + parser.set_language(get_language("python")).unwrap(); + + let mut source = b"a = b, 'c, d'".to_vec(); + let tree = parser.parse(&source, None).unwrap(); + assert_eq!( + tree.root_node().to_sexp(), + "(module (expression_statement (assignment left: (identifier) right: (expression_list (identifier) (string)))))" + ); + + // Delete a suffix of the source code, starting in the middle of the string + // literal, after some whitespace. With this deletion, the remaining string + // content: "c, " looks like two valid python tokens: an identifier and a comma. + // When this edit is undone, in order correctly recover the orginal tree, the + // parser needs to remember that before matching the `c` as an identifier, it + // lookahead ahead several bytes, trying to find the closing quotation mark in + // order to match the "string content" node. + let edit_ix = std::str::from_utf8(&source).unwrap().find("d'").unwrap(); + let edit = Edit { + position: edit_ix, + deleted_length: source.len() - edit_ix, + inserted_text: Vec::new(), + }; + let undo = invert_edit(&source, &edit); + + let mut tree2 = tree.clone(); + perform_edit(&mut tree2, &mut source, &edit); + tree2 = parser.parse(&source, Some(&tree2)).unwrap(); + assert!(tree2.root_node().has_error()); + + let mut tree3 = tree2.clone(); + perform_edit(&mut tree3, &mut source, &undo); + tree3 = parser.parse(&source, Some(&tree3)).unwrap(); + assert_eq!(tree3.root_node().to_sexp(), tree.root_node().to_sexp(),); +} + // Thread safety #[test]