From 066fd77d392cb30c7b7f74e86c1bb87bf66606f7 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 4 Mar 2025 13:50:56 -0800 Subject: [PATCH] Fix cases where error recovery could infinite loop (#4257) * Rename corpus test functions to allow easy filtering by language * Use usize for seed argument * Avoid retaining useless stack versions when reductions merge We found this problem when debugging an infinite loop that happened during error recovery when using the Zig grammar. The large number of unnecessary paused stack versions were preventing the correct recovery strategy from being tried. * Fix leaked lookahead token when reduction results in a merged stack * Enable running PHP tests in CI * Fix possible infinite loop during error recovery at EOF * Account for external scanner state changes when detecting changed ranges in subtrees --- cli/src/fuzz/mod.rs | 4 ++- cli/src/tests/corpus_test.rs | 40 ++++++++++++++----------- lib/src/get_changed_ranges.c | 58 +++++++++++++++++++++++++----------- lib/src/parser.c | 42 ++++++++++++++++---------- lib/src/stack.c | 12 ++++++++ lib/src/stack.h | 3 ++ lib/src/tree_cursor.c | 5 ++-- xtask/src/main.rs | 2 +- 8 files changed, 110 insertions(+), 56 deletions(-) diff --git a/cli/src/fuzz/mod.rs b/cli/src/fuzz/mod.rs index 85e219cc..efa07d3b 100644 --- a/cli/src/fuzz/mod.rs +++ b/cli/src/fuzz/mod.rs @@ -56,7 +56,9 @@ fn regex_env_var(name: &'static str) -> Option { pub fn new_seed() -> usize { int_env_var("TREE_SITTER_SEED").unwrap_or_else(|| { let mut rng = rand::thread_rng(); - rng.gen::() + let seed = rng.gen::(); + eprintln!("Seed: {seed}"); + seed }) } diff --git a/cli/src/tests/corpus_test.rs b/cli/src/tests/corpus_test.rs index 750bf442..9b0ea8ba 100644 --- a/cli/src/tests/corpus_test.rs +++ b/cli/src/tests/corpus_test.rs @@ -23,7 +23,7 @@ use crate::{ }; #[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)] -fn test_corpus_for_bash(seed: usize) { +fn test_corpus_for_bash_language(seed: usize) { test_language_corpus( "bash", seed, @@ -39,73 +39,77 @@ fn test_corpus_for_bash(seed: usize) { } #[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)] -fn test_corpus_for_c(seed: usize) { +fn test_corpus_for_c_language(seed: usize) { test_language_corpus("c", seed, None, None); } #[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)] -fn test_corpus_for_cpp(seed: usize) { +fn test_corpus_for_cpp_language(seed: usize) { test_language_corpus("cpp", seed, None, None); } #[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)] -fn test_corpus_for_embedded_template(seed: usize) { +fn test_corpus_for_embedded_template_language(seed: usize) { test_language_corpus("embedded-template", seed, None, None); } #[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)] -fn test_corpus_for_go(seed: usize) { +fn test_corpus_for_go_language(seed: usize) { test_language_corpus("go", seed, None, None); } #[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)] -fn test_corpus_for_html(seed: usize) { +fn test_corpus_for_html_language(seed: usize) { test_language_corpus("html", seed, None, None); } #[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)] -fn test_corpus_for_java(seed: usize) { - test_language_corpus("java", seed, None, None); +fn test_corpus_for_java_language(seed: usize) { + test_language_corpus( + "java", + seed, + Some(&["java - corpus - expressions - switch with unnamed pattern variable"]), + None, + ); } #[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)] -fn test_corpus_for_javascript(seed: usize) { +fn test_corpus_for_javascript_language(seed: usize) { test_language_corpus("javascript", seed, None, None); } #[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)] -fn test_corpus_for_json(seed: usize) { +fn test_corpus_for_json_language(seed: usize) { test_language_corpus("json", seed, None, None); } -#[ignore] #[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)] -fn test_corpus_for_php(seed: usize) { - test_language_corpus("php", seed, None, None); +fn test_corpus_for_php_language(seed: usize) { + test_language_corpus("php", seed, None, Some("php")); } #[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)] -fn test_corpus_for_python(seed: usize) { +fn test_corpus_for_python_language(seed: usize) { test_language_corpus("python", seed, None, None); } #[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)] -fn test_corpus_for_ruby(seed: usize) { +fn test_corpus_for_ruby_language(seed: usize) { test_language_corpus("ruby", seed, None, None); } #[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)] -fn test_corpus_for_rust(seed: usize) { +fn test_corpus_for_rust_language(seed: usize) { test_language_corpus("rust", seed, None, None); } #[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)] -fn test_corpus_for_typescript(seed: usize) { +fn test_corpus_for_typescript_language(seed: usize) { test_language_corpus("typescript", seed, None, Some("typescript")); } #[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)] -fn test_corpus_for_tsx(seed: usize) { +fn test_corpus_for_tsx_language(seed: usize) { test_language_corpus("typescript", seed, None, Some("tsx")); } diff --git a/lib/src/get_changed_ranges.c b/lib/src/get_changed_ranges.c index 8ca5bab3..b8130a12 100644 --- a/lib/src/get_changed_ranges.c +++ b/lib/src/get_changed_ranges.c @@ -108,6 +108,7 @@ typedef struct { const TSLanguage *language; unsigned visible_depth; bool in_padding; + Subtree prev_external_token; } Iterator; static Iterator iterator_new( @@ -127,6 +128,7 @@ static Iterator iterator_new( .language = language, .visible_depth = 1, .in_padding = false, + .prev_external_token = NULL_SUBTREE, }; } @@ -244,6 +246,10 @@ static bool iterator_descend(Iterator *self, uint32_t goal_position) { position = child_right; if (!ts_subtree_extra(*child)) structural_child_index++; + Subtree last_external_token = ts_subtree_last_external_token(*child); + if (last_external_token.ptr) { + self->prev_external_token = last_external_token; + } } } while (did_descend); @@ -268,6 +274,10 @@ static void iterator_advance(Iterator *self) { const Subtree *parent = array_back(&self->cursor.stack)->subtree; uint32_t child_index = entry.child_index + 1; + Subtree last_external_token = ts_subtree_last_external_token(*entry.subtree); + if (last_external_token.ptr) { + self->prev_external_token = last_external_token; + } if (ts_subtree_child_count(*parent) > child_index) { Length position = length_add(entry.position, ts_subtree_total_size(*entry.subtree)); uint32_t structural_child_index = entry.structural_child_index; @@ -313,29 +323,41 @@ static IteratorComparison iterator_compare( TSSymbol new_alias_symbol = 0; iterator_get_visible_state(old_iter, &old_tree, &old_alias_symbol, &old_start); iterator_get_visible_state(new_iter, &new_tree, &new_alias_symbol, &new_start); + TSSymbol old_symbol = ts_subtree_symbol(old_tree); + TSSymbol new_symbol = ts_subtree_symbol(new_tree); if (!old_tree.ptr && !new_tree.ptr) return IteratorMatches; if (!old_tree.ptr || !new_tree.ptr) return IteratorDiffers; + if (old_alias_symbol != new_alias_symbol || old_symbol != new_symbol) return IteratorDiffers; + + uint32_t old_size = ts_subtree_size(old_tree).bytes; + uint32_t new_size = ts_subtree_size(new_tree).bytes; + TSStateId old_state = ts_subtree_parse_state(old_tree); + TSStateId new_state = ts_subtree_parse_state(new_tree); + bool old_has_external_tokens = ts_subtree_has_external_tokens(old_tree); + bool new_has_external_tokens = ts_subtree_has_external_tokens(new_tree); + uint32_t old_error_cost = ts_subtree_error_cost(old_tree); + uint32_t new_error_cost = ts_subtree_error_cost(new_tree); if ( - old_alias_symbol == new_alias_symbol && - ts_subtree_symbol(old_tree) == ts_subtree_symbol(new_tree) + old_start != new_start || + old_symbol == ts_builtin_sym_error || + old_size != new_size || + old_state == TS_TREE_STATE_NONE || + new_state == TS_TREE_STATE_NONE || + ((old_state == ERROR_STATE) != (new_state == ERROR_STATE)) || + old_error_cost != new_error_cost || + old_has_external_tokens != new_has_external_tokens || + ts_subtree_has_changes(old_tree) || + ( + old_has_external_tokens && + !ts_subtree_external_scanner_state_eq(old_iter->prev_external_token, new_iter->prev_external_token) + ) ) { - if (old_start == new_start && - !ts_subtree_has_changes(old_tree) && - ts_subtree_symbol(old_tree) != ts_builtin_sym_error && - ts_subtree_size(old_tree).bytes == ts_subtree_size(new_tree).bytes && - ts_subtree_parse_state(old_tree) != TS_TREE_STATE_NONE && - ts_subtree_parse_state(new_tree) != TS_TREE_STATE_NONE && - (ts_subtree_parse_state(old_tree) == ERROR_STATE) == - (ts_subtree_parse_state(new_tree) == ERROR_STATE)) { - return IteratorMatches; - } else { - return IteratorMayDiffer; - } + return IteratorMayDiffer; } - return IteratorDiffers; + return IteratorMatches; } #ifdef DEBUG_GET_CHANGED_RANGES @@ -348,8 +370,8 @@ static inline void iterator_print_state(Iterator *self) { "(%-25s %s\t depth:%u [%u, %u] - [%u, %u])", name, self->in_padding ? "(p)" : " ", self->visible_depth, - start.row + 1, start.column, - end.row + 1, end.column + start.row, start.column, + end.row, end.column ); } #endif @@ -380,7 +402,7 @@ unsigned ts_subtree_get_changed_ranges( do { #ifdef DEBUG_GET_CHANGED_RANGES - printf("At [%-2u, %-2u] Compare ", position.extent.row + 1, position.extent.column); + printf("At [%-2u, %-2u] Compare ", position.extent.row, position.extent.column); iterator_print_state(&old_iter); printf("\tvs\t"); iterator_print_state(&new_iter); diff --git a/lib/src/parser.c b/lib/src/parser.c index a3d68592..adcdf8d0 100644 --- a/lib/src/parser.c +++ b/lib/src/parser.c @@ -949,6 +949,7 @@ static StackVersion ts_parser__reduce( // children. StackSliceArray pop = ts_stack_pop_count(self->stack, version, count); uint32_t removed_version_count = 0; + uint32_t halted_version_count = ts_stack_halted_version_count(self->stack); for (uint32_t i = 0; i < pop.size; i++) { StackSlice slice = pop.contents[i]; StackVersion slice_version = slice.version - removed_version_count; @@ -957,11 +958,12 @@ static StackVersion ts_parser__reduce( // will all be sorted and truncated at the end of the outer parsing loop. // Allow the maximum version count to be temporarily exceeded, but only // by a limited threshold. - if (slice_version > MAX_VERSION_COUNT + MAX_VERSION_COUNT_OVERFLOW) { + if (slice_version > MAX_VERSION_COUNT + MAX_VERSION_COUNT_OVERFLOW + halted_version_count) { ts_stack_remove_version(self->stack, slice_version); ts_subtree_array_delete(&self->tree_pool, &slice.subtrees); removed_version_count++; while (i + 1 < pop.size) { + LOG("aborting reduce with too many versions") StackSlice next_slice = pop.contents[i + 1]; if (next_slice.version != slice.version) break; ts_subtree_array_delete(&self->tree_pool, &next_slice.subtrees); @@ -1318,10 +1320,23 @@ static void ts_parser__recover( // and subsequently halted. Remove those versions. for (unsigned i = previous_version_count; i < ts_stack_version_count(self->stack); i++) { if (!ts_stack_is_active(self->stack, i)) { + LOG("removed paused version:%u", i); ts_stack_remove_version(self->stack, i--); + LOG_STACK(); } } + // If the parser is still in the error state at the end of the file, just wrap everything + // in an ERROR node and terminate. + if (ts_subtree_is_eof(lookahead)) { + LOG("recover_eof"); + SubtreeArray children = array_new(); + Subtree parent = ts_subtree_new_error_node(&children, false, self->language); + ts_stack_push(self->stack, version, parent, false, 1); + ts_parser__accept(self, version, lookahead); + return; + } + // If strategy 1 succeeded, a new stack version will have been created which is able to handle // the current lookahead token. Now, in addition, try strategy 2 described above: skip the // current lookahead token by wrapping it in an ERROR node. @@ -1342,17 +1357,6 @@ static void ts_parser__recover( return; } - // If the parser is still in the error state at the end of the file, just wrap everything - // in an ERROR node and terminate. - if (ts_subtree_is_eof(lookahead)) { - LOG("recover_eof"); - SubtreeArray children = array_new(); - Subtree parent = ts_subtree_new_error_node(&children, false, self->language); - ts_stack_push(self->stack, version, parent, false, 1); - ts_parser__accept(self, version, lookahead); - return; - } - // Do not recover if the result would clearly be worse than some existing stack version. unsigned new_cost = current_error_cost + ERROR_COST_PER_SKIPPED_TREE + @@ -1618,6 +1622,7 @@ static bool ts_parser__advance( // an ambiguous state. REDUCE actions always create a new stack // version, whereas SHIFT actions update the existing stack version // and terminate this loop. + bool did_reduce = false; StackVersion last_reduction_version = STACK_VERSION_NONE; for (uint32_t i = 0; i < table_entry.action_count; i++) { TSParseAction action = table_entry.actions[i]; @@ -1653,6 +1658,7 @@ static bool ts_parser__advance( action.reduce.dynamic_precedence, action.reduce.production_id, is_fragile, end_of_non_terminal_extra ); + did_reduce = true; if (reduction_version != STACK_VERSION_NONE) { last_reduction_version = reduction_version; } @@ -1704,9 +1710,12 @@ static bool ts_parser__advance( continue; } - // A non-terminal extra rule was reduced and merged into an existing - // stack version. This version can be discarded. - if (!lookahead.ptr) { + // A reduction was performed, but was merged into an existing stack version. + // This version can be discarded. + if (did_reduce) { + if (lookahead.ptr) { + ts_subtree_release(&self->tree_pool, lookahead); + } ts_stack_halt(self->stack, version); return true; } @@ -1755,7 +1764,7 @@ static bool ts_parser__advance( // versions that exist. If some other version advances successfully, then // this version can simply be removed. But if all versions end up paused, // then error recovery is needed. - LOG("detect_error"); + LOG("detect_error lookahead:%s", TREE_NAME(lookahead)); ts_stack_pause(self->stack, version, lookahead); return true; } @@ -1844,6 +1853,7 @@ static unsigned ts_parser__condense_stack(TSParser *self) { has_unpaused_version = true; } else { ts_stack_remove_version(self->stack, i); + made_changes = true; i--; n--; } diff --git a/lib/src/stack.c b/lib/src/stack.c index f0d57108..453d9845 100644 --- a/lib/src/stack.c +++ b/lib/src/stack.c @@ -460,6 +460,17 @@ uint32_t ts_stack_version_count(const Stack *self) { return self->heads.size; } +uint32_t ts_stack_halted_version_count(Stack *self) { + uint32_t count = 0; + for (uint32_t i = 0; i < self->heads.size; i++) { + StackHead *head = array_get(&self->heads, i); + if (head->status == StackStatusHalted) { + count++; + } + } + return count; +} + TSStateId ts_stack_state(const Stack *self, StackVersion version) { return array_get(&self->heads, version)->node->state; } @@ -524,6 +535,7 @@ StackSliceArray ts_stack_pop_count(Stack *self, StackVersion version, uint32_t c return stack__iter(self, version, pop_count_callback, &count, (int)count); } + forceinline StackAction pop_pending_callback(void *payload, const StackIterator *iterator) { (void)payload; if (iterator->subtree_count >= 1) { diff --git a/lib/src/stack.h b/lib/src/stack.h index ac32234f..2619f1e8 100644 --- a/lib/src/stack.h +++ b/lib/src/stack.h @@ -36,6 +36,9 @@ void ts_stack_delete(Stack *self); // Get the stack's current number of versions. uint32_t ts_stack_version_count(const Stack *self); +// Get the stack's current number of halted versions. +uint32_t ts_stack_halted_version_count(Stack *self); + // Get the state at the top of the given version of the stack. If the stack is // empty, this returns the initial state, 0. TSStateId ts_stack_state(const Stack *self, StackVersion version); diff --git a/lib/src/tree_cursor.c b/lib/src/tree_cursor.c index 888e7781..81e17ca7 100644 --- a/lib/src/tree_cursor.c +++ b/lib/src/tree_cursor.c @@ -304,8 +304,9 @@ int64_t ts_tree_cursor_goto_first_child_for_point(TSTreeCursor *self, TSPoint go } TreeCursorStep ts_tree_cursor_goto_sibling_internal( - TSTreeCursor *_self, - bool (*advance)(CursorChildIterator *, TreeCursorEntry *, bool *)) { + TSTreeCursor *_self, + bool (*advance)(CursorChildIterator *, TreeCursorEntry *, bool *) +) { TreeCursor *self = (TreeCursor *)_self; uint32_t initial_size = self->stack.size; diff --git a/xtask/src/main.rs b/xtask/src/main.rs index f3b67a79..ec8bc927 100644 --- a/xtask/src/main.rs +++ b/xtask/src/main.rs @@ -144,7 +144,7 @@ struct Test { iterations: Option, /// Set the seed used to control random behavior. #[arg(long, short)] - seed: Option, + seed: Option, /// Print parsing log to stderr. #[arg(long, short)] debug: bool,