Fix cases where error recovery could infinite loop (#4257)

* Rename corpus test functions to allow easy filtering by language

* Use usize for seed argument

* Avoid retaining useless stack versions when reductions merge

We found this problem when debugging an infinite loop that happened
during error recovery when using the Zig grammar. The large number of
unnecessary paused stack versions were preventing the correct recovery
strategy from being tried.

* Fix leaked lookahead token when reduction results in a merged stack

* Enable running PHP tests in CI

* Fix possible infinite loop during error recovery at EOF

* Account for external scanner state changes when detecting changed ranges in subtrees
This commit is contained in:
Max Brunsfeld 2025-03-04 13:50:56 -08:00 committed by GitHub
parent 8138dba800
commit 066fd77d39
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 110 additions and 56 deletions

View file

@ -56,7 +56,9 @@ fn regex_env_var(name: &'static str) -> Option<Regex> {
pub fn new_seed() -> usize {
int_env_var("TREE_SITTER_SEED").unwrap_or_else(|| {
let mut rng = rand::thread_rng();
rng.gen::<usize>()
let seed = rng.gen::<usize>();
eprintln!("Seed: {seed}");
seed
})
}

View file

@ -23,7 +23,7 @@ use crate::{
};
#[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)]
fn test_corpus_for_bash(seed: usize) {
fn test_corpus_for_bash_language(seed: usize) {
test_language_corpus(
"bash",
seed,
@ -39,73 +39,77 @@ fn test_corpus_for_bash(seed: usize) {
}
#[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)]
fn test_corpus_for_c(seed: usize) {
fn test_corpus_for_c_language(seed: usize) {
test_language_corpus("c", seed, None, None);
}
#[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)]
fn test_corpus_for_cpp(seed: usize) {
fn test_corpus_for_cpp_language(seed: usize) {
test_language_corpus("cpp", seed, None, None);
}
#[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)]
fn test_corpus_for_embedded_template(seed: usize) {
fn test_corpus_for_embedded_template_language(seed: usize) {
test_language_corpus("embedded-template", seed, None, None);
}
#[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)]
fn test_corpus_for_go(seed: usize) {
fn test_corpus_for_go_language(seed: usize) {
test_language_corpus("go", seed, None, None);
}
#[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)]
fn test_corpus_for_html(seed: usize) {
fn test_corpus_for_html_language(seed: usize) {
test_language_corpus("html", seed, None, None);
}
#[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)]
fn test_corpus_for_java(seed: usize) {
test_language_corpus("java", seed, None, None);
fn test_corpus_for_java_language(seed: usize) {
test_language_corpus(
"java",
seed,
Some(&["java - corpus - expressions - switch with unnamed pattern variable"]),
None,
);
}
#[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)]
fn test_corpus_for_javascript(seed: usize) {
fn test_corpus_for_javascript_language(seed: usize) {
test_language_corpus("javascript", seed, None, None);
}
#[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)]
fn test_corpus_for_json(seed: usize) {
fn test_corpus_for_json_language(seed: usize) {
test_language_corpus("json", seed, None, None);
}
#[ignore]
#[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)]
fn test_corpus_for_php(seed: usize) {
test_language_corpus("php", seed, None, None);
fn test_corpus_for_php_language(seed: usize) {
test_language_corpus("php", seed, None, Some("php"));
}
#[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)]
fn test_corpus_for_python(seed: usize) {
fn test_corpus_for_python_language(seed: usize) {
test_language_corpus("python", seed, None, None);
}
#[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)]
fn test_corpus_for_ruby(seed: usize) {
fn test_corpus_for_ruby_language(seed: usize) {
test_language_corpus("ruby", seed, None, None);
}
#[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)]
fn test_corpus_for_rust(seed: usize) {
fn test_corpus_for_rust_language(seed: usize) {
test_language_corpus("rust", seed, None, None);
}
#[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)]
fn test_corpus_for_typescript(seed: usize) {
fn test_corpus_for_typescript_language(seed: usize) {
test_language_corpus("typescript", seed, None, Some("typescript"));
}
#[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)]
fn test_corpus_for_tsx(seed: usize) {
fn test_corpus_for_tsx_language(seed: usize) {
test_language_corpus("typescript", seed, None, Some("tsx"));
}

View file

@ -108,6 +108,7 @@ typedef struct {
const TSLanguage *language;
unsigned visible_depth;
bool in_padding;
Subtree prev_external_token;
} Iterator;
static Iterator iterator_new(
@ -127,6 +128,7 @@ static Iterator iterator_new(
.language = language,
.visible_depth = 1,
.in_padding = false,
.prev_external_token = NULL_SUBTREE,
};
}
@ -244,6 +246,10 @@ static bool iterator_descend(Iterator *self, uint32_t goal_position) {
position = child_right;
if (!ts_subtree_extra(*child)) structural_child_index++;
Subtree last_external_token = ts_subtree_last_external_token(*child);
if (last_external_token.ptr) {
self->prev_external_token = last_external_token;
}
}
} while (did_descend);
@ -268,6 +274,10 @@ static void iterator_advance(Iterator *self) {
const Subtree *parent = array_back(&self->cursor.stack)->subtree;
uint32_t child_index = entry.child_index + 1;
Subtree last_external_token = ts_subtree_last_external_token(*entry.subtree);
if (last_external_token.ptr) {
self->prev_external_token = last_external_token;
}
if (ts_subtree_child_count(*parent) > child_index) {
Length position = length_add(entry.position, ts_subtree_total_size(*entry.subtree));
uint32_t structural_child_index = entry.structural_child_index;
@ -313,29 +323,41 @@ static IteratorComparison iterator_compare(
TSSymbol new_alias_symbol = 0;
iterator_get_visible_state(old_iter, &old_tree, &old_alias_symbol, &old_start);
iterator_get_visible_state(new_iter, &new_tree, &new_alias_symbol, &new_start);
TSSymbol old_symbol = ts_subtree_symbol(old_tree);
TSSymbol new_symbol = ts_subtree_symbol(new_tree);
if (!old_tree.ptr && !new_tree.ptr) return IteratorMatches;
if (!old_tree.ptr || !new_tree.ptr) return IteratorDiffers;
if (old_alias_symbol != new_alias_symbol || old_symbol != new_symbol) return IteratorDiffers;
uint32_t old_size = ts_subtree_size(old_tree).bytes;
uint32_t new_size = ts_subtree_size(new_tree).bytes;
TSStateId old_state = ts_subtree_parse_state(old_tree);
TSStateId new_state = ts_subtree_parse_state(new_tree);
bool old_has_external_tokens = ts_subtree_has_external_tokens(old_tree);
bool new_has_external_tokens = ts_subtree_has_external_tokens(new_tree);
uint32_t old_error_cost = ts_subtree_error_cost(old_tree);
uint32_t new_error_cost = ts_subtree_error_cost(new_tree);
if (
old_alias_symbol == new_alias_symbol &&
ts_subtree_symbol(old_tree) == ts_subtree_symbol(new_tree)
old_start != new_start ||
old_symbol == ts_builtin_sym_error ||
old_size != new_size ||
old_state == TS_TREE_STATE_NONE ||
new_state == TS_TREE_STATE_NONE ||
((old_state == ERROR_STATE) != (new_state == ERROR_STATE)) ||
old_error_cost != new_error_cost ||
old_has_external_tokens != new_has_external_tokens ||
ts_subtree_has_changes(old_tree) ||
(
old_has_external_tokens &&
!ts_subtree_external_scanner_state_eq(old_iter->prev_external_token, new_iter->prev_external_token)
)
) {
if (old_start == new_start &&
!ts_subtree_has_changes(old_tree) &&
ts_subtree_symbol(old_tree) != ts_builtin_sym_error &&
ts_subtree_size(old_tree).bytes == ts_subtree_size(new_tree).bytes &&
ts_subtree_parse_state(old_tree) != TS_TREE_STATE_NONE &&
ts_subtree_parse_state(new_tree) != TS_TREE_STATE_NONE &&
(ts_subtree_parse_state(old_tree) == ERROR_STATE) ==
(ts_subtree_parse_state(new_tree) == ERROR_STATE)) {
return IteratorMatches;
} else {
return IteratorMayDiffer;
}
return IteratorMayDiffer;
}
return IteratorDiffers;
return IteratorMatches;
}
#ifdef DEBUG_GET_CHANGED_RANGES
@ -348,8 +370,8 @@ static inline void iterator_print_state(Iterator *self) {
"(%-25s %s\t depth:%u [%u, %u] - [%u, %u])",
name, self->in_padding ? "(p)" : " ",
self->visible_depth,
start.row + 1, start.column,
end.row + 1, end.column
start.row, start.column,
end.row, end.column
);
}
#endif
@ -380,7 +402,7 @@ unsigned ts_subtree_get_changed_ranges(
do {
#ifdef DEBUG_GET_CHANGED_RANGES
printf("At [%-2u, %-2u] Compare ", position.extent.row + 1, position.extent.column);
printf("At [%-2u, %-2u] Compare ", position.extent.row, position.extent.column);
iterator_print_state(&old_iter);
printf("\tvs\t");
iterator_print_state(&new_iter);

View file

@ -949,6 +949,7 @@ static StackVersion ts_parser__reduce(
// children.
StackSliceArray pop = ts_stack_pop_count(self->stack, version, count);
uint32_t removed_version_count = 0;
uint32_t halted_version_count = ts_stack_halted_version_count(self->stack);
for (uint32_t i = 0; i < pop.size; i++) {
StackSlice slice = pop.contents[i];
StackVersion slice_version = slice.version - removed_version_count;
@ -957,11 +958,12 @@ static StackVersion ts_parser__reduce(
// will all be sorted and truncated at the end of the outer parsing loop.
// Allow the maximum version count to be temporarily exceeded, but only
// by a limited threshold.
if (slice_version > MAX_VERSION_COUNT + MAX_VERSION_COUNT_OVERFLOW) {
if (slice_version > MAX_VERSION_COUNT + MAX_VERSION_COUNT_OVERFLOW + halted_version_count) {
ts_stack_remove_version(self->stack, slice_version);
ts_subtree_array_delete(&self->tree_pool, &slice.subtrees);
removed_version_count++;
while (i + 1 < pop.size) {
LOG("aborting reduce with too many versions")
StackSlice next_slice = pop.contents[i + 1];
if (next_slice.version != slice.version) break;
ts_subtree_array_delete(&self->tree_pool, &next_slice.subtrees);
@ -1318,10 +1320,23 @@ static void ts_parser__recover(
// and subsequently halted. Remove those versions.
for (unsigned i = previous_version_count; i < ts_stack_version_count(self->stack); i++) {
if (!ts_stack_is_active(self->stack, i)) {
LOG("removed paused version:%u", i);
ts_stack_remove_version(self->stack, i--);
LOG_STACK();
}
}
// If the parser is still in the error state at the end of the file, just wrap everything
// in an ERROR node and terminate.
if (ts_subtree_is_eof(lookahead)) {
LOG("recover_eof");
SubtreeArray children = array_new();
Subtree parent = ts_subtree_new_error_node(&children, false, self->language);
ts_stack_push(self->stack, version, parent, false, 1);
ts_parser__accept(self, version, lookahead);
return;
}
// If strategy 1 succeeded, a new stack version will have been created which is able to handle
// the current lookahead token. Now, in addition, try strategy 2 described above: skip the
// current lookahead token by wrapping it in an ERROR node.
@ -1342,17 +1357,6 @@ static void ts_parser__recover(
return;
}
// If the parser is still in the error state at the end of the file, just wrap everything
// in an ERROR node and terminate.
if (ts_subtree_is_eof(lookahead)) {
LOG("recover_eof");
SubtreeArray children = array_new();
Subtree parent = ts_subtree_new_error_node(&children, false, self->language);
ts_stack_push(self->stack, version, parent, false, 1);
ts_parser__accept(self, version, lookahead);
return;
}
// Do not recover if the result would clearly be worse than some existing stack version.
unsigned new_cost =
current_error_cost + ERROR_COST_PER_SKIPPED_TREE +
@ -1618,6 +1622,7 @@ static bool ts_parser__advance(
// an ambiguous state. REDUCE actions always create a new stack
// version, whereas SHIFT actions update the existing stack version
// and terminate this loop.
bool did_reduce = false;
StackVersion last_reduction_version = STACK_VERSION_NONE;
for (uint32_t i = 0; i < table_entry.action_count; i++) {
TSParseAction action = table_entry.actions[i];
@ -1653,6 +1658,7 @@ static bool ts_parser__advance(
action.reduce.dynamic_precedence, action.reduce.production_id,
is_fragile, end_of_non_terminal_extra
);
did_reduce = true;
if (reduction_version != STACK_VERSION_NONE) {
last_reduction_version = reduction_version;
}
@ -1704,9 +1710,12 @@ static bool ts_parser__advance(
continue;
}
// A non-terminal extra rule was reduced and merged into an existing
// stack version. This version can be discarded.
if (!lookahead.ptr) {
// A reduction was performed, but was merged into an existing stack version.
// This version can be discarded.
if (did_reduce) {
if (lookahead.ptr) {
ts_subtree_release(&self->tree_pool, lookahead);
}
ts_stack_halt(self->stack, version);
return true;
}
@ -1755,7 +1764,7 @@ static bool ts_parser__advance(
// versions that exist. If some other version advances successfully, then
// this version can simply be removed. But if all versions end up paused,
// then error recovery is needed.
LOG("detect_error");
LOG("detect_error lookahead:%s", TREE_NAME(lookahead));
ts_stack_pause(self->stack, version, lookahead);
return true;
}
@ -1844,6 +1853,7 @@ static unsigned ts_parser__condense_stack(TSParser *self) {
has_unpaused_version = true;
} else {
ts_stack_remove_version(self->stack, i);
made_changes = true;
i--;
n--;
}

View file

@ -460,6 +460,17 @@ uint32_t ts_stack_version_count(const Stack *self) {
return self->heads.size;
}
uint32_t ts_stack_halted_version_count(Stack *self) {
uint32_t count = 0;
for (uint32_t i = 0; i < self->heads.size; i++) {
StackHead *head = array_get(&self->heads, i);
if (head->status == StackStatusHalted) {
count++;
}
}
return count;
}
TSStateId ts_stack_state(const Stack *self, StackVersion version) {
return array_get(&self->heads, version)->node->state;
}
@ -524,6 +535,7 @@ StackSliceArray ts_stack_pop_count(Stack *self, StackVersion version, uint32_t c
return stack__iter(self, version, pop_count_callback, &count, (int)count);
}
forceinline StackAction pop_pending_callback(void *payload, const StackIterator *iterator) {
(void)payload;
if (iterator->subtree_count >= 1) {

View file

@ -36,6 +36,9 @@ void ts_stack_delete(Stack *self);
// Get the stack's current number of versions.
uint32_t ts_stack_version_count(const Stack *self);
// Get the stack's current number of halted versions.
uint32_t ts_stack_halted_version_count(Stack *self);
// Get the state at the top of the given version of the stack. If the stack is
// empty, this returns the initial state, 0.
TSStateId ts_stack_state(const Stack *self, StackVersion version);

View file

@ -304,8 +304,9 @@ int64_t ts_tree_cursor_goto_first_child_for_point(TSTreeCursor *self, TSPoint go
}
TreeCursorStep ts_tree_cursor_goto_sibling_internal(
TSTreeCursor *_self,
bool (*advance)(CursorChildIterator *, TreeCursorEntry *, bool *)) {
TSTreeCursor *_self,
bool (*advance)(CursorChildIterator *, TreeCursorEntry *, bool *)
) {
TreeCursor *self = (TreeCursor *)_self;
uint32_t initial_size = self->stack.size;

View file

@ -144,7 +144,7 @@ struct Test {
iterations: Option<u32>,
/// Set the seed used to control random behavior.
#[arg(long, short)]
seed: Option<u32>,
seed: Option<usize>,
/// Print parsing log to stderr.
#[arg(long, short)]
debug: bool,