diff --git a/cli/src/fuzz/mod.rs b/cli/src/fuzz/mod.rs index 85e219cc..efa07d3b 100644 --- a/cli/src/fuzz/mod.rs +++ b/cli/src/fuzz/mod.rs @@ -56,7 +56,9 @@ fn regex_env_var(name: &'static str) -> Option { pub fn new_seed() -> usize { int_env_var("TREE_SITTER_SEED").unwrap_or_else(|| { let mut rng = rand::thread_rng(); - rng.gen::() + let seed = rng.gen::(); + eprintln!("Seed: {seed}"); + seed }) } diff --git a/cli/src/tests/corpus_test.rs b/cli/src/tests/corpus_test.rs index 750bf442..9b0ea8ba 100644 --- a/cli/src/tests/corpus_test.rs +++ b/cli/src/tests/corpus_test.rs @@ -23,7 +23,7 @@ use crate::{ }; #[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)] -fn test_corpus_for_bash(seed: usize) { +fn test_corpus_for_bash_language(seed: usize) { test_language_corpus( "bash", seed, @@ -39,73 +39,77 @@ fn test_corpus_for_bash(seed: usize) { } #[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)] -fn test_corpus_for_c(seed: usize) { +fn test_corpus_for_c_language(seed: usize) { test_language_corpus("c", seed, None, None); } #[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)] -fn test_corpus_for_cpp(seed: usize) { +fn test_corpus_for_cpp_language(seed: usize) { test_language_corpus("cpp", seed, None, None); } #[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)] -fn test_corpus_for_embedded_template(seed: usize) { +fn test_corpus_for_embedded_template_language(seed: usize) { test_language_corpus("embedded-template", seed, None, None); } #[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)] -fn test_corpus_for_go(seed: usize) { +fn test_corpus_for_go_language(seed: usize) { test_language_corpus("go", seed, None, None); } #[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)] -fn test_corpus_for_html(seed: usize) { +fn test_corpus_for_html_language(seed: usize) { test_language_corpus("html", seed, None, None); } #[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)] -fn test_corpus_for_java(seed: usize) { - test_language_corpus("java", seed, None, None); +fn test_corpus_for_java_language(seed: usize) { + test_language_corpus( + "java", + seed, + Some(&["java - corpus - expressions - switch with unnamed pattern variable"]), + None, + ); } #[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)] -fn test_corpus_for_javascript(seed: usize) { +fn test_corpus_for_javascript_language(seed: usize) { test_language_corpus("javascript", seed, None, None); } #[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)] -fn test_corpus_for_json(seed: usize) { +fn test_corpus_for_json_language(seed: usize) { test_language_corpus("json", seed, None, None); } -#[ignore] #[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)] -fn test_corpus_for_php(seed: usize) { - test_language_corpus("php", seed, None, None); +fn test_corpus_for_php_language(seed: usize) { + test_language_corpus("php", seed, None, Some("php")); } #[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)] -fn test_corpus_for_python(seed: usize) { +fn test_corpus_for_python_language(seed: usize) { test_language_corpus("python", seed, None, None); } #[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)] -fn test_corpus_for_ruby(seed: usize) { +fn test_corpus_for_ruby_language(seed: usize) { test_language_corpus("ruby", seed, None, None); } #[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)] -fn test_corpus_for_rust(seed: usize) { +fn test_corpus_for_rust_language(seed: usize) { test_language_corpus("rust", seed, None, None); } #[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)] -fn test_corpus_for_typescript(seed: usize) { +fn test_corpus_for_typescript_language(seed: usize) { test_language_corpus("typescript", seed, None, Some("typescript")); } #[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)] -fn test_corpus_for_tsx(seed: usize) { +fn test_corpus_for_tsx_language(seed: usize) { test_language_corpus("typescript", seed, None, Some("tsx")); } diff --git a/lib/src/get_changed_ranges.c b/lib/src/get_changed_ranges.c index 8ca5bab3..b8130a12 100644 --- a/lib/src/get_changed_ranges.c +++ b/lib/src/get_changed_ranges.c @@ -108,6 +108,7 @@ typedef struct { const TSLanguage *language; unsigned visible_depth; bool in_padding; + Subtree prev_external_token; } Iterator; static Iterator iterator_new( @@ -127,6 +128,7 @@ static Iterator iterator_new( .language = language, .visible_depth = 1, .in_padding = false, + .prev_external_token = NULL_SUBTREE, }; } @@ -244,6 +246,10 @@ static bool iterator_descend(Iterator *self, uint32_t goal_position) { position = child_right; if (!ts_subtree_extra(*child)) structural_child_index++; + Subtree last_external_token = ts_subtree_last_external_token(*child); + if (last_external_token.ptr) { + self->prev_external_token = last_external_token; + } } } while (did_descend); @@ -268,6 +274,10 @@ static void iterator_advance(Iterator *self) { const Subtree *parent = array_back(&self->cursor.stack)->subtree; uint32_t child_index = entry.child_index + 1; + Subtree last_external_token = ts_subtree_last_external_token(*entry.subtree); + if (last_external_token.ptr) { + self->prev_external_token = last_external_token; + } if (ts_subtree_child_count(*parent) > child_index) { Length position = length_add(entry.position, ts_subtree_total_size(*entry.subtree)); uint32_t structural_child_index = entry.structural_child_index; @@ -313,29 +323,41 @@ static IteratorComparison iterator_compare( TSSymbol new_alias_symbol = 0; iterator_get_visible_state(old_iter, &old_tree, &old_alias_symbol, &old_start); iterator_get_visible_state(new_iter, &new_tree, &new_alias_symbol, &new_start); + TSSymbol old_symbol = ts_subtree_symbol(old_tree); + TSSymbol new_symbol = ts_subtree_symbol(new_tree); if (!old_tree.ptr && !new_tree.ptr) return IteratorMatches; if (!old_tree.ptr || !new_tree.ptr) return IteratorDiffers; + if (old_alias_symbol != new_alias_symbol || old_symbol != new_symbol) return IteratorDiffers; + + uint32_t old_size = ts_subtree_size(old_tree).bytes; + uint32_t new_size = ts_subtree_size(new_tree).bytes; + TSStateId old_state = ts_subtree_parse_state(old_tree); + TSStateId new_state = ts_subtree_parse_state(new_tree); + bool old_has_external_tokens = ts_subtree_has_external_tokens(old_tree); + bool new_has_external_tokens = ts_subtree_has_external_tokens(new_tree); + uint32_t old_error_cost = ts_subtree_error_cost(old_tree); + uint32_t new_error_cost = ts_subtree_error_cost(new_tree); if ( - old_alias_symbol == new_alias_symbol && - ts_subtree_symbol(old_tree) == ts_subtree_symbol(new_tree) + old_start != new_start || + old_symbol == ts_builtin_sym_error || + old_size != new_size || + old_state == TS_TREE_STATE_NONE || + new_state == TS_TREE_STATE_NONE || + ((old_state == ERROR_STATE) != (new_state == ERROR_STATE)) || + old_error_cost != new_error_cost || + old_has_external_tokens != new_has_external_tokens || + ts_subtree_has_changes(old_tree) || + ( + old_has_external_tokens && + !ts_subtree_external_scanner_state_eq(old_iter->prev_external_token, new_iter->prev_external_token) + ) ) { - if (old_start == new_start && - !ts_subtree_has_changes(old_tree) && - ts_subtree_symbol(old_tree) != ts_builtin_sym_error && - ts_subtree_size(old_tree).bytes == ts_subtree_size(new_tree).bytes && - ts_subtree_parse_state(old_tree) != TS_TREE_STATE_NONE && - ts_subtree_parse_state(new_tree) != TS_TREE_STATE_NONE && - (ts_subtree_parse_state(old_tree) == ERROR_STATE) == - (ts_subtree_parse_state(new_tree) == ERROR_STATE)) { - return IteratorMatches; - } else { - return IteratorMayDiffer; - } + return IteratorMayDiffer; } - return IteratorDiffers; + return IteratorMatches; } #ifdef DEBUG_GET_CHANGED_RANGES @@ -348,8 +370,8 @@ static inline void iterator_print_state(Iterator *self) { "(%-25s %s\t depth:%u [%u, %u] - [%u, %u])", name, self->in_padding ? "(p)" : " ", self->visible_depth, - start.row + 1, start.column, - end.row + 1, end.column + start.row, start.column, + end.row, end.column ); } #endif @@ -380,7 +402,7 @@ unsigned ts_subtree_get_changed_ranges( do { #ifdef DEBUG_GET_CHANGED_RANGES - printf("At [%-2u, %-2u] Compare ", position.extent.row + 1, position.extent.column); + printf("At [%-2u, %-2u] Compare ", position.extent.row, position.extent.column); iterator_print_state(&old_iter); printf("\tvs\t"); iterator_print_state(&new_iter); diff --git a/lib/src/parser.c b/lib/src/parser.c index a3d68592..adcdf8d0 100644 --- a/lib/src/parser.c +++ b/lib/src/parser.c @@ -949,6 +949,7 @@ static StackVersion ts_parser__reduce( // children. StackSliceArray pop = ts_stack_pop_count(self->stack, version, count); uint32_t removed_version_count = 0; + uint32_t halted_version_count = ts_stack_halted_version_count(self->stack); for (uint32_t i = 0; i < pop.size; i++) { StackSlice slice = pop.contents[i]; StackVersion slice_version = slice.version - removed_version_count; @@ -957,11 +958,12 @@ static StackVersion ts_parser__reduce( // will all be sorted and truncated at the end of the outer parsing loop. // Allow the maximum version count to be temporarily exceeded, but only // by a limited threshold. - if (slice_version > MAX_VERSION_COUNT + MAX_VERSION_COUNT_OVERFLOW) { + if (slice_version > MAX_VERSION_COUNT + MAX_VERSION_COUNT_OVERFLOW + halted_version_count) { ts_stack_remove_version(self->stack, slice_version); ts_subtree_array_delete(&self->tree_pool, &slice.subtrees); removed_version_count++; while (i + 1 < pop.size) { + LOG("aborting reduce with too many versions") StackSlice next_slice = pop.contents[i + 1]; if (next_slice.version != slice.version) break; ts_subtree_array_delete(&self->tree_pool, &next_slice.subtrees); @@ -1318,10 +1320,23 @@ static void ts_parser__recover( // and subsequently halted. Remove those versions. for (unsigned i = previous_version_count; i < ts_stack_version_count(self->stack); i++) { if (!ts_stack_is_active(self->stack, i)) { + LOG("removed paused version:%u", i); ts_stack_remove_version(self->stack, i--); + LOG_STACK(); } } + // If the parser is still in the error state at the end of the file, just wrap everything + // in an ERROR node and terminate. + if (ts_subtree_is_eof(lookahead)) { + LOG("recover_eof"); + SubtreeArray children = array_new(); + Subtree parent = ts_subtree_new_error_node(&children, false, self->language); + ts_stack_push(self->stack, version, parent, false, 1); + ts_parser__accept(self, version, lookahead); + return; + } + // If strategy 1 succeeded, a new stack version will have been created which is able to handle // the current lookahead token. Now, in addition, try strategy 2 described above: skip the // current lookahead token by wrapping it in an ERROR node. @@ -1342,17 +1357,6 @@ static void ts_parser__recover( return; } - // If the parser is still in the error state at the end of the file, just wrap everything - // in an ERROR node and terminate. - if (ts_subtree_is_eof(lookahead)) { - LOG("recover_eof"); - SubtreeArray children = array_new(); - Subtree parent = ts_subtree_new_error_node(&children, false, self->language); - ts_stack_push(self->stack, version, parent, false, 1); - ts_parser__accept(self, version, lookahead); - return; - } - // Do not recover if the result would clearly be worse than some existing stack version. unsigned new_cost = current_error_cost + ERROR_COST_PER_SKIPPED_TREE + @@ -1618,6 +1622,7 @@ static bool ts_parser__advance( // an ambiguous state. REDUCE actions always create a new stack // version, whereas SHIFT actions update the existing stack version // and terminate this loop. + bool did_reduce = false; StackVersion last_reduction_version = STACK_VERSION_NONE; for (uint32_t i = 0; i < table_entry.action_count; i++) { TSParseAction action = table_entry.actions[i]; @@ -1653,6 +1658,7 @@ static bool ts_parser__advance( action.reduce.dynamic_precedence, action.reduce.production_id, is_fragile, end_of_non_terminal_extra ); + did_reduce = true; if (reduction_version != STACK_VERSION_NONE) { last_reduction_version = reduction_version; } @@ -1704,9 +1710,12 @@ static bool ts_parser__advance( continue; } - // A non-terminal extra rule was reduced and merged into an existing - // stack version. This version can be discarded. - if (!lookahead.ptr) { + // A reduction was performed, but was merged into an existing stack version. + // This version can be discarded. + if (did_reduce) { + if (lookahead.ptr) { + ts_subtree_release(&self->tree_pool, lookahead); + } ts_stack_halt(self->stack, version); return true; } @@ -1755,7 +1764,7 @@ static bool ts_parser__advance( // versions that exist. If some other version advances successfully, then // this version can simply be removed. But if all versions end up paused, // then error recovery is needed. - LOG("detect_error"); + LOG("detect_error lookahead:%s", TREE_NAME(lookahead)); ts_stack_pause(self->stack, version, lookahead); return true; } @@ -1844,6 +1853,7 @@ static unsigned ts_parser__condense_stack(TSParser *self) { has_unpaused_version = true; } else { ts_stack_remove_version(self->stack, i); + made_changes = true; i--; n--; } diff --git a/lib/src/stack.c b/lib/src/stack.c index f0d57108..453d9845 100644 --- a/lib/src/stack.c +++ b/lib/src/stack.c @@ -460,6 +460,17 @@ uint32_t ts_stack_version_count(const Stack *self) { return self->heads.size; } +uint32_t ts_stack_halted_version_count(Stack *self) { + uint32_t count = 0; + for (uint32_t i = 0; i < self->heads.size; i++) { + StackHead *head = array_get(&self->heads, i); + if (head->status == StackStatusHalted) { + count++; + } + } + return count; +} + TSStateId ts_stack_state(const Stack *self, StackVersion version) { return array_get(&self->heads, version)->node->state; } @@ -524,6 +535,7 @@ StackSliceArray ts_stack_pop_count(Stack *self, StackVersion version, uint32_t c return stack__iter(self, version, pop_count_callback, &count, (int)count); } + forceinline StackAction pop_pending_callback(void *payload, const StackIterator *iterator) { (void)payload; if (iterator->subtree_count >= 1) { diff --git a/lib/src/stack.h b/lib/src/stack.h index ac32234f..2619f1e8 100644 --- a/lib/src/stack.h +++ b/lib/src/stack.h @@ -36,6 +36,9 @@ void ts_stack_delete(Stack *self); // Get the stack's current number of versions. uint32_t ts_stack_version_count(const Stack *self); +// Get the stack's current number of halted versions. +uint32_t ts_stack_halted_version_count(Stack *self); + // Get the state at the top of the given version of the stack. If the stack is // empty, this returns the initial state, 0. TSStateId ts_stack_state(const Stack *self, StackVersion version); diff --git a/lib/src/tree_cursor.c b/lib/src/tree_cursor.c index 888e7781..81e17ca7 100644 --- a/lib/src/tree_cursor.c +++ b/lib/src/tree_cursor.c @@ -304,8 +304,9 @@ int64_t ts_tree_cursor_goto_first_child_for_point(TSTreeCursor *self, TSPoint go } TreeCursorStep ts_tree_cursor_goto_sibling_internal( - TSTreeCursor *_self, - bool (*advance)(CursorChildIterator *, TreeCursorEntry *, bool *)) { + TSTreeCursor *_self, + bool (*advance)(CursorChildIterator *, TreeCursorEntry *, bool *) +) { TreeCursor *self = (TreeCursor *)_self; uint32_t initial_size = self->stack.size; diff --git a/xtask/src/main.rs b/xtask/src/main.rs index f3b67a79..ec8bc927 100644 --- a/xtask/src/main.rs +++ b/xtask/src/main.rs @@ -144,7 +144,7 @@ struct Test { iterations: Option, /// Set the seed used to control random behavior. #[arg(long, short)] - seed: Option, + seed: Option, /// Print parsing log to stderr. #[arg(long, short)] debug: bool,