From ff2436a6f8639b290e4395ca2b44491472647a2b Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 14 Feb 2023 14:41:25 -0800 Subject: [PATCH 1/9] Add --row-range, --quiet, and --time flags to query subcommand --- cli/src/main.rs | 30 ++++++++++++++++--- cli/src/query.rs | 77 +++++++++++++++++++++++++++++------------------- 2 files changed, 73 insertions(+), 34 deletions(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index 47e7597b..fb2a6327 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -3,6 +3,7 @@ use clap::{App, AppSettings, Arg, SubCommand}; use glob::glob; use std::path::{Path, PathBuf}; use std::{env, fs, u64}; +use tree_sitter::Point; use tree_sitter_cli::parse::ParseOutput; use tree_sitter_cli::{ generate, highlight, logger, parse, playground, query, tags, test, test_highlight, test_tags, @@ -173,6 +174,8 @@ fn run() -> Result<()> { .index(1) .required(true), ) + .arg(&time_arg) + .arg(&quiet_arg) .arg(&paths_file_arg) .arg(&paths_arg.clone().index(2)) .arg( @@ -181,6 +184,12 @@ fn run() -> Result<()> { .long("byte-range") .takes_value(true), ) + .arg( + Arg::with_name("row-range") + .help("The range of rows in which the query will be executed") + .long("row-range") + .takes_value(true), + ) .arg(&scope_arg) .arg(Arg::with_name("captures").long("captures").short("c")) .arg(Arg::with_name("test").long("test")), @@ -456,6 +465,8 @@ fn run() -> Result<()> { ("query", Some(matches)) => { let ordered_captures = matches.values_of("captures").is_some(); + let quiet = matches.values_of("quiet").is_some(); + let time = matches.values_of("time").is_some(); let paths = collect_paths(matches.value_of("paths-file"), matches.values_of("paths"))?; let loader_config = config.get()?; loader.find_all_languages(&loader_config)?; @@ -465,9 +476,17 @@ fn run() -> Result<()> { matches.value_of("scope"), )?; let query_path = Path::new(matches.value_of("query-path").unwrap()); - let range = matches.value_of("byte-range").map(|br| { - let r: Vec<&str> = 
br.split(":").collect(); - r[0].parse().unwrap()..r[1].parse().unwrap() + let byte_range = matches.value_of("byte-range").and_then(|arg| { + let mut parts = arg.split(":"); + let start = parts.next()?.parse().ok()?; + let end = parts.next().unwrap().parse().ok()?; + Some(start..end) + }); + let point_range = matches.value_of("row-range").and_then(|arg| { + let mut parts = arg.split(":"); + let start = parts.next()?.parse().ok()?; + let end = parts.next().unwrap().parse().ok()?; + Some(Point::new(start, 0)..Point::new(end, 0)) }); let should_test = matches.is_present("test"); query::query_files_at_paths( @@ -475,8 +494,11 @@ fn run() -> Result<()> { paths, query_path, ordered_captures, - range, + byte_range, + point_range, should_test, + quiet, + time, )?; } diff --git a/cli/src/query.rs b/cli/src/query.rs index 73d6dd28..fc24cb05 100644 --- a/cli/src/query.rs +++ b/cli/src/query.rs @@ -5,16 +5,20 @@ use std::{ io::{self, Write}, ops::Range, path::Path, + time::Instant, }; -use tree_sitter::{Language, Parser, Query, QueryCursor}; +use tree_sitter::{Language, Parser, Point, Query, QueryCursor}; pub fn query_files_at_paths( language: Language, paths: Vec, query_path: &Path, ordered_captures: bool, - range: Option>, + byte_range: Option>, + point_range: Option>, should_test: bool, + quiet: bool, + print_time: bool, ) -> Result<()> { let stdout = io::stdout(); let mut stdout = stdout.lock(); @@ -24,9 +28,12 @@ pub fn query_files_at_paths( let query = Query::new(language, &query_source).with_context(|| "Query compilation failed")?; let mut query_cursor = QueryCursor::new(); - if let Some(range) = range { + if let Some(range) = byte_range { query_cursor.set_byte_range(range); } + if let Some(range) = point_range { + query_cursor.set_point_range(range); + } let mut parser = Parser::new(); parser.set_language(language)?; @@ -40,22 +47,25 @@ pub fn query_files_at_paths( fs::read(&path).with_context(|| format!("Error reading source file {:?}", path))?; let tree = 
parser.parse(&source_code, None).unwrap(); + let start = Instant::now(); if ordered_captures { for (mat, capture_index) in query_cursor.captures(&query, tree.root_node(), source_code.as_slice()) { let capture = mat.captures[capture_index]; let capture_name = &query.capture_names()[capture.index as usize]; - writeln!( - &mut stdout, - " pattern: {:>2}, capture: {} - {}, start: {}, end: {}, text: `{}`", - mat.pattern_index, - capture.index, - capture_name, - capture.node.start_position(), - capture.node.end_position(), - capture.node.utf8_text(&source_code).unwrap_or("") - )?; + if !quiet { + writeln!( + &mut stdout, + " pattern: {:>2}, capture: {} - {}, start: {}, end: {}, text: `{}`", + mat.pattern_index, + capture.index, + capture_name, + capture.node.start_position(), + capture.node.end_position(), + capture.node.utf8_text(&source_code).unwrap_or("") + )?; + } results.push(query_testing::CaptureInfo { name: capture_name.to_string(), start: capture.node.start_position(), @@ -64,27 +74,31 @@ pub fn query_files_at_paths( } } else { for m in query_cursor.matches(&query, tree.root_node(), source_code.as_slice()) { - writeln!(&mut stdout, " pattern: {}", m.pattern_index)?; + if !quiet { + writeln!(&mut stdout, " pattern: {}", m.pattern_index)?; + } for capture in m.captures { let start = capture.node.start_position(); let end = capture.node.end_position(); let capture_name = &query.capture_names()[capture.index as usize]; - if end.row == start.row { - writeln!( - &mut stdout, - " capture: {} - {}, start: {}, end: {}, text: `{}`", - capture.index, - capture_name, - start, - end, - capture.node.utf8_text(&source_code).unwrap_or("") - )?; - } else { - writeln!( - &mut stdout, - " capture: {}, start: {}, end: {}", - capture_name, start, end, - )?; + if !quiet { + if end.row == start.row { + writeln!( + &mut stdout, + " capture: {} - {}, start: {}, end: {}, text: `{}`", + capture.index, + capture_name, + start, + end, + capture.node.utf8_text(&source_code).unwrap_or("") + 
)?; + } else { + writeln!( + &mut stdout, + " capture: {}, start: {}, end: {}", + capture_name, start, end, + )?; + } } results.push(query_testing::CaptureInfo { name: capture_name.to_string(), @@ -103,6 +117,9 @@ pub fn query_files_at_paths( if should_test { query_testing::assert_expected_captures(results, path, &mut parser, language)? } + if print_time { + writeln!(&mut stdout, "{:?}", start.elapsed())?; + } } Ok(()) From 32ce1fccd05efdf91dd8d99fba0fc91b46b18b81 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 14 Feb 2023 14:42:26 -0800 Subject: [PATCH 2/9] Precompute the set of repetition symbols that can match rootless patterns --- lib/src/query.c | 733 ++++++++++++++++++++++++++++-------------------- 1 file changed, 427 insertions(+), 306 deletions(-) diff --git a/lib/src/query.c b/lib/src/query.c index 710a9209..a756c089 100644 --- a/lib/src/query.c +++ b/lib/src/query.c @@ -228,12 +228,15 @@ typedef struct { AnalysisStateEntry stack[MAX_ANALYSIS_STATE_DEPTH]; uint16_t depth; uint16_t step_index; + TSSymbol root_symbol; } AnalysisState; typedef Array(AnalysisState *) AnalysisStateSet; typedef Array(AnalysisState *) AnalysisStatePool; +typedef Array(uint16_t) StepIndexArray; + /* * AnalysisSubgraph - A subset of the states in the parse table that are used * in constructing nodes with a certain symbol. Each state is accompanied by @@ -253,6 +256,8 @@ typedef struct { Array(AnalysisSubgraphNode) nodes; } AnalysisSubgraph; +typedef Array(AnalysisSubgraph) AnalysisSubgraphArray; + /* * StatePredecessorMap - A map that stores the predecessors of each parse state. 
* This is used during query analysis to determine which parse states can lead @@ -269,8 +274,8 @@ typedef struct { */ struct TSQuery { SymbolTable captures; - Array(CaptureQuantifiers) capture_quantifiers; SymbolTable predicate_values; + Array(CaptureQuantifiers) capture_quantifiers; Array(QueryStep) steps; Array(PatternEntry) pattern_map; Array(TSQueryPredicateStep) predicate_steps; @@ -278,6 +283,7 @@ struct TSQuery { Array(StepOffset) step_offsets; Array(TSFieldId) negated_fields; Array(char) string_buffer; + Array(TSSymbol) repeat_symbols_with_rootless_patterns; const TSLanguage *language; uint16_t wildcard_root_pattern_count; }; @@ -1113,7 +1119,324 @@ static inline void ts_query__pattern_map_insert( array_insert(&self->pattern_map, index, new_entry); } +static void ts_query__analyze_patterns_from_states( + TSQuery *self, + const AnalysisSubgraphArray *subgraphs, + AnalysisStateSet *states, + AnalysisStateSet *next_states, + AnalysisStateSet *deeper_states, + AnalysisStatePool *state_pool, + StepIndexArray *finished_parent_symbols, + StepIndexArray *final_step_indices, + bool *did_abort_analysis +) { + unsigned recursion_depth_limit = 0; + unsigned prev_final_step_count = 0; + + for (unsigned iteration = 0;; iteration++) { + if (iteration == MAX_ANALYSIS_ITERATION_COUNT) { + *did_abort_analysis = true; + break; + } + + #ifdef DEBUG_ANALYZE_QUERY + printf("Iteration: %u. 
Final step indices:", iteration); + for (unsigned j = 0; j < final_step_indices->size; j++) { + printf(" %4u", final_step_indices->contents[j]); + } + printf("\n"); + for (unsigned j = 0; j < states->size; j++) { + AnalysisState *state = states->contents[j]; + printf(" %3u: step: %u, stack: [", j, state->step_index); + for (unsigned k = 0; k < state->depth; k++) { + printf( + " {%s, child: %u, state: %4u", + self->language->symbol_names[state->stack[k].parent_symbol], + state->stack[k].child_index, + state->stack[k].parse_state + ); + if (state->stack[k].field_id) printf(", field: %s", self->language->field_names[state->stack[k].field_id]); + if (state->stack[k].done) printf(", DONE"); + printf("}"); + } + printf(" ]\n"); + } + #endif + + // If no further progress can be made within the current recursion depth limit, then + // bump the depth limit by one, and continue to process the states the exceeded the + // limit. But only allow this if progress has been made since the last time the depth + // limit was increased. + if (states->size == 0) { + if ( + deeper_states->size > 0 + && final_step_indices->size > prev_final_step_count + ) { + #ifdef DEBUG_ANALYZE_QUERY + printf("Increase recursion depth limit to %u\n", recursion_depth_limit + 1); + #endif + + prev_final_step_count = final_step_indices->size; + recursion_depth_limit++; + AnalysisStateSet _states = *states; + *states = *deeper_states; + *deeper_states = _states; + continue; + } + + break; + } + + analysis_state_set__clear(next_states, state_pool); + for (unsigned j = 0; j < states->size; j++) { + AnalysisState * const state = states->contents[j]; + + // For efficiency, it's important to avoid processing the same analysis state more + // than once. To achieve this, keep the states in order of ascending position within + // their hypothetical syntax trees. In each iteration of this loop, start by advancing + // the states that have made the least progress. 
Avoid advancing states that have already + // made more progress. + if (next_states->size > 0) { + int comparison = analysis_state__compare_position( + &state, + array_back(next_states) + ); + if (comparison == 0) { + analysis_state_set__insert_sorted_by_clone(next_states, state_pool, state); + continue; + } else if (comparison > 0) { + #ifdef DEBUG_ANALYZE_QUERY + printf("Terminate iteration at state %u\n", j); + #endif + while (j < states->size) { + analysis_state_set__push_by_clone( + next_states, + state_pool, + states->contents[j] + ); + j++; + } + break; + } + } + + const TSStateId parse_state = analysis_state__top(state)->parse_state; + const TSSymbol parent_symbol = analysis_state__top(state)->parent_symbol; + const TSFieldId parent_field_id = analysis_state__top(state)->field_id; + const unsigned child_index = analysis_state__top(state)->child_index; + const QueryStep * const step = &self->steps.contents[state->step_index]; + + unsigned subgraph_index, exists; + array_search_sorted_by(subgraphs, .symbol, parent_symbol, &subgraph_index, &exists); + if (!exists) continue; + const AnalysisSubgraph *subgraph = &subgraphs->contents[subgraph_index]; + + // Follow every possible path in the parse table, but only visit states that + // are part of the subgraph for the current symbol. 
+ LookaheadIterator lookahead_iterator = ts_language_lookaheads(self->language, parse_state); + while (ts_lookahead_iterator_next(&lookahead_iterator)) { + TSSymbol sym = lookahead_iterator.symbol; + + AnalysisSubgraphNode successor = { + .state = parse_state, + .child_index = child_index, + }; + if (lookahead_iterator.action_count) { + const TSParseAction *action = &lookahead_iterator.actions[lookahead_iterator.action_count - 1]; + if (action->type == TSParseActionTypeShift) { + if (!action->shift.extra) { + successor.state = action->shift.state; + successor.child_index++; + } + } else { + continue; + } + } else if (lookahead_iterator.next_state != 0) { + successor.state = lookahead_iterator.next_state; + successor.child_index++; + } else { + continue; + } + + unsigned node_index; + array_search_sorted_with( + &subgraph->nodes, + analysis_subgraph_node__compare, &successor, + &node_index, &exists + ); + while (node_index < subgraph->nodes.size) { + AnalysisSubgraphNode *node = &subgraph->nodes.contents[node_index++]; + if (node->state != successor.state || node->child_index != successor.child_index) break; + + // Use the subgraph to determine what alias and field will eventually be applied + // to this child node. + TSSymbol alias = ts_language_alias_at(self->language, node->production_id, child_index); + TSSymbol visible_symbol = alias + ? alias + : self->language->symbol_metadata[sym].visible + ? self->language->public_symbol_map[sym] + : 0; + TSFieldId field_id = parent_field_id; + if (!field_id) { + const TSFieldMapEntry *field_map, *field_map_end; + ts_language_field_map(self->language, node->production_id, &field_map, &field_map_end); + for (; field_map != field_map_end; field_map++) { + if (!field_map->inherited && field_map->child_index == child_index) { + field_id = field_map->field_id; + break; + } + } + } + + // Create a new state that has advanced past this hypothetical subtree. 
+ AnalysisState next_state = *state; + AnalysisStateEntry *next_state_top = analysis_state__top(&next_state); + next_state_top->child_index = successor.child_index; + next_state_top->parse_state = successor.state; + if (node->done) next_state_top->done = true; + + // Determine if this hypothetical child node would match the current step + // of the query pattern. + bool does_match = false; + if (visible_symbol) { + does_match = true; + if (step->symbol == WILDCARD_SYMBOL) { + if ( + step->is_named && + !self->language->symbol_metadata[visible_symbol].named + ) does_match = false; + } else if (step->symbol != visible_symbol) { + does_match = false; + } + if (step->field && step->field != field_id) { + does_match = false; + } + if ( + step->supertype_symbol && + !analysis_state__has_supertype(state, step->supertype_symbol) + ) does_match = false; + } + + // If this child is hidden, then descend into it and walk through its children. + // If the top entry of the stack is at the end of its rule, then that entry can + // be replaced. Otherwise, push a new entry onto the stack. + else if (sym >= self->language->token_count) { + if (!next_state_top->done) { + if (next_state.depth + 1 >= MAX_ANALYSIS_STATE_DEPTH) { + #ifdef DEBUG_ANALYZE_QUERY + printf("Exceeded depth limit for state %u\n", j); + #endif + + *did_abort_analysis = true; + continue; + } + + next_state.depth++; + next_state_top = analysis_state__top(&next_state); + } + + *next_state_top = (AnalysisStateEntry) { + .parse_state = parse_state, + .parent_symbol = sym, + .child_index = 0, + .field_id = field_id, + .done = false, + }; + + if (analysis_state__recursion_depth(&next_state) > recursion_depth_limit) { + analysis_state_set__insert_sorted_by_clone( + deeper_states, + state_pool, + &next_state + ); + continue; + } + } + + // Pop from the stack when this state reached the end of its current syntax node. 
+ while (next_state.depth > 0 && next_state_top->done) { + next_state.depth--; + next_state_top = analysis_state__top(&next_state); + } + + // If this hypothetical child did match the current step of the query pattern, + // then advance to the next step at the current depth. This involves skipping + // over any descendant steps of the current child. + const QueryStep *next_step = step; + if (does_match) { + for (;;) { + next_state.step_index++; + next_step = &self->steps.contents[next_state.step_index]; + if ( + next_step->depth == PATTERN_DONE_MARKER || + next_step->depth <= step->depth + ) break; + } + } else if (successor.state == parse_state) { + continue; + } + + for (;;) { + // Skip pass-through states. Although these states have alternatives, they are only + // used to implement repetitions, and query analysis does not need to process + // repetitions in order to determine whether steps are possible and definite. + if (next_step->is_pass_through) { + next_state.step_index++; + next_step++; + continue; + } + + // If the pattern is finished or hypothetical parent node is complete, then + // record that matching can terminate at this step of the pattern. Otherwise, + // add this state to the list of states to process on the next iteration. + if (!next_step->is_dead_end) { + bool did_finish_pattern = self->steps.contents[next_state.step_index].depth != step->depth; + if (did_finish_pattern) { + array_insert_sorted_by(finished_parent_symbols, , state->root_symbol); + } else if (next_state.depth == 0) { + array_insert_sorted_by(final_step_indices, , next_state.step_index); + } else { + analysis_state_set__insert_sorted_by_clone(next_states, state_pool, &next_state); + } + } + + // If the state has advanced to a step with an alternative step, then add another state + // at that alternative step. 
This process is simpler than the process of actually matching a + // pattern during query execution, because for the purposes of query analysis, there is no + // need to process repetitions. + if ( + does_match && + next_step->alternative_index != NONE && + next_step->alternative_index > next_state.step_index + ) { + next_state.step_index = next_step->alternative_index; + next_step = &self->steps.contents[next_state.step_index]; + } else { + break; + } + } + } + } + } + + AnalysisStateSet _states = *states; + *states = *next_states; + *next_states = _states; + } +} + static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { + Array(uint16_t) non_rooted_pattern_start_steps = array_new(); + for (unsigned i = 0; i < self->pattern_map.size; i++) { + PatternEntry *pattern = &self->pattern_map.contents[i]; + if (!pattern->is_rooted) { + QueryStep *step = &self->steps.contents[pattern->step_index]; + if (step->symbol != WILDCARD_SYMBOL) { + array_push(&non_rooted_pattern_start_steps, pattern->step_index); + } + } + } + // Walk forward through all of the steps in the query, computing some // basic information about each step. Mark all of the steps that contain // captures, and record the indices of all of the steps that have child steps. @@ -1158,7 +1481,7 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { // of the hidden symbols in the grammar, because these might occur within // one of the parent nodes, such that their children appear to belong to the // parent. 
- Array(AnalysisSubgraph) subgraphs = array_new(); + AnalysisSubgraphArray subgraphs = array_new(); for (unsigned i = 0; i < parent_step_indices.size; i++) { uint32_t parent_step_index = parent_step_indices.contents[i]; TSSymbol parent_symbol = self->steps.contents[parent_step_index].symbol; @@ -1324,7 +1647,8 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { AnalysisStateSet next_states = array_new(); AnalysisStateSet deeper_states = array_new(); AnalysisStatePool state_pool = array_new(); - Array(uint16_t) final_step_indices = array_new(); + StepIndexArray final_step_indices = array_new(); + StepIndexArray finished_parent_symbols = array_new(); for (unsigned i = 0; i < parent_step_indices.size; i++) { uint16_t parent_step_index = parent_step_indices.contents[i]; uint16_t parent_depth = self->steps.contents[parent_step_index].depth; @@ -1364,308 +1688,31 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { }, }, .depth = 1, + .root_symbol = parent_symbol, })); } // Walk the subgraph for this non-terminal, tracking all of the possible // sequences of progress within the pattern. - bool can_finish_pattern = false; bool did_abort_analysis = false; - unsigned recursion_depth_limit = 0; - unsigned prev_final_step_count = 0; array_clear(&final_step_indices); - for (unsigned iteration = 0;; iteration++) { - if (iteration == MAX_ANALYSIS_ITERATION_COUNT) { - did_abort_analysis = true; - break; - } + array_clear(&finished_parent_symbols); - #ifdef DEBUG_ANALYZE_QUERY - printf("Iteration: %u. 
Final step indices:", iteration); - for (unsigned j = 0; j < final_step_indices.size; j++) { - printf(" %4u", final_step_indices.contents[j]); - } - printf("\nWalk states for %u %s:\n", i, ts_language_symbol_name(self->language, parent_symbol)); - for (unsigned j = 0; j < states.size; j++) { - AnalysisState *state = states.contents[j]; - printf(" %3u: step: %u, stack: [", j, state->step_index); - for (unsigned k = 0; k < state->depth; k++) { - printf( - " {%s, child: %u, state: %4u", - self->language->symbol_names[state->stack[k].parent_symbol], - state->stack[k].child_index, - state->stack[k].parse_state - ); - if (state->stack[k].field_id) printf(", field: %s", self->language->field_names[state->stack[k].field_id]); - if (state->stack[k].done) printf(", DONE"); - printf("}"); - } - printf(" ]\n"); - } - #endif + #ifdef DEBUG_ANALYZE_QUERY + printf("\nWalk states for %s:\n", ts_language_symbol_name(self->language, states.contents[0]->stack[0].parent_symbol)); + #endif - // If no further progress can be made within the current recursion depth limit, then - // bump the depth limit by one, and continue to process the states the exceeded the - // limit. But only allow this if progress has been made since the last time the depth - // limit was increased. - if (states.size == 0) { - if ( - deeper_states.size > 0 - && final_step_indices.size > prev_final_step_count - ) { - #ifdef DEBUG_ANALYZE_QUERY - printf("Increase recursion depth limit to %u\n", recursion_depth_limit + 1); - #endif - - prev_final_step_count = final_step_indices.size; - recursion_depth_limit++; - AnalysisStateSet _states = states; - states = deeper_states; - deeper_states = _states; - continue; - } - - break; - } - - analysis_state_set__clear(&next_states, &state_pool); - for (unsigned j = 0; j < states.size; j++) { - AnalysisState * const state = states.contents[j]; - - // For efficiency, it's important to avoid processing the same analysis state more - // than once. 
To achieve this, keep the states in order of ascending position within - // their hypothetical syntax trees. In each iteration of this loop, start by advancing - // the states that have made the least progress. Avoid advancing states that have already - // made more progress. - if (next_states.size > 0) { - int comparison = analysis_state__compare_position( - &state, - array_back(&next_states) - ); - if (comparison == 0) { - #ifdef DEBUG_ANALYZE_QUERY - printf("Skip iteration for state %u\n", j); - #endif - analysis_state_set__insert_sorted_by_clone(&next_states, &state_pool, state); - continue; - } else if (comparison > 0) { - #ifdef DEBUG_ANALYZE_QUERY - printf("Terminate iteration at state %u\n", j); - #endif - while (j < states.size) { - analysis_state_set__push_by_clone( - &next_states, - &state_pool, - states.contents[j] - ); - j++; - } - break; - } - } - - const TSStateId parse_state = analysis_state__top(state)->parse_state; - const TSSymbol parent_symbol = analysis_state__top(state)->parent_symbol; - const TSFieldId parent_field_id = analysis_state__top(state)->field_id; - const unsigned child_index = analysis_state__top(state)->child_index; - const QueryStep * const step = &self->steps.contents[state->step_index]; - - unsigned subgraph_index, exists; - array_search_sorted_by(&subgraphs, .symbol, parent_symbol, &subgraph_index, &exists); - if (!exists) continue; - const AnalysisSubgraph *subgraph = &subgraphs.contents[subgraph_index]; - - // Follow every possible path in the parse table, but only visit states that - // are part of the subgraph for the current symbol. 
- LookaheadIterator lookahead_iterator = ts_language_lookaheads(self->language, parse_state); - while (ts_lookahead_iterator_next(&lookahead_iterator)) { - TSSymbol sym = lookahead_iterator.symbol; - - AnalysisSubgraphNode successor = { - .state = parse_state, - .child_index = child_index, - }; - if (lookahead_iterator.action_count) { - const TSParseAction *action = &lookahead_iterator.actions[lookahead_iterator.action_count - 1]; - if (action->type == TSParseActionTypeShift) { - if (!action->shift.extra) { - successor.state = action->shift.state; - successor.child_index++; - } - } else { - continue; - } - } else if (lookahead_iterator.next_state != 0) { - successor.state = lookahead_iterator.next_state; - successor.child_index++; - } else { - continue; - } - - unsigned node_index; - array_search_sorted_with( - &subgraph->nodes, - analysis_subgraph_node__compare, &successor, - &node_index, &exists - ); - while (node_index < subgraph->nodes.size) { - AnalysisSubgraphNode *node = &subgraph->nodes.contents[node_index++]; - if (node->state != successor.state || node->child_index != successor.child_index) break; - - // Use the subgraph to determine what alias and field will eventually be applied - // to this child node. - TSSymbol alias = ts_language_alias_at(self->language, node->production_id, child_index); - TSSymbol visible_symbol = alias - ? alias - : self->language->symbol_metadata[sym].visible - ? self->language->public_symbol_map[sym] - : 0; - TSFieldId field_id = parent_field_id; - if (!field_id) { - const TSFieldMapEntry *field_map, *field_map_end; - ts_language_field_map(self->language, node->production_id, &field_map, &field_map_end); - for (; field_map != field_map_end; field_map++) { - if (!field_map->inherited && field_map->child_index == child_index) { - field_id = field_map->field_id; - break; - } - } - } - - // Create a new state that has advanced past this hypothetical subtree. 
- AnalysisState next_state = *state; - AnalysisStateEntry *next_state_top = analysis_state__top(&next_state); - next_state_top->child_index = successor.child_index; - next_state_top->parse_state = successor.state; - if (node->done) next_state_top->done = true; - - // Determine if this hypothetical child node would match the current step - // of the query pattern. - bool does_match = false; - if (visible_symbol) { - does_match = true; - if (step->symbol == WILDCARD_SYMBOL) { - if ( - step->is_named && - !self->language->symbol_metadata[visible_symbol].named - ) does_match = false; - } else if (step->symbol != visible_symbol) { - does_match = false; - } - if (step->field && step->field != field_id) { - does_match = false; - } - if ( - step->supertype_symbol && - !analysis_state__has_supertype(state, step->supertype_symbol) - ) does_match = false; - } - - // If this child is hidden, then descend into it and walk through its children. - // If the top entry of the stack is at the end of its rule, then that entry can - // be replaced. Otherwise, push a new entry onto the stack. - else if (sym >= self->language->token_count) { - if (!next_state_top->done) { - if (next_state.depth + 1 >= MAX_ANALYSIS_STATE_DEPTH) { - #ifdef DEBUG_ANALYZE_QUERY - printf("Exceeded depth limit for state %u\n", j); - #endif - - did_abort_analysis = true; - continue; - } - - next_state.depth++; - next_state_top = analysis_state__top(&next_state); - } - - *next_state_top = (AnalysisStateEntry) { - .parse_state = parse_state, - .parent_symbol = sym, - .child_index = 0, - .field_id = field_id, - .done = false, - }; - - if (analysis_state__recursion_depth(&next_state) > recursion_depth_limit) { - analysis_state_set__insert_sorted_by_clone( - &deeper_states, - &state_pool, - &next_state - ); - continue; - } - } - - // Pop from the stack when this state reached the end of its current syntax node. 
- while (next_state.depth > 0 && next_state_top->done) { - next_state.depth--; - next_state_top = analysis_state__top(&next_state); - } - - // If this hypothetical child did match the current step of the query pattern, - // then advance to the next step at the current depth. This involves skipping - // over any descendant steps of the current child. - const QueryStep *next_step = step; - if (does_match) { - for (;;) { - next_state.step_index++; - next_step = &self->steps.contents[next_state.step_index]; - if ( - next_step->depth == PATTERN_DONE_MARKER || - next_step->depth <= parent_depth + 1 - ) break; - } - } else if (successor.state == parse_state) { - continue; - } - - for (;;) { - // Skip pass-through states. Although these states have alternatives, they are only - // used to implement repetitions, and query analysis does not need to process - // repetitions in order to determine whether steps are possible and definite. - if (next_step->is_pass_through) { - next_state.step_index++; - next_step++; - continue; - } - - // If the pattern is finished or hypothetical parent node is complete, then - // record that matching can terminate at this step of the pattern. Otherwise, - // add this state to the list of states to process on the next iteration. - if (!next_step->is_dead_end) { - bool did_finish_pattern = self->steps.contents[next_state.step_index].depth != parent_depth + 1; - if (did_finish_pattern) can_finish_pattern = true; - if (did_finish_pattern || next_state.depth == 0) { - array_insert_sorted_by(&final_step_indices, , next_state.step_index); - } else { - analysis_state_set__insert_sorted_by_clone(&next_states, &state_pool, &next_state); - } - } - - // If the state has advanced to a step with an alternative step, then add another state - // at that alternative step. This process is simpler than the process of actually matching a - // pattern during query execution, because for the purposes of query analysis, there is no - // need to process repetitions. 
- if ( - does_match && - next_step->alternative_index != NONE && - next_step->alternative_index > next_state.step_index - ) { - next_state.step_index = next_step->alternative_index; - next_step = &self->steps.contents[next_state.step_index]; - } else { - break; - } - } - } - } - } - - AnalysisStateSet _states = states; - states = next_states; - next_states = _states; - } + ts_query__analyze_patterns_from_states( + self, + &subgraphs, + &states, + &next_states, + &deeper_states, + &state_pool, + &finished_parent_symbols, + &final_step_indices, + &did_abort_analysis + ); // If this pattern could not be fully analyzed, then every step should // be considered fallible. @@ -1686,7 +1733,7 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { // If this pattern cannot match, store the pattern index so that it can be // returned to the caller. - if (!can_finish_pattern) { + if (finished_parent_symbols.size == 0) { assert(final_step_indices.size > 0); uint16_t impossible_step_index = *array_back(&final_step_indices); uint32_t i, exists; @@ -1810,6 +1857,75 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { } #endif + // Determine which repetition symbols in this language have the possibility + // of matching non-rooted patterns in this query. These repetition symbols + // prevent certain optimizations with range restrictions. 
+ bool did_abort_analysis = false; + for (uint32_t i = 0; i < non_rooted_pattern_start_steps.size; i++) { + uint16_t step_index = non_rooted_pattern_start_steps.contents[i]; + + analysis_state_set__clear(&states, &state_pool); + analysis_state_set__clear(&deeper_states, &state_pool); + + for (unsigned j = 0; j < subgraphs.size; j++) { + AnalysisSubgraph *subgraph = &subgraphs.contents[j]; + TSSymbolMetadata metadata = ts_language_symbol_metadata(self->language, subgraph->symbol); + if (metadata.visible || metadata.named) continue; + for (uint32_t k = 0; k < subgraph->start_states.size; k++) { + TSStateId parse_state = subgraph->start_states.contents[k]; + analysis_state_set__push_by_clone(&states, &state_pool, &((AnalysisState) { + .step_index = step_index, + .stack = { + [0] = { + .parse_state = parse_state, + .parent_symbol = subgraph->symbol, + .child_index = 0, + .field_id = 0, + .done = false, + }, + }, + .root_symbol = subgraph->symbol, + .depth = 1, + })); + } + } + + #ifdef DEBUG_ANALYZE_QUERY + printf("\nWalk states for rootless pattern step %u:\n", step_index); + #endif + + array_clear(&final_step_indices); + array_clear(&finished_parent_symbols); + ts_query__analyze_patterns_from_states( + self, + &subgraphs, + &states, + &next_states, + &deeper_states, + &state_pool, + &finished_parent_symbols, + &final_step_indices, + &did_abort_analysis + ); + + for (unsigned k = 0; k < finished_parent_symbols.size; k++) { + TSSymbol symbol = finished_parent_symbols.contents[k]; + array_insert_sorted_by(&self->repeat_symbols_with_rootless_patterns, , symbol); + } + } + + #ifdef DEBUG_ANALYZE_QUERY + if (self->repeat_symbols_with_rootless_patterns.size > 0) { + printf("\nRepetition symbols with rootless patterns:\n"); + printf("aborted analysis: %d\n", did_abort_analysis); + for (unsigned i = 0; i < self->repeat_symbols_with_rootless_patterns.size; i++) { + TSSymbol symbol = self->repeat_symbols_with_rootless_patterns.contents[i]; + printf(" %u, %s\n", symbol, 
ts_language_symbol_name(self->language, symbol)); + } + printf("\n"); + } + #endif + // Cleanup for (unsigned i = 0; i < subgraphs.size; i++) { array_delete(&subgraphs.contents[i].start_states); @@ -1821,9 +1937,11 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { } array_delete(&state_pool); array_delete(&next_nodes); + array_delete(&non_rooted_pattern_start_steps); analysis_state_set__delete(&states); analysis_state_set__delete(&next_states); analysis_state_set__delete(&deeper_states); + array_delete(&finished_parent_symbols); array_delete(&final_step_indices); array_delete(&parent_step_indices); array_delete(&predicate_capture_ids); @@ -2571,6 +2689,7 @@ TSQuery *ts_query_new( .step_offsets = array_new(), .string_buffer = array_new(), .negated_fields = array_new(), + .repeat_symbols_with_rootless_patterns = array_new(), .wildcard_root_pattern_count = 0, .language = language, }; @@ -2685,6 +2804,7 @@ void ts_query_delete(TSQuery *self) { array_delete(&self->step_offsets); array_delete(&self->string_buffer); array_delete(&self->negated_fields); + array_delete(&self->repeat_symbols_with_rootless_patterns); symbol_table_delete(&self->captures); symbol_table_delete(&self->predicate_values); for (uint32_t index = 0; index < self->capture_quantifiers.size; index++) { @@ -3327,18 +3447,18 @@ static inline bool ts_query_cursor__advance( self->finished_states.size ); - bool node_intersects_range = ( - ts_node_end_byte(node) > self->start_byte && - ts_node_start_byte(node) < self->end_byte && - point_gt(ts_node_end_point(node), self->start_point) && - point_lt(ts_node_start_point(node), self->end_point) - ); bool parent_intersects_range = ts_node_is_null(parent_node) || ( ts_node_end_byte(parent_node) > self->start_byte && ts_node_start_byte(parent_node) < self->end_byte && point_gt(ts_node_end_point(parent_node), self->start_point) && point_lt(ts_node_start_point(parent_node), self->end_point) ); + bool node_intersects_range = 
parent_intersects_range && ( + ts_node_end_byte(node) > self->start_byte && + ts_node_start_byte(node) < self->end_byte && + point_gt(ts_node_end_point(node), self->start_point) && + point_lt(ts_node_start_point(node), self->end_point) + ); bool node_is_error = symbol == ts_builtin_sym_error; bool parent_is_error = !ts_node_is_null(parent_node) && @@ -3679,8 +3799,8 @@ static inline bool ts_query_cursor__advance( // When the current node ends prior to the desired start offset, // only descend for the purpose of continuing in-progress matches. - bool should_descend = node_intersects_range; - if (!should_descend) { + bool has_in_progress_matches = false; + if (!node_intersects_range) { for (unsigned i = 0; i < self->states.size; i++) { QueryState *state = &self->states.contents[i];; QueryStep *next_step = &self->query->steps.contents[state->step_index]; @@ -3688,12 +3808,13 @@ static inline bool ts_query_cursor__advance( next_step->depth != PATTERN_DONE_MARKER && state->start_depth + next_step->depth > self->depth ) { - should_descend = true; + has_in_progress_matches = true; break; } } } + bool should_descend = node_intersects_range || has_in_progress_matches; if (!should_descend) { LOG( " not descending. 
node end byte: %u, start byte: %u\n", From 189cf6d59daa7861f504c74d0a775b8f53cf98e2 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 14 Feb 2023 15:18:00 -0800 Subject: [PATCH 3/9] Group analysis state sets into QueryAnalysis struct --- lib/src/query.c | 218 ++++++++++++++++++++++-------------------------- 1 file changed, 102 insertions(+), 116 deletions(-) diff --git a/lib/src/query.c b/lib/src/query.c index a756c089..cbc9add6 100644 --- a/lib/src/query.c +++ b/lib/src/query.c @@ -233,9 +233,15 @@ typedef struct { typedef Array(AnalysisState *) AnalysisStateSet; -typedef Array(AnalysisState *) AnalysisStatePool; - -typedef Array(uint16_t) StepIndexArray; +typedef struct { + AnalysisStateSet states; + AnalysisStateSet next_states; + AnalysisStateSet deeper_states; + AnalysisStateSet state_pool; + Array(uint16_t) final_step_indices; + Array(TSSymbol) finished_parent_symbols; + bool did_abort; +} QueryAnalysis; /* * AnalysisSubgraph - A subset of the states in the parse table that are used @@ -940,30 +946,23 @@ static inline bool analysis_state__has_supertype(AnalysisState *self, TSSymbol s return false; } -static inline AnalysisState *analysis_state__clone(AnalysisState const *self) { - AnalysisState *new_state = ts_malloc(sizeof(AnalysisState)); - *new_state = *self; - return new_state; -} - -/**************** +/****************** * AnalysisStateSet - ****************/ + ******************/ // Obtains an `AnalysisState` instance, either by consuming one from this set's object pool, or by // cloning one from scratch. 
static inline AnalysisState *analysis_state_pool__clone_or_reuse( - AnalysisStatePool *self, + AnalysisStateSet *self, AnalysisState *borrowed_item ) { AnalysisState *new_item; if (self->size) { new_item = array_pop(self); - *new_item = *borrowed_item; } else { - new_item = analysis_state__clone(borrowed_item); + new_item = ts_malloc(sizeof(AnalysisState)); } - + *new_item = *borrowed_item; return new_item; } @@ -973,9 +972,9 @@ static inline AnalysisState *analysis_state_pool__clone_or_reuse( // // The caller retains ownership of the passed-in memory. However, the clone that is created by this // function will be managed by the state set. -static inline void analysis_state_set__insert_sorted_by_clone( +static inline void analysis_state_set__insert_sorted( AnalysisStateSet *self, - AnalysisStatePool *pool, + AnalysisStateSet *pool, AnalysisState *borrowed_item ) { unsigned index, exists; @@ -994,9 +993,9 @@ static inline void analysis_state_set__insert_sorted_by_clone( // // The caller retains ownership of the passed-in memory. However, the clone that is created by this // function will be managed by the state set. -static inline void analysis_state_set__push_by_clone( +static inline void analysis_state_set__push( AnalysisStateSet *self, - AnalysisStatePool *pool, + AnalysisStateSet *pool, AnalysisState *borrowed_item ) { AnalysisState *new_item = analysis_state_pool__clone_or_reuse(pool, borrowed_item); @@ -1004,7 +1003,7 @@ static inline void analysis_state_set__push_by_clone( } // Removes all items from this set, returning it to an empty state. 
-static inline void analysis_state_set__clear(AnalysisStateSet *self, AnalysisStatePool *pool) { +static inline void analysis_state_set__clear(AnalysisStateSet *self, AnalysisStateSet *pool) { array_push_all(pool, self); array_clear(self); } @@ -1018,6 +1017,31 @@ static inline void analysis_state_set__delete(AnalysisStateSet *self) { array_delete(self); } +/**************** + * QueryAnalyzer + ****************/ + +static inline QueryAnalysis query_analysis__new() { + return (QueryAnalysis) { + .states = array_new(), + .next_states = array_new(), + .deeper_states = array_new(), + .state_pool = array_new(), + .final_step_indices = array_new(), + .finished_parent_symbols = array_new(), + .did_abort = false, + }; +} + +static inline void query_analysis__delete(QueryAnalysis *self) { + analysis_state_set__delete(&self->states); + analysis_state_set__delete(&self->next_states); + analysis_state_set__delete(&self->deeper_states); + analysis_state_set__delete(&self->state_pool); + array_delete(&self->final_step_indices); + array_delete(&self->finished_parent_symbols); +} + /*********************** * AnalysisSubgraphNode ***********************/ @@ -1119,23 +1143,21 @@ static inline void ts_query__pattern_map_insert( array_insert(&self->pattern_map, index, new_entry); } -static void ts_query__analyze_patterns_from_states( +// Walk the subgraph for this non-terminal, tracking all of the possible +// sequences of progress within the pattern. 
+static void ts_query__perform_analysis( TSQuery *self, const AnalysisSubgraphArray *subgraphs, - AnalysisStateSet *states, - AnalysisStateSet *next_states, - AnalysisStateSet *deeper_states, - AnalysisStatePool *state_pool, - StepIndexArray *finished_parent_symbols, - StepIndexArray *final_step_indices, - bool *did_abort_analysis + QueryAnalysis *analysis ) { unsigned recursion_depth_limit = 0; unsigned prev_final_step_count = 0; + array_clear(&analysis->final_step_indices); + array_clear(&analysis->finished_parent_symbols); for (unsigned iteration = 0;; iteration++) { if (iteration == MAX_ANALYSIS_ITERATION_COUNT) { - *did_abort_analysis = true; + analysis->did_abort = true; break; } @@ -1167,52 +1189,52 @@ static void ts_query__analyze_patterns_from_states( // bump the depth limit by one, and continue to process the states the exceeded the // limit. But only allow this if progress has been made since the last time the depth // limit was increased. - if (states->size == 0) { + if (analysis->states.size == 0) { if ( - deeper_states->size > 0 - && final_step_indices->size > prev_final_step_count + analysis->deeper_states.size > 0 && + analysis->final_step_indices.size > prev_final_step_count ) { #ifdef DEBUG_ANALYZE_QUERY printf("Increase recursion depth limit to %u\n", recursion_depth_limit + 1); #endif - prev_final_step_count = final_step_indices->size; + prev_final_step_count = analysis->final_step_indices.size; recursion_depth_limit++; - AnalysisStateSet _states = *states; - *states = *deeper_states; - *deeper_states = _states; + AnalysisStateSet _states = analysis->states; + analysis->states = analysis->deeper_states; + analysis->deeper_states = _states; continue; } break; } - analysis_state_set__clear(next_states, state_pool); - for (unsigned j = 0; j < states->size; j++) { - AnalysisState * const state = states->contents[j]; + analysis_state_set__clear(&analysis->next_states, &analysis->state_pool); + for (unsigned j = 0; j < analysis->states.size; j++) { + 
AnalysisState * const state = analysis->states.contents[j]; // For efficiency, it's important to avoid processing the same analysis state more // than once. To achieve this, keep the states in order of ascending position within // their hypothetical syntax trees. In each iteration of this loop, start by advancing // the states that have made the least progress. Avoid advancing states that have already // made more progress. - if (next_states->size > 0) { + if (analysis->next_states.size > 0) { int comparison = analysis_state__compare_position( &state, - array_back(next_states) + array_back(&analysis->next_states) ); if (comparison == 0) { - analysis_state_set__insert_sorted_by_clone(next_states, state_pool, state); + analysis_state_set__insert_sorted(&analysis->next_states, &analysis->state_pool, state); continue; } else if (comparison > 0) { #ifdef DEBUG_ANALYZE_QUERY printf("Terminate iteration at state %u\n", j); #endif - while (j < states->size) { - analysis_state_set__push_by_clone( - next_states, - state_pool, - states->contents[j] + while (j < analysis->states.size) { + analysis_state_set__push( + &analysis->next_states, + &analysis->state_pool, + analysis->states.contents[j] ); j++; } @@ -1327,7 +1349,7 @@ static void ts_query__analyze_patterns_from_states( printf("Exceeded depth limit for state %u\n", j); #endif - *did_abort_analysis = true; + analysis->did_abort = true; continue; } @@ -1344,9 +1366,9 @@ static void ts_query__analyze_patterns_from_states( }; if (analysis_state__recursion_depth(&next_state) > recursion_depth_limit) { - analysis_state_set__insert_sorted_by_clone( - deeper_states, - state_pool, + analysis_state_set__insert_sorted( + &analysis->deeper_states, + &analysis->state_pool, &next_state ); continue; @@ -1392,11 +1414,11 @@ static void ts_query__analyze_patterns_from_states( if (!next_step->is_dead_end) { bool did_finish_pattern = self->steps.contents[next_state.step_index].depth != step->depth; if (did_finish_pattern) { - 
array_insert_sorted_by(finished_parent_symbols, , state->root_symbol); + array_insert_sorted_by(&analysis->finished_parent_symbols, , state->root_symbol); } else if (next_state.depth == 0) { - array_insert_sorted_by(final_step_indices, , next_state.step_index); + array_insert_sorted_by(&analysis->final_step_indices, , next_state.step_index); } else { - analysis_state_set__insert_sorted_by_clone(next_states, state_pool, &next_state); + analysis_state_set__insert_sorted(&analysis->next_states, &analysis->state_pool, &next_state); } } @@ -1419,9 +1441,9 @@ static void ts_query__analyze_patterns_from_states( } } - AnalysisStateSet _states = *states; - *states = *next_states; - *next_states = _states; + AnalysisStateSet _states = analysis->states; + analysis->states = analysis->next_states; + analysis->next_states = _states; } } @@ -1643,12 +1665,7 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { // For each non-terminal pattern, determine if the pattern can successfully match, // and identify all of the possible children within the pattern where matching could fail. bool all_patterns_are_valid = true; - AnalysisStateSet states = array_new(); - AnalysisStateSet next_states = array_new(); - AnalysisStateSet deeper_states = array_new(); - AnalysisStatePool state_pool = array_new(); - StepIndexArray final_step_indices = array_new(); - StepIndexArray finished_parent_symbols = array_new(); + QueryAnalysis analysis = query_analysis__new(); for (unsigned i = 0; i < parent_step_indices.size; i++) { uint16_t parent_step_index = parent_step_indices.contents[i]; uint16_t parent_depth = self->steps.contents[parent_step_index].depth; @@ -1672,11 +1689,11 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { // Initialize an analysis state at every parse state in the table where // this parent symbol can occur. 
AnalysisSubgraph *subgraph = &subgraphs.contents[subgraph_index]; - analysis_state_set__clear(&states, &state_pool); - analysis_state_set__clear(&deeper_states, &state_pool); + analysis_state_set__clear(&analysis.states, &analysis.state_pool); + analysis_state_set__clear(&analysis.deeper_states, &analysis.state_pool); for (unsigned j = 0; j < subgraph->start_states.size; j++) { TSStateId parse_state = subgraph->start_states.contents[j]; - analysis_state_set__push_by_clone(&states, &state_pool, &((AnalysisState) { + analysis_state_set__push(&analysis.states, &analysis.state_pool, &((AnalysisState) { .step_index = parent_step_index + 1, .stack = { [0] = { @@ -1692,31 +1709,16 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { })); } - // Walk the subgraph for this non-terminal, tracking all of the possible - // sequences of progress within the pattern. - bool did_abort_analysis = false; - array_clear(&final_step_indices); - array_clear(&finished_parent_symbols); - #ifdef DEBUG_ANALYZE_QUERY printf("\nWalk states for %s:\n", ts_language_symbol_name(self->language, states.contents[0]->stack[0].parent_symbol)); #endif - ts_query__analyze_patterns_from_states( - self, - &subgraphs, - &states, - &next_states, - &deeper_states, - &state_pool, - &finished_parent_symbols, - &final_step_indices, - &did_abort_analysis - ); + analysis.did_abort = false; + ts_query__perform_analysis(self, &subgraphs, &analysis); // If this pattern could not be fully analyzed, then every step should // be considered fallible. - if (did_abort_analysis) { + if (analysis.did_abort) { for (unsigned j = parent_step_index + 1; j < self->steps.size; j++) { QueryStep *step = &self->steps.contents[j]; if ( @@ -1733,9 +1735,9 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { // If this pattern cannot match, store the pattern index so that it can be // returned to the caller. 
- if (finished_parent_symbols.size == 0) { - assert(final_step_indices.size > 0); - uint16_t impossible_step_index = *array_back(&final_step_indices); + if (analysis.finished_parent_symbols.size == 0) { + assert(analysis.final_step_indices.size > 0); + uint16_t impossible_step_index = *array_back(&analysis.final_step_indices); uint32_t i, exists; array_search_sorted_by(&self->step_offsets, .step_index, impossible_step_index, &i, &exists); if (i >= self->step_offsets.size) i = self->step_offsets.size - 1; @@ -1746,8 +1748,8 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { // Mark as fallible any step where a match terminated. // Later, this property will be propagated to all of the step's predecessors. - for (unsigned j = 0; j < final_step_indices.size; j++) { - uint32_t final_step_index = final_step_indices.contents[j]; + for (unsigned j = 0; j < analysis.final_step_indices.size; j++) { + uint32_t final_step_index = analysis.final_step_indices.contents[j]; QueryStep *step = &self->steps.contents[final_step_index]; if ( step->depth != PATTERN_DONE_MARKER && @@ -1860,20 +1862,20 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { // Determine which repetition symbols in this language have the possibility // of matching non-rooted patterns in this query. These repetition symbols // prevent certain optimizations with range restrictions. 
- bool did_abort_analysis = false; + analysis.did_abort = false; for (uint32_t i = 0; i < non_rooted_pattern_start_steps.size; i++) { uint16_t step_index = non_rooted_pattern_start_steps.contents[i]; - analysis_state_set__clear(&states, &state_pool); - analysis_state_set__clear(&deeper_states, &state_pool); - + analysis_state_set__clear(&analysis.states, &analysis.state_pool); + analysis_state_set__clear(&analysis.deeper_states, &analysis.state_pool); for (unsigned j = 0; j < subgraphs.size; j++) { AnalysisSubgraph *subgraph = &subgraphs.contents[j]; TSSymbolMetadata metadata = ts_language_symbol_metadata(self->language, subgraph->symbol); if (metadata.visible || metadata.named) continue; + for (uint32_t k = 0; k < subgraph->start_states.size; k++) { TSStateId parse_state = subgraph->start_states.contents[k]; - analysis_state_set__push_by_clone(&states, &state_pool, &((AnalysisState) { + analysis_state_set__push(&analysis.states, &analysis.state_pool, &((AnalysisState) { .step_index = step_index, .stack = { [0] = { @@ -1894,22 +1896,14 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { printf("\nWalk states for rootless pattern step %u:\n", step_index); #endif - array_clear(&final_step_indices); - array_clear(&finished_parent_symbols); - ts_query__analyze_patterns_from_states( + ts_query__perform_analysis( self, &subgraphs, - &states, - &next_states, - &deeper_states, - &state_pool, - &finished_parent_symbols, - &final_step_indices, - &did_abort_analysis + &analysis ); - for (unsigned k = 0; k < finished_parent_symbols.size; k++) { - TSSymbol symbol = finished_parent_symbols.contents[k]; + for (unsigned k = 0; k < analysis.finished_parent_symbols.size; k++) { + TSSymbol symbol = analysis.finished_parent_symbols.contents[k]; array_insert_sorted_by(&self->repeat_symbols_with_rootless_patterns, , symbol); } } @@ -1917,7 +1911,7 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { #ifdef DEBUG_ANALYZE_QUERY 
if (self->repeat_symbols_with_rootless_patterns.size > 0) { printf("\nRepetition symbols with rootless patterns:\n"); - printf("aborted analysis: %d\n", did_abort_analysis); + printf("aborted analysis: %d\n", analyzer.did_abort); for (unsigned i = 0; i < self->repeat_symbols_with_rootless_patterns.size; i++) { TSSymbol symbol = self->repeat_symbols_with_rootless_patterns.contents[i]; printf(" %u, %s\n", symbol, ts_language_symbol_name(self->language, symbol)); @@ -1932,17 +1926,9 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { array_delete(&subgraphs.contents[i].nodes); } array_delete(&subgraphs); - for (unsigned i = 0; i < state_pool.size; i++) { - ts_free(state_pool.contents[i]); - } - array_delete(&state_pool); + query_analysis__delete(&analysis); array_delete(&next_nodes); array_delete(&non_rooted_pattern_start_steps); - analysis_state_set__delete(&states); - analysis_state_set__delete(&next_states); - analysis_state_set__delete(&deeper_states); - array_delete(&finished_parent_symbols); - array_delete(&final_step_indices); array_delete(&parent_step_indices); array_delete(&predicate_capture_ids); state_predecessor_map_delete(&predecessor_map); From 29c9073177d4e5f750daa9619feab75701a9a286 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 15 Feb 2023 14:01:59 -0800 Subject: [PATCH 4/9] Extract 'internal' versions of tree cursor movement fns that allow visiting hidden nodes --- lib/src/tree_cursor.c | 126 +++++++++++++++++++----------------------- lib/src/tree_cursor.h | 15 +++++ 2 files changed, 72 insertions(+), 69 deletions(-) diff --git a/lib/src/tree_cursor.c b/lib/src/tree_cursor.c index e8dc98a9..98930250 100644 --- a/lib/src/tree_cursor.c +++ b/lib/src/tree_cursor.c @@ -98,34 +98,43 @@ void ts_tree_cursor_delete(TSTreeCursor *_self) { // TSTreeCursor - walking the tree -bool ts_tree_cursor_goto_first_child(TSTreeCursor *_self) { +TreeCursorStep ts_tree_cursor_goto_first_child_internal(TSTreeCursor *_self) { 
TreeCursor *self = (TreeCursor *)_self; - - bool did_descend; - do { - did_descend = false; - - bool visible; - TreeCursorEntry entry; - CursorChildIterator iterator = ts_tree_cursor_iterate_children(self); - while (ts_tree_cursor_child_iterator_next(&iterator, &entry, &visible)) { - if (visible) { - array_push(&self->stack, entry); - return true; - } - - if (ts_subtree_visible_child_count(*entry.subtree) > 0) { - array_push(&self->stack, entry); - did_descend = true; - break; - } + bool visible; + TreeCursorEntry entry; + CursorChildIterator iterator = ts_tree_cursor_iterate_children(self); + while (ts_tree_cursor_child_iterator_next(&iterator, &entry, &visible)) { + if (visible) { + array_push(&self->stack, entry); + return TreeCursorStepVisible; } - } while (did_descend); + if (ts_subtree_visible_child_count(*entry.subtree) > 0) { + array_push(&self->stack, entry); + return TreeCursorStepHidden; + } + } + return TreeCursorStepNone; +} +bool ts_tree_cursor_goto_first_child(TSTreeCursor *self) { + for (;;) { + switch (ts_tree_cursor_goto_first_child_internal(self)) { + case TreeCursorStepHidden: + continue; + case TreeCursorStepVisible: + return true; + default: + return false; + } + } return false; } -int64_t ts_tree_cursor_goto_first_child_for_byte(TSTreeCursor *_self, uint32_t goal_byte) { +static inline int64_t ts_tree_cursor_goto_first_child_for_byte_and_point( + TSTreeCursor *_self, + uint32_t goal_byte, + TSPoint goal_point +) { TreeCursor *self = (TreeCursor *)_self; uint32_t initial_size = self->stack.size; uint32_t visible_child_index = 0; @@ -138,48 +147,8 @@ int64_t ts_tree_cursor_goto_first_child_for_byte(TSTreeCursor *_self, uint32_t g TreeCursorEntry entry; CursorChildIterator iterator = ts_tree_cursor_iterate_children(self); while (ts_tree_cursor_child_iterator_next(&iterator, &entry, &visible)) { - uint32_t end_byte = entry.position.bytes + ts_subtree_size(*entry.subtree).bytes; - bool at_goal = end_byte >= goal_byte; - uint32_t 
visible_child_count = ts_subtree_visible_child_count(*entry.subtree); - - if (at_goal) { - if (visible) { - array_push(&self->stack, entry); - return visible_child_index; - } - - if (visible_child_count > 0) { - array_push(&self->stack, entry); - did_descend = true; - break; - } - } else if (visible) { - visible_child_index++; - } else { - visible_child_index += visible_child_count; - } - } - } while (did_descend); - - self->stack.size = initial_size; - return -1; -} - -int64_t ts_tree_cursor_goto_first_child_for_point(TSTreeCursor *_self, TSPoint goal_point) { - TreeCursor *self = (TreeCursor *)_self; - uint32_t initial_size = self->stack.size; - uint32_t visible_child_index = 0; - - bool did_descend; - do { - did_descend = false; - - bool visible; - TreeCursorEntry entry; - CursorChildIterator iterator = ts_tree_cursor_iterate_children(self); - while (ts_tree_cursor_child_iterator_next(&iterator, &entry, &visible)) { - TSPoint end_point = point_add(entry.position.extent, ts_subtree_size(*entry.subtree).extent); - bool at_goal = point_gte(end_point, goal_point); + Length entry_end = length_add(entry.position, ts_subtree_size(*entry.subtree)); + bool at_goal = entry_end.bytes >= goal_byte && point_gte(entry_end.extent, goal_point); uint32_t visible_child_count = ts_subtree_visible_child_count(*entry.subtree); if (at_goal) { if (visible) { @@ -203,7 +172,15 @@ int64_t ts_tree_cursor_goto_first_child_for_point(TSTreeCursor *_self, TSPoint g return -1; } -bool ts_tree_cursor_goto_next_sibling(TSTreeCursor *_self) { +int64_t ts_tree_cursor_goto_first_child_for_byte(TSTreeCursor *self, uint32_t goal_byte) { + return ts_tree_cursor_goto_first_child_for_byte_and_point(self, goal_byte, POINT_ZERO); +} + +int64_t ts_tree_cursor_goto_first_child_for_point(TSTreeCursor *self, TSPoint goal_point) { + return ts_tree_cursor_goto_first_child_for_byte_and_point(self, 0, goal_point); +} + +TreeCursorStep ts_tree_cursor_goto_next_sibling_internal(TSTreeCursor *_self) { TreeCursor 
*self = (TreeCursor *)_self; uint32_t initial_size = self->stack.size; @@ -221,19 +198,30 @@ bool ts_tree_cursor_goto_next_sibling(TSTreeCursor *_self) { while (ts_tree_cursor_child_iterator_next(&iterator, &entry, &visible)) { if (visible) { array_push(&self->stack, entry); - return true; + return TreeCursorStepVisible; } if (ts_subtree_visible_child_count(*entry.subtree)) { array_push(&self->stack, entry); - ts_tree_cursor_goto_first_child(_self); - return true; + return TreeCursorStepHidden; } } } self->stack.size = initial_size; - return false; + return TreeCursorStepNone; +} + +bool ts_tree_cursor_goto_next_sibling(TSTreeCursor *self) { + switch (ts_tree_cursor_goto_next_sibling_internal(self)) { + case TreeCursorStepHidden: + ts_tree_cursor_goto_first_child(self); + return true; + case TreeCursorStepVisible: + return true; + default: + return false; + } } bool ts_tree_cursor_goto_parent(TSTreeCursor *_self) { diff --git a/lib/src/tree_cursor.h b/lib/src/tree_cursor.h index 69647d1d..7b94db6b 100644 --- a/lib/src/tree_cursor.h +++ b/lib/src/tree_cursor.h @@ -15,6 +15,12 @@ typedef struct { Array(TreeCursorEntry) stack; } TreeCursor; +typedef enum { + TreeCursorStepNone, + TreeCursorStepHidden, + TreeCursorStepVisible, +} TreeCursorStep; + void ts_tree_cursor_init(TreeCursor *, TSNode); void ts_tree_cursor_current_status( const TSTreeCursor *, @@ -26,6 +32,15 @@ void ts_tree_cursor_current_status( unsigned * ); +TreeCursorStep ts_tree_cursor_goto_first_child_internal(TSTreeCursor *); +TreeCursorStep ts_tree_cursor_goto_next_sibling_internal(TSTreeCursor *); + +static inline Subtree ts_tree_cursor_current_subtree(const TSTreeCursor *_self) { + const TreeCursor *self = (const TreeCursor *)_self; + TreeCursorEntry *last_entry = array_back(&self->stack); + return *last_entry->subtree; +} + TSNode ts_tree_cursor_parent_node(const TSTreeCursor *); #endif // TREE_SITTER_TREE_CURSOR_H_ From fa869cf3eddac07d82bfd48f7fda0a0705087a51 Mon Sep 17 00:00:00 2001 From: Max 
Brunsfeld Date: Wed, 15 Feb 2023 14:03:15 -0800 Subject: [PATCH 5/9] Restructure query_cursor_advance to explicitly control which hidden nodes it descends into --- lib/src/query.c | 914 ++++++++++++++++++++++++---------------------- lib/src/subtree.h | 6 + 2 files changed, 492 insertions(+), 428 deletions(-) diff --git a/lib/src/query.c b/lib/src/query.c index cbc9add6..04a59f9a 100644 --- a/lib/src/query.c +++ b/lib/src/query.c @@ -309,6 +309,7 @@ struct TSQueryCursor { TSPoint start_point; TSPoint end_point; uint32_t next_state_id; + bool on_visible_node; bool ascending; bool halted; bool did_exceed_match_limit; @@ -1163,12 +1164,12 @@ static void ts_query__perform_analysis( #ifdef DEBUG_ANALYZE_QUERY printf("Iteration: %u. Final step indices:", iteration); - for (unsigned j = 0; j < final_step_indices->size; j++) { - printf(" %4u", final_step_indices->contents[j]); + for (unsigned j = 0; j < analysis->final_step_indices.size; j++) { + printf(" %4u", analysis->final_step_indices.contents[j]); } printf("\n"); - for (unsigned j = 0; j < states->size; j++) { - AnalysisState *state = states->contents[j]; + for (unsigned j = 0; j < analysis->states.size; j++) { + AnalysisState *state = analysis->states.contents[j]; printf(" %3u: step: %u, stack: [", j, state->step_index); for (unsigned k = 0; k < state->depth; k++) { printf( @@ -1710,7 +1711,10 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { } #ifdef DEBUG_ANALYZE_QUERY - printf("\nWalk states for %s:\n", ts_language_symbol_name(self->language, states.contents[0]->stack[0].parent_symbol)); + printf( + "\nWalk states for %s:\n", + ts_language_symbol_name(self->language, analysis.states.contents[0]->stack[0].parent_symbol) + ); #endif analysis.did_abort = false; @@ -1911,7 +1915,7 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { #ifdef DEBUG_ANALYZE_QUERY if (self->repeat_symbols_with_rootless_patterns.size > 0) { printf("\nRepetition symbols with 
rootless patterns:\n"); - printf("aborted analysis: %d\n", analyzer.did_abort); + printf("aborted analysis: %d\n", analysis.did_abort); for (unsigned i = 0; i < self->repeat_symbols_with_rootless_patterns.size; i++) { TSSymbol symbol = self->repeat_symbols_with_rootless_patterns.contents[i]; printf(" %u, %s\n", symbol, ts_language_symbol_name(self->language, symbol)); @@ -2986,6 +2990,7 @@ void ts_query_cursor_exec( array_clear(&self->finished_states); ts_tree_cursor_reset(&self->cursor, node); capture_list_pool_reset(&self->capture_list_pool); + self->on_visible_node = true; self->next_state_id = 0; self->depth = 0; self->ascending = false; @@ -3320,6 +3325,50 @@ static QueryState *ts_query_cursor__copy_state( return &self->states.contents[state_index + 1]; } +static inline bool ts_query_cursor__should_descend_outside_of_range( + TSQueryCursor *self +) { + // If there are in-progress matches whose remaining steps occur + // deeper in the tree, then descend. + for (unsigned i = 0; i < self->states.size; i++) { + QueryState *state = &self->states.contents[i];; + QueryStep *next_step = &self->query->steps.contents[state->step_index]; + if ( + next_step->depth != PATTERN_DONE_MARKER && + state->start_depth + next_step->depth > self->depth + ) { + return true; + } + } + + // If the current node is hidden, then a non-rooted pattern might match + // one if its roots inside of this node, and match another of its roots + // as part of a sibling node, so we may need to descend. + if (!self->on_visible_node) { + // Descending into a repetition node outside of the range can be + // expensive, because these nodes can have many visible children. + // Avoid descending into repetition nodes unless we have already + // determined that this query can match rootless patterns inside + // of this type of repetition node. 
+ Subtree subtree = ts_tree_cursor_current_subtree(&self->cursor); + if (ts_subtree_is_repetition(subtree)) { + bool exists; + uint32_t index; + array_search_sorted_by( + &self->query->repeat_symbols_with_rootless_patterns,, + ts_subtree_symbol(subtree), + &index, + &exists + ); + return exists; + } + + return true; + } + + return false; +} + // Walk the tree, processing patterns until at least one pattern finishes, // If one or more patterns finish, return `true` and store their states in the // `finished_states` array. Multiple patterns can finish on the same node. If @@ -3351,219 +3400,49 @@ static inline bool ts_query_cursor__advance( ); // Leave this node by stepping to its next sibling or to its parent. - if (ts_tree_cursor_goto_next_sibling(&self->cursor)) { - self->ascending = false; - } else if (ts_tree_cursor_goto_parent(&self->cursor)) { - self->depth--; - } else { - LOG("halt at root\n"); - self->halted = true; - } - - // After leaving a node, remove any states that cannot make further progress. - uint32_t deleted_count = 0; - for (unsigned i = 0, n = self->states.size; i < n; i++) { - QueryState *state = &self->states.contents[i]; - QueryStep *step = &self->query->steps.contents[state->step_index]; - - // If a state completed its pattern inside of this node, but was deferred from finishing - // in order to search for longer matches, mark it as finished. - if (step->depth == PATTERN_DONE_MARKER) { - if (state->start_depth > self->depth || self->halted) { - LOG(" finish pattern %u\n", state->pattern_index); - array_push(&self->finished_states, *state); - did_match = true; - deleted_count++; - continue; - } - } - - // If a state needed to match something within this node, then remove that state - // as it has failed to match. - else if ((uint32_t)state->start_depth + (uint32_t)step->depth > self->depth) { - LOG( - " failed to match. 
pattern:%u, step:%u\n", - state->pattern_index, - state->step_index - ); - capture_list_pool_release( - &self->capture_list_pool, - state->capture_list_id - ); - deleted_count++; - continue; - } - - if (deleted_count > 0) { - self->states.contents[i - deleted_count] = *state; - } - } - self->states.size -= deleted_count; - } - - // Enter a new node. - else { - // Get the properties of the current node. - TSNode node = ts_tree_cursor_current_node(&self->cursor); - TSNode parent_node = ts_tree_cursor_parent_node(&self->cursor); - TSSymbol symbol = ts_node_symbol(node); - bool is_named = ts_node_is_named(node); - bool has_later_siblings; - bool has_later_named_siblings; - bool can_have_later_siblings_with_this_field; - TSFieldId field_id = 0; - TSSymbol supertypes[8] = {0}; - unsigned supertype_count = 8; - ts_tree_cursor_current_status( - &self->cursor, - &field_id, - &has_later_siblings, - &has_later_named_siblings, - &can_have_later_siblings_with_this_field, - supertypes, - &supertype_count - ); - LOG( - "enter node. 
depth:%u, type:%s, field:%s, row:%u state_count:%u, finished_state_count:%u\n", - self->depth, - ts_node_type(node), - ts_language_field_name_for_id(self->query->language, field_id), - ts_node_start_point(node).row, - self->states.size, - self->finished_states.size - ); - - bool parent_intersects_range = ts_node_is_null(parent_node) || ( - ts_node_end_byte(parent_node) > self->start_byte && - ts_node_start_byte(parent_node) < self->end_byte && - point_gt(ts_node_end_point(parent_node), self->start_point) && - point_lt(ts_node_start_point(parent_node), self->end_point) - ); - bool node_intersects_range = parent_intersects_range && ( - ts_node_end_byte(node) > self->start_byte && - ts_node_start_byte(node) < self->end_byte && - point_gt(ts_node_end_point(node), self->start_point) && - point_lt(ts_node_start_point(node), self->end_point) - ); - bool node_is_error = symbol == ts_builtin_sym_error; - bool parent_is_error = - !ts_node_is_null(parent_node) && - ts_node_symbol(parent_node) == ts_builtin_sym_error; - - // Add new states for any patterns whose root node is a wildcard. - if (!node_is_error) { - for (unsigned i = 0; i < self->query->wildcard_root_pattern_count; i++) { - PatternEntry *pattern = &self->query->pattern_map.contents[i]; - - // If this node matches the first step of the pattern, then add a new - // state at the start of this pattern. - QueryStep *step = &self->query->steps.contents[pattern->step_index]; - if ( - (pattern->is_rooted ? - node_intersects_range : - (parent_intersects_range && !parent_is_error)) && - (!step->field || field_id == step->field) && - (!step->supertype_symbol || supertype_count > 0) - ) { - ts_query_cursor__add_state(self, pattern); - } - } - } - - // Add new states for any patterns whose root node matches this node. 
- unsigned i; - if (ts_query__pattern_map_search(self->query, symbol, &i)) { - PatternEntry *pattern = &self->query->pattern_map.contents[i]; - - QueryStep *step = &self->query->steps.contents[pattern->step_index]; - do { - // If this node matches the first step of the pattern, then add a new - // state at the start of this pattern. - if ( - (pattern->is_rooted ? - node_intersects_range : - (parent_intersects_range && !parent_is_error)) && - (!step->field || field_id == step->field) - ) { - ts_query_cursor__add_state(self, pattern); - } - - // Advance to the next pattern whose root node matches this node. - i++; - if (i == self->query->pattern_map.size) break; - pattern = &self->query->pattern_map.contents[i]; - step = &self->query->steps.contents[pattern->step_index]; - } while (step->symbol == symbol); - } - - // Update all of the in-progress states with current node. - for (unsigned i = 0, copy_count = 0; i < self->states.size; i += 1 + copy_count) { - QueryState *state = &self->states.contents[i]; - QueryStep *step = &self->query->steps.contents[state->step_index]; - state->has_in_progress_alternatives = false; - copy_count = 0; - - // Check that the node matches all of the criteria for the next - // step of the pattern. - if ((uint32_t)state->start_depth + (uint32_t)step->depth != self->depth) continue; - - // Determine if this node matches this step of the pattern, and also - // if this node can have later siblings that match this step of the - // pattern. 
- bool node_does_match = false; - if (step->symbol == WILDCARD_SYMBOL) { - node_does_match = !node_is_error && (is_named || !step->is_named); - } else { - node_does_match = symbol == step->symbol; - } - bool later_sibling_can_match = has_later_siblings; - if ((step->is_immediate && is_named) || state->seeking_immediate_match) { - later_sibling_can_match = false; - } - if (step->is_last_child && has_later_named_siblings) { - node_does_match = false; - } - if (step->supertype_symbol) { - bool has_supertype = false; - for (unsigned j = 0; j < supertype_count; j++) { - if (supertypes[j] == step->supertype_symbol) { - has_supertype = true; - break; - } - } - if (!has_supertype) node_does_match = false; - } - if (step->field) { - if (step->field == field_id) { - if (!can_have_later_siblings_with_this_field) { - later_sibling_can_match = false; - } + switch (ts_tree_cursor_goto_next_sibling_internal(&self->cursor)) { + case TreeCursorStepVisible: + self->on_visible_node = true; + self->ascending = false; + break; + case TreeCursorStepHidden: + self->depth--; + self->on_visible_node = false; + self->ascending = false; + break; + default: + if (ts_tree_cursor_goto_parent(&self->cursor)) { + self->depth--; } else { - node_does_match = false; + LOG("halt at root\n"); + self->halted = true; } - } + } - if (step->negated_field_list_id) { - TSFieldId *negated_field_ids = &self->query->negated_fields.contents[step->negated_field_list_id]; - for (;;) { - TSFieldId negated_field_id = *negated_field_ids; - if (negated_field_id) { - negated_field_ids++; - if (ts_node_child_by_field_id(node, negated_field_id).id) { - node_does_match = false; - break; - } - } else { - break; + if (self->on_visible_node) { + // After leaving a node, remove any states that cannot make further progress. 
+ uint32_t deleted_count = 0; + for (unsigned i = 0, n = self->states.size; i < n; i++) { + QueryState *state = &self->states.contents[i]; + QueryStep *step = &self->query->steps.contents[state->step_index]; + + // If a state completed its pattern inside of this node, but was deferred from finishing + // in order to search for longer matches, mark it as finished. + if (step->depth == PATTERN_DONE_MARKER) { + if (state->start_depth > self->depth || self->halted) { + LOG(" finish pattern %u\n", state->pattern_index); + array_push(&self->finished_states, *state); + did_match = true; + deleted_count++; + continue; } } - } - // Remove states immediately if it is ever clear that they cannot match. - if (!node_does_match) { - if (!later_sibling_can_match) { + // If a state needed to match something within this node, then remove that state + // as it has failed to match. + else if ((uint32_t)state->start_depth + (uint32_t)step->depth > self->depth) { LOG( - " discard state. pattern:%u, step:%u\n", + " failed to match. pattern:%u, step:%u\n", state->pattern_index, state->step_index ); @@ -3571,249 +3450,428 @@ static inline bool ts_query_cursor__advance( &self->capture_list_pool, state->capture_list_id ); - array_erase(&self->states, i); - i--; + deleted_count++; + continue; } - continue; - } - // Some patterns can match their root node in multiple ways, capturing different - // children. If this pattern step could match later children within the same - // parent, then this query state cannot simply be updated in place. It must be - // split into two states: one that matches this node, and one which skips over - // this node, to preserve the possibility of matching later siblings. - if (later_sibling_can_match && ( - step->contains_captures || - ts_query__step_is_fallible(self->query, state->step_index) - )) { - if (ts_query_cursor__copy_state(self, &state)) { - LOG( - " split state for capture. 
pattern:%u, step:%u\n", - state->pattern_index, - state->step_index - ); - copy_count++; + if (deleted_count > 0) { + self->states.contents[i - deleted_count] = *state; } } + self->states.size -= deleted_count; + } + } - // If this pattern started with a wildcard, such that the pattern map - // actually points to the *second* step of the pattern, then check - // that the node has a parent, and capture the parent node if necessary. - if (state->needs_parent) { - TSNode parent = ts_tree_cursor_parent_node(&self->cursor); - if (ts_node_is_null(parent)) { - LOG(" missing parent node\n"); - state->dead = true; - } else { - state->needs_parent = false; - QueryStep *skipped_wildcard_step = step; - do { - skipped_wildcard_step--; - } while ( - skipped_wildcard_step->is_dead_end || - skipped_wildcard_step->is_pass_through || - skipped_wildcard_step->depth > 0 - ); - if (skipped_wildcard_step->capture_ids[0] != NONE) { - LOG(" capture wildcard parent\n"); - ts_query_cursor__capture( - self, - state, - skipped_wildcard_step, - parent - ); - } - } - } + // Enter a new node. + else { + // Get the properties of the current node. + TSNode node = ts_tree_cursor_current_node(&self->cursor); + TSNode parent_node = ts_tree_cursor_parent_node(&self->cursor); - // If the current node is captured in this pattern, add it to the capture list. 
- if (step->capture_ids[0] != NONE) { - ts_query_cursor__capture(self, state, step, node); - } + bool parent_precedes_range = !ts_node_is_null(parent_node) && ( + ts_node_end_byte(parent_node) <= self->start_byte || + point_lte(ts_node_end_point(parent_node), self->start_point) + ); + bool parent_follows_range = !ts_node_is_null(parent_node) && ( + ts_node_start_byte(parent_node) >= self->end_byte || + point_gte(ts_node_start_point(parent_node), self->end_point) + ); + bool node_precedes_range = parent_precedes_range || ( + ts_node_end_byte(node) <= self->start_byte || + point_lte(ts_node_end_point(node), self->start_point) + ); + bool node_follows_range = parent_follows_range || ( + ts_node_start_byte(node) >= self->end_byte || + point_gte(ts_node_start_point(node), self->end_point) + ); + bool parent_intersects_range = !parent_precedes_range && !parent_follows_range; + bool node_intersects_range = !node_precedes_range && !node_follows_range; - if (state->dead) { - array_erase(&self->states, i); - i--; - continue; - } - - // Advance this state to the next step of its pattern. - state->step_index++; - state->seeking_immediate_match = false; + if (self->on_visible_node) { + TSSymbol symbol = ts_node_symbol(node); + bool is_named = ts_node_is_named(node); + bool has_later_siblings; + bool has_later_named_siblings; + bool can_have_later_siblings_with_this_field; + TSFieldId field_id = 0; + TSSymbol supertypes[8] = {0}; + unsigned supertype_count = 8; + ts_tree_cursor_current_status( + &self->cursor, + &field_id, + &has_later_siblings, + &has_later_named_siblings, + &can_have_later_siblings_with_this_field, + supertypes, + &supertype_count + ); LOG( - " advance state. pattern:%u, step:%u\n", - state->pattern_index, - state->step_index + "enter node. 
depth:%u, type:%s, field:%s, row:%u state_count:%u, finished_state_count:%u\n", + self->depth, + ts_node_type(node), + ts_language_field_name_for_id(self->query->language, field_id), + ts_node_start_point(node).row, + self->states.size, + self->finished_states.size ); - QueryStep *next_step = &self->query->steps.contents[state->step_index]; - if (stop_on_definite_step && next_step->root_pattern_guaranteed) did_match = true; + bool node_is_error = symbol == ts_builtin_sym_error; + bool parent_is_error = + !ts_node_is_null(parent_node) && + ts_node_symbol(parent_node) == ts_builtin_sym_error; - // If this state's next step has an alternative step, then copy the state in order - // to pursue both alternatives. The alternative step itself may have an alternative, - // so this is an interactive process. - unsigned end_index = i + 1; - for (unsigned j = i; j < end_index; j++) { - QueryState *state = &self->states.contents[j]; - QueryStep *next_step = &self->query->steps.contents[state->step_index]; - if (next_step->alternative_index != NONE) { - // A "dead-end" step exists only to add a non-sequential jump into the step sequence, - // via its alternative index. When a state reaches a dead-end step, it jumps straight - // to the step's alternative. - if (next_step->is_dead_end) { - state->step_index = next_step->alternative_index; - j--; - continue; + // Add new states for any patterns whose root node is a wildcard. + if (!node_is_error) { + for (unsigned i = 0; i < self->query->wildcard_root_pattern_count; i++) { + PatternEntry *pattern = &self->query->pattern_map.contents[i]; + + // If this node matches the first step of the pattern, then add a new + // state at the start of this pattern. + QueryStep *step = &self->query->steps.contents[pattern->step_index]; + if ( + (pattern->is_rooted ? 
+ node_intersects_range : + (parent_intersects_range && !parent_is_error)) && + (!step->field || field_id == step->field) && + (!step->supertype_symbol || supertype_count > 0) + ) { + ts_query_cursor__add_state(self, pattern); + } + } + } + + // Add new states for any patterns whose root node matches this node. + unsigned i; + if (ts_query__pattern_map_search(self->query, symbol, &i)) { + PatternEntry *pattern = &self->query->pattern_map.contents[i]; + + QueryStep *step = &self->query->steps.contents[pattern->step_index]; + do { + // If this node matches the first step of the pattern, then add a new + // state at the start of this pattern. + if ( + (pattern->is_rooted ? + node_intersects_range : + (parent_intersects_range && !parent_is_error)) && + (!step->field || field_id == step->field) + ) { + ts_query_cursor__add_state(self, pattern); } - // A "pass-through" step exists only to add a branch into the step sequence, - // via its alternative_index. When a state reaches a pass-through step, it splits - // in order to process the alternative step, and then it advances to the next step. - if (next_step->is_pass_through) { - state->step_index++; - j--; - } + // Advance to the next pattern whose root node matches this node. + i++; + if (i == self->query->pattern_map.size) break; + pattern = &self->query->pattern_map.contents[i]; + step = &self->query->steps.contents[pattern->step_index]; + } while (step->symbol == symbol); + } - QueryState *copy = ts_query_cursor__copy_state(self, &state); - if (copy) { + // Update all of the in-progress states with current node. + for (unsigned i = 0, copy_count = 0; i < self->states.size; i += 1 + copy_count) { + QueryState *state = &self->states.contents[i]; + QueryStep *step = &self->query->steps.contents[state->step_index]; + state->has_in_progress_alternatives = false; + copy_count = 0; + + // Check that the node matches all of the criteria for the next + // step of the pattern. 
+ if ((uint32_t)state->start_depth + (uint32_t)step->depth != self->depth) continue; + + // Determine if this node matches this step of the pattern, and also + // if this node can have later siblings that match this step of the + // pattern. + bool node_does_match = false; + if (step->symbol == WILDCARD_SYMBOL) { + node_does_match = !node_is_error && (is_named || !step->is_named); + } else { + node_does_match = symbol == step->symbol; + } + bool later_sibling_can_match = has_later_siblings; + if ((step->is_immediate && is_named) || state->seeking_immediate_match) { + later_sibling_can_match = false; + } + if (step->is_last_child && has_later_named_siblings) { + node_does_match = false; + } + if (step->supertype_symbol) { + bool has_supertype = false; + for (unsigned j = 0; j < supertype_count; j++) { + if (supertypes[j] == step->supertype_symbol) { + has_supertype = true; + break; + } + } + if (!has_supertype) node_does_match = false; + } + if (step->field) { + if (step->field == field_id) { + if (!can_have_later_siblings_with_this_field) { + later_sibling_can_match = false; + } + } else { + node_does_match = false; + } + } + + if (step->negated_field_list_id) { + TSFieldId *negated_field_ids = &self->query->negated_fields.contents[step->negated_field_list_id]; + for (;;) { + TSFieldId negated_field_id = *negated_field_ids; + if (negated_field_id) { + negated_field_ids++; + if (ts_node_child_by_field_id(node, negated_field_id).id) { + node_does_match = false; + break; + } + } else { + break; + } + } + } + + // Remove states immediately if it is ever clear that they cannot match. + if (!node_does_match) { + if (!later_sibling_can_match) { LOG( - " split state for branch. pattern:%u, from_step:%u, to_step:%u, immediate:%d, capture_count: %u\n", - copy->pattern_index, - copy->step_index, - next_step->alternative_index, - next_step->alternative_is_immediate, - capture_list_pool_get(&self->capture_list_pool, copy->capture_list_id)->size + " discard state. 
pattern:%u, step:%u\n", + state->pattern_index, + state->step_index + ); + capture_list_pool_release( + &self->capture_list_pool, + state->capture_list_id + ); + array_erase(&self->states, i); + i--; + } + continue; + } + + // Some patterns can match their root node in multiple ways, capturing different + // children. If this pattern step could match later children within the same + // parent, then this query state cannot simply be updated in place. It must be + // split into two states: one that matches this node, and one which skips over + // this node, to preserve the possibility of matching later siblings. + if (later_sibling_can_match && ( + step->contains_captures || + ts_query__step_is_fallible(self->query, state->step_index) + )) { + if (ts_query_cursor__copy_state(self, &state)) { + LOG( + " split state for capture. pattern:%u, step:%u\n", + state->pattern_index, + state->step_index ); - end_index++; copy_count++; - copy->step_index = next_step->alternative_index; - if (next_step->alternative_is_immediate) { - copy->seeking_immediate_match = true; + } + } + + // If this pattern started with a wildcard, such that the pattern map + // actually points to the *second* step of the pattern, then check + // that the node has a parent, and capture the parent node if necessary. 
+ if (state->needs_parent) { + TSNode parent = ts_tree_cursor_parent_node(&self->cursor); + if (ts_node_is_null(parent)) { + LOG(" missing parent node\n"); + state->dead = true; + } else { + state->needs_parent = false; + QueryStep *skipped_wildcard_step = step; + do { + skipped_wildcard_step--; + } while ( + skipped_wildcard_step->is_dead_end || + skipped_wildcard_step->is_pass_through || + skipped_wildcard_step->depth > 0 + ); + if (skipped_wildcard_step->capture_ids[0] != NONE) { + LOG(" capture wildcard parent\n"); + ts_query_cursor__capture( + self, + state, + skipped_wildcard_step, + parent + ); + } + } + } + + // If the current node is captured in this pattern, add it to the capture list. + if (step->capture_ids[0] != NONE) { + ts_query_cursor__capture(self, state, step, node); + } + + if (state->dead) { + array_erase(&self->states, i); + i--; + continue; + } + + // Advance this state to the next step of its pattern. + state->step_index++; + state->seeking_immediate_match = false; + LOG( + " advance state. pattern:%u, step:%u\n", + state->pattern_index, + state->step_index + ); + + QueryStep *next_step = &self->query->steps.contents[state->step_index]; + if (stop_on_definite_step && next_step->root_pattern_guaranteed) did_match = true; + + // If this state's next step has an alternative step, then copy the state in order + // to pursue both alternatives. The alternative step itself may have an alternative, + // so this is an interactive process. + unsigned end_index = i + 1; + for (unsigned j = i; j < end_index; j++) { + QueryState *state = &self->states.contents[j]; + QueryStep *next_step = &self->query->steps.contents[state->step_index]; + if (next_step->alternative_index != NONE) { + // A "dead-end" step exists only to add a non-sequential jump into the step sequence, + // via its alternative index. When a state reaches a dead-end step, it jumps straight + // to the step's alternative. 
+ if (next_step->is_dead_end) { + state->step_index = next_step->alternative_index; + j--; + continue; + } + + // A "pass-through" step exists only to add a branch into the step sequence, + // via its alternative_index. When a state reaches a pass-through step, it splits + // in order to process the alternative step, and then it advances to the next step. + if (next_step->is_pass_through) { + state->step_index++; + j--; + } + + QueryState *copy = ts_query_cursor__copy_state(self, &state); + if (copy) { + LOG( + " split state for branch. pattern:%u, from_step:%u, to_step:%u, immediate:%d, capture_count: %u\n", + copy->pattern_index, + copy->step_index, + next_step->alternative_index, + next_step->alternative_is_immediate, + capture_list_pool_get(&self->capture_list_pool, copy->capture_list_id)->size + ); + end_index++; + copy_count++; + copy->step_index = next_step->alternative_index; + if (next_step->alternative_is_immediate) { + copy->seeking_immediate_match = true; + } + } + } + } + } + + for (unsigned i = 0; i < self->states.size; i++) { + QueryState *state = &self->states.contents[i]; + if (state->dead) { + array_erase(&self->states, i); + i--; + continue; + } + + // Enfore the longest-match criteria. When a query pattern contains optional or + // repeated nodes, this is necessary to avoid multiple redundant states, where + // one state has a strict subset of another state's captures. + bool did_remove = false; + for (unsigned j = i + 1; j < self->states.size; j++) { + QueryState *other_state = &self->states.contents[j]; + + // Query states are kept in ascending order of start_depth and pattern_index. + // Since the longest-match criteria is only used for deduping matches of the same + // pattern and root node, we only need to perform pairwise comparisons within a + // small slice of the states array. 
+ if ( + other_state->start_depth != state->start_depth || + other_state->pattern_index != state->pattern_index + ) break; + + bool left_contains_right, right_contains_left; + ts_query_cursor__compare_captures( + self, + state, + other_state, + &left_contains_right, + &right_contains_left + ); + if (left_contains_right) { + if (state->step_index == other_state->step_index) { + LOG( + " drop shorter state. pattern: %u, step_index: %u\n", + state->pattern_index, + state->step_index + ); + capture_list_pool_release(&self->capture_list_pool, other_state->capture_list_id); + array_erase(&self->states, j); + j--; + continue; + } + other_state->has_in_progress_alternatives = true; + } + if (right_contains_left) { + if (state->step_index == other_state->step_index) { + LOG( + " drop shorter state. pattern: %u, step_index: %u\n", + state->pattern_index, + state->step_index + ); + capture_list_pool_release(&self->capture_list_pool, state->capture_list_id); + array_erase(&self->states, i); + i--; + did_remove = true; + break; + } + state->has_in_progress_alternatives = true; + } + } + + // If the state is at the end of its pattern, remove it from the list + // of in-progress states and add it to the list of finished states. + if (!did_remove) { + LOG( + " keep state. 
pattern: %u, start_depth: %u, step_index: %u, capture_count: %u\n", + state->pattern_index, + state->start_depth, + state->step_index, + capture_list_pool_get(&self->capture_list_pool, state->capture_list_id)->size + ); + QueryStep *next_step = &self->query->steps.contents[state->step_index]; + if (next_step->depth == PATTERN_DONE_MARKER) { + if (state->has_in_progress_alternatives) { + LOG(" defer finishing pattern %u\n", state->pattern_index); + } else { + LOG(" finish pattern %u\n", state->pattern_index); + array_push(&self->finished_states, *state); + array_erase(&self->states, (uint32_t)(state - self->states.contents)); + did_match = true; + i--; } } } } } - for (unsigned i = 0; i < self->states.size; i++) { - QueryState *state = &self->states.contents[i]; - if (state->dead) { - array_erase(&self->states, i); - i--; - continue; - } - - // Enfore the longest-match criteria. When a query pattern contains optional or - // repeated nodes, this is necessary to avoid multiple redundant states, where - // one state has a strict subset of another state's captures. - bool did_remove = false; - for (unsigned j = i + 1; j < self->states.size; j++) { - QueryState *other_state = &self->states.contents[j]; - - // Query states are kept in ascending order of start_depth and pattern_index. - // Since the longest-match criteria is only used for deduping matches of the same - // pattern and root node, we only need to perform pairwise comparisons within a - // small slice of the states array. - if ( - other_state->start_depth != state->start_depth || - other_state->pattern_index != state->pattern_index - ) break; - - bool left_contains_right, right_contains_left; - ts_query_cursor__compare_captures( - self, - state, - other_state, - &left_contains_right, - &right_contains_left - ); - if (left_contains_right) { - if (state->step_index == other_state->step_index) { - LOG( - " drop shorter state. 
pattern: %u, step_index: %u\n", - state->pattern_index, - state->step_index - ); - capture_list_pool_release(&self->capture_list_pool, other_state->capture_list_id); - array_erase(&self->states, j); - j--; - continue; - } - other_state->has_in_progress_alternatives = true; - } - if (right_contains_left) { - if (state->step_index == other_state->step_index) { - LOG( - " drop shorter state. pattern: %u, step_index: %u\n", - state->pattern_index, - state->step_index - ); - capture_list_pool_release(&self->capture_list_pool, state->capture_list_id); - array_erase(&self->states, i); - i--; - did_remove = true; - break; - } - state->has_in_progress_alternatives = true; - } - } - - // If the state is at the end of its pattern, remove it from the list - // of in-progress states and add it to the list of finished states. - if (!did_remove) { - LOG( - " keep state. pattern: %u, start_depth: %u, step_index: %u, capture_count: %u\n", - state->pattern_index, - state->start_depth, - state->step_index, - capture_list_pool_get(&self->capture_list_pool, state->capture_list_id)->size - ); - QueryStep *next_step = &self->query->steps.contents[state->step_index]; - if (next_step->depth == PATTERN_DONE_MARKER) { - if (state->has_in_progress_alternatives) { - LOG(" defer finishing pattern %u\n", state->pattern_index); - } else { - LOG(" finish pattern %u\n", state->pattern_index); - array_push(&self->finished_states, *state); - array_erase(&self->states, (uint32_t)(state - self->states.contents)); - did_match = true; - i--; - } - } - } - } - - // When the current node ends prior to the desired start offset, - // only descend for the purpose of continuing in-progress matches. 
- bool has_in_progress_matches = false; - if (!node_intersects_range) { - for (unsigned i = 0; i < self->states.size; i++) { - QueryState *state = &self->states.contents[i];; - QueryStep *next_step = &self->query->steps.contents[state->step_index]; - if ( - next_step->depth != PATTERN_DONE_MARKER && - state->start_depth + next_step->depth > self->depth - ) { - has_in_progress_matches = true; + bool should_descend = + node_intersects_range || + ts_query_cursor__should_descend_outside_of_range(self); + if (should_descend) { + switch (ts_tree_cursor_goto_first_child_internal(&self->cursor)) { + case TreeCursorStepVisible: + self->depth++; + self->on_visible_node = true; + continue; + case TreeCursorStepHidden: + self->on_visible_node = false; + continue; + default: break; - } } } - bool should_descend = node_intersects_range || has_in_progress_matches; - if (!should_descend) { - LOG( - " not descending. node end byte: %u, start byte: %u\n", - ts_node_end_byte(node), - self->start_byte - ); - } - - if (should_descend && ts_tree_cursor_goto_first_child(&self->cursor)) { - self->depth++; - } else { - self->ascending = true; - } + self->ascending = true; } } } diff --git a/lib/src/subtree.h b/lib/src/subtree.h index 8456d2f1..a0e838eb 100644 --- a/lib/src/subtree.h +++ b/lib/src/subtree.h @@ -291,6 +291,12 @@ static inline uint32_t ts_subtree_repeat_depth(Subtree self) { return self.data.is_inline ? 0 : self.ptr->repeat_depth; } +static inline uint32_t ts_subtree_is_repetition(Subtree self) { + return self.data.is_inline + ? 0 + : !self.ptr->named && !self.ptr->visible && self.ptr->child_count != 0; +} + static inline uint32_t ts_subtree_node_count(Subtree self) { return (self.data.is_inline || self.ptr->child_count == 0) ? 
1 : self.ptr->node_count; } From bd63fb2a0d837bb5ae254ce7749d63c58ebac945 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 15 Feb 2023 14:03:36 -0800 Subject: [PATCH 6/9] Tweak query tests --- Cargo.lock | 7 +++++ cli/Cargo.toml | 3 ++- cli/src/tests/query_test.rs | 51 ++++++++++++++++++++++++------------- lib/Cargo.toml | 2 +- 4 files changed, 44 insertions(+), 19 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f5c4e7e4..ca773788 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -584,6 +584,7 @@ dependencies = [ "tree-sitter-highlight", "tree-sitter-loader", "tree-sitter-tags", + "unindent", "walkdir", "webbrowser", "which", @@ -647,6 +648,12 @@ version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" +[[package]] +name = "unindent" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aa30f5ea51ff7edfc797c6d3f9ec8cbd8cfedef5371766b7181d33977f4814f" + [[package]] name = "utf8-width" version = "0.1.6" diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 5403075d..77cf52e4 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -3,7 +3,7 @@ name = "tree-sitter-cli" description = "CLI tool for developing, testing, and using Tree-sitter parsers" version = "0.20.7" authors = ["Max Brunsfeld "] -edition = "2018" +edition = "2021" license = "MIT" readme = "README.md" keywords = ["incremental", "parsing"] @@ -73,6 +73,7 @@ rand = "0.8" tempfile = "3" pretty_assertions = "0.7.2" ctor = "0.1" +unindent = "0.2" [build-dependencies] toml = "0.5" diff --git a/cli/src/tests/query_test.rs b/cli/src/tests/query_test.rs index 31cb8035..63dea5a6 100644 --- a/cli/src/tests/query_test.rs +++ b/cli/src/tests/query_test.rs @@ -2,6 +2,7 @@ use super::helpers::{ allocations, fixtures::get_language, query_helpers::{Match, Pattern}, + ITERATION_COUNT, }; use lazy_static::lazy_static; use rand::{prelude::StdRng, SeedableRng}; @@ -10,6 +11,7 
@@ use tree_sitter::{ CaptureQuantifier, Language, Node, Parser, Point, Query, QueryCapture, QueryCursor, QueryError, QueryErrorKind, QueryMatch, QueryPredicate, QueryPredicateArg, QueryProperty, }; +use unindent::Unindent; lazy_static! { static ref EXAMPLE_FILTER: Option = env::var("TREE_SITTER_TEST_EXAMPLE_FILTER").ok(); @@ -1920,20 +1922,28 @@ fn test_query_matches_within_point_range() { let language = get_language("javascript"); let query = Query::new(language, "(identifier) @element").unwrap(); - let source = "[a, b,\n c, d,\n e, f,\n g]"; + let source = " + [ + a, b, + c, d, + e, f, + g, h, + i, j, + k, l, + ] + " + .unindent(); let mut parser = Parser::new(); parser.set_language(language).unwrap(); let tree = parser.parse(&source, None).unwrap(); - let mut cursor = QueryCursor::new(); let matches = cursor - .set_point_range(Point::new(0, 0)..Point::new(1, 3)) + .set_point_range(Point::new(1, 0)..Point::new(2, 3)) .matches(&query, tree.root_node(), source.as_bytes()); - assert_eq!( - collect_matches(matches, &query, source), + collect_matches(matches, &query, &source), &[ (0, vec![("element", "a")]), (0, vec![("element", "b")]), @@ -1942,11 +1952,10 @@ fn test_query_matches_within_point_range() { ); let matches = cursor - .set_point_range(Point::new(1, 0)..Point::new(2, 3)) + .set_point_range(Point::new(2, 0)..Point::new(3, 3)) .matches(&query, tree.root_node(), source.as_bytes()); - assert_eq!( - collect_matches(matches, &query, source), + collect_matches(matches, &query, &source), &[ (0, vec![("element", "c")]), (0, vec![("element", "d")]), @@ -1954,16 +1963,19 @@ fn test_query_matches_within_point_range() { ] ); + // Zero end point is treated like no end point. 
let matches = cursor - .set_point_range(Point::new(2, 1)..Point::new(0, 0)) + .set_point_range(Point::new(4, 1)..Point::new(0, 0)) .matches(&query, tree.root_node(), source.as_bytes()); - assert_eq!( - collect_matches(matches, &query, source), + collect_matches(matches, &query, &source), &[ - (0, vec![("element", "e")]), - (0, vec![("element", "f")]), (0, vec![("element", "g")]), + (0, vec![("element", "h")]), + (0, vec![("element", "i")]), + (0, vec![("element", "j")]), + (0, vec![("element", "k")]), + (0, vec![("element", "l")]), ] ); }); @@ -3634,17 +3646,22 @@ fn test_query_random() { .parse(include_str!("helpers/query_helpers.rs"), None) .unwrap(); - // let start_seed = *SEED; let start_seed = 0; + let end_seed = start_seed + *ITERATION_COUNT; - for i in 0..100 { - let seed = (start_seed + i) as u64; + for seed in start_seed..(start_seed + end_seed) { + let seed = seed as u64; let mut rand = StdRng::seed_from_u64(seed); let (pattern_ast, _) = Pattern::random_pattern_in_tree(&pattern_tree, &mut rand); let pattern = pattern_ast.to_string(); let expected_matches = pattern_ast.matches_in_tree(&test_tree); - let query = Query::new(language, &pattern).unwrap(); + let query = match Query::new(language, &pattern) { + Ok(query) => query, + Err(e) => { + panic!("failed to build query for pattern {pattern} - {e}. 
seed: {seed}"); + } + }; let mut actual_matches = cursor .matches( &query, diff --git a/lib/Cargo.toml b/lib/Cargo.toml index d096efdc..c2d35685 100644 --- a/lib/Cargo.toml +++ b/lib/Cargo.toml @@ -3,7 +3,7 @@ name = "tree-sitter" description = "Rust bindings to the Tree-sitter parsing library" version = "0.20.9" authors = ["Max Brunsfeld "] -edition = "2018" +edition = "2021" license = "MIT" readme = "binding_rust/README.md" keywords = ["incremental", "parsing"] From 40703f110c7f16650b686fc4c56ab128cf61e449 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 15 Feb 2023 14:40:36 -0800 Subject: [PATCH 7/9] Fix bug in maintenance of query cursor's tree depth --- cli/src/tests/query_test.rs | 3 --- lib/src/query.c | 24 +++++++++++++++--------- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/cli/src/tests/query_test.rs b/cli/src/tests/query_test.rs index 63dea5a6..c691df30 100644 --- a/cli/src/tests/query_test.rs +++ b/cli/src/tests/query_test.rs @@ -1876,7 +1876,6 @@ fn test_query_matches_within_byte_range() { cursor .set_byte_range(0..8) .matches(&query, tree.root_node(), source.as_bytes()); - assert_eq!( collect_matches(matches, &query, source), &[ @@ -1890,7 +1889,6 @@ fn test_query_matches_within_byte_range() { cursor .set_byte_range(5..15) .matches(&query, tree.root_node(), source.as_bytes()); - assert_eq!( collect_matches(matches, &query, source), &[ @@ -1904,7 +1902,6 @@ fn test_query_matches_within_byte_range() { cursor .set_byte_range(12..0) .matches(&query, tree.root_node(), source.as_bytes()); - assert_eq!( collect_matches(matches, &query, source), &[ diff --git a/lib/src/query.c b/lib/src/query.c index 04a59f9a..b2450ce2 100644 --- a/lib/src/query.c +++ b/lib/src/query.c @@ -3393,21 +3393,28 @@ static inline bool ts_query_cursor__advance( // Exit the current node. if (self->ascending) { - LOG( - "leave node. 
depth:%u, type:%s\n", - self->depth, - ts_node_type(ts_tree_cursor_current_node(&self->cursor)) - ); + if (self->on_visible_node) { + LOG( + "leave node. depth:%u, type:%s\n", + self->depth, + ts_node_type(ts_tree_cursor_current_node(&self->cursor)) + ); + } // Leave this node by stepping to its next sibling or to its parent. switch (ts_tree_cursor_goto_next_sibling_internal(&self->cursor)) { case TreeCursorStepVisible: - self->on_visible_node = true; + if (!self->on_visible_node) { + self->depth++; + self->on_visible_node = true; + } self->ascending = false; break; case TreeCursorStepHidden: - self->depth--; - self->on_visible_node = false; + if (self->on_visible_node) { + self->depth--; + self->on_visible_node = false; + } self->ascending = false; break; default: @@ -3467,7 +3474,6 @@ static inline bool ts_query_cursor__advance( // Get the properties of the current node. TSNode node = ts_tree_cursor_current_node(&self->cursor); TSNode parent_node = ts_tree_cursor_parent_node(&self->cursor); - bool parent_precedes_range = !ts_node_is_null(parent_node) && ( ts_node_end_byte(parent_node) <= self->start_byte || point_lte(ts_node_end_point(parent_node), self->start_point) From 837899e456202c6d112679c03e7e989451973a6d Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 15 Feb 2023 18:24:07 -0800 Subject: [PATCH 8/9] Add API for checking if a pattern in a query is non-local --- cli/src/tests/query_test.rs | 62 +++++++++++++++++++++++++++++++++++ lib/binding_rust/bindings.rs | 3 ++ lib/binding_rust/lib.rs | 8 ++++- lib/include/tree_sitter/api.h | 27 ++++++++++----- lib/src/query.c | 24 ++++++++++++-- 5 files changed, 112 insertions(+), 12 deletions(-) diff --git a/cli/src/tests/query_test.rs b/cli/src/tests/query_test.rs index c691df30..e99fe06e 100644 --- a/cli/src/tests/query_test.rs +++ b/cli/src/tests/query_test.rs @@ -4084,6 +4084,68 @@ fn test_query_is_pattern_rooted() { }); } +#[test] +fn test_query_is_pattern_non_local() { + struct Row { + description: 
&'static str, + pattern: &'static str, + is_non_local: bool, + } + + let rows = [ + Row { + description: "simple token", + pattern: r#"(identifier)"#, + is_non_local: false, + }, + Row { + description: "siblings that can occur in an argument list", + pattern: r#"((identifier) (identifier))"#, + is_non_local: true, + }, + Row { + description: "siblings that can occur in a statement block", + pattern: r#"((return_statement) (return_statement))"#, + is_non_local: true, + }, + Row { + description: "siblings that can occur in a source file", + pattern: r#"((function_definition) (class_definition))"#, + is_non_local: true, + }, + Row { + description: "siblings that can't occur in any repetition", + pattern: r#"("{" "}")"#, + is_non_local: false, + }, + ]; + + allocations::record(|| { + eprintln!(""); + + let language = get_language("python"); + for row in &rows { + if let Some(filter) = EXAMPLE_FILTER.as_ref() { + if !row.description.contains(filter.as_str()) { + continue; + } + } + eprintln!(" query example: {:?}", row.description); + let query = Query::new(language, row.pattern).unwrap(); + assert_eq!( + query.is_pattern_non_local(0), + row.is_non_local, + "Description: {}, Pattern: {:?}", + row.description, + row.pattern + .split_ascii_whitespace() + .collect::>() + .join(" "), + ) + } + }); +} + #[test] fn test_capture_quantifiers() { struct Row { diff --git a/lib/binding_rust/bindings.rs b/lib/binding_rust/bindings.rs index 4591a380..be117f83 100644 --- a/lib/binding_rust/bindings.rs +++ b/lib/binding_rust/bindings.rs @@ -677,6 +677,9 @@ extern "C" { length: *mut u32, ) -> *const TSQueryPredicateStep; } +extern "C" { + pub fn ts_query_is_pattern_non_local(self_: *const TSQuery, pattern_index: u32) -> bool; +} extern "C" { pub fn ts_query_is_pattern_rooted(self_: *const TSQuery, pattern_index: u32) -> bool; } diff --git a/lib/binding_rust/lib.rs b/lib/binding_rust/lib.rs index 6f044cca..579bf8e2 100644 --- a/lib/binding_rust/lib.rs +++ b/lib/binding_rust/lib.rs @@ 
-1736,11 +1736,17 @@ impl Query { } /// Check if a given pattern within a query has a single root node. - #[doc(alias = "ts_query_is_pattern_guaranteed_at_step")] + #[doc(alias = "ts_query_is_pattern_rooted")] pub fn is_pattern_rooted(&self, index: usize) -> bool { unsafe { ffi::ts_query_is_pattern_rooted(self.ptr.as_ptr(), index as u32) } } + /// Check if a given pattern within a query is 'non-local', i.e. has multiple root nodes and can match within a repeating sequence of nodes. + #[doc(alias = "ts_query_is_pattern_non_local")] + pub fn is_pattern_non_local(&self, index: usize) -> bool { + unsafe { ffi::ts_query_is_pattern_non_local(self.ptr.as_ptr(), index as u32) } + } + /// Check if a given step in a query is 'definite'. /// /// A query step is 'definite' if its parent pattern will be guaranteed to match diff --git a/lib/include/tree_sitter/api.h b/lib/include/tree_sitter/api.h index 5b48cf60..edc1c36a 100644 --- a/lib/include/tree_sitter/api.h +++ b/lib/include/tree_sitter/api.h @@ -750,15 +750,26 @@ const TSQueryPredicateStep *ts_query_predicates_for_pattern( uint32_t *length ); -bool ts_query_is_pattern_rooted( - const TSQuery *self, - uint32_t pattern_index -); +/* + * Check if the given pattern in the query has a single root node. + */ +bool ts_query_is_pattern_rooted(const TSQuery *self, uint32_t pattern_index); -bool ts_query_is_pattern_guaranteed_at_step( - const TSQuery *self, - uint32_t byte_offset -); +/* + * Check if the given pattern in the query is 'non-local'. + * + * A non-local pattern has multiple root nodes and can match within a + * repeating sequence of nodes, as specified by the grammar. Non-local + * patterns disable certain optimizations that would otherwise be possible + * when executing a query on a specific range of a syntax tree. + */ +bool ts_query_is_pattern_non_local(const TSQuery *self, uint32_t pattern_index); + +/* + * Check if a given pattern is guaranteed to match once a given step is reached. + * The step is specified by its byte offset in the query's source code.
+ */ +bool ts_query_is_pattern_guaranteed_at_step(const TSQuery *self, uint32_t byte_offset); /** * Get the name and length of one of the query's captures, or one of the diff --git a/lib/src/query.c b/lib/src/query.c index b2450ce2..cfe11438 100644 --- a/lib/src/query.c +++ b/lib/src/query.c @@ -146,6 +146,7 @@ typedef struct { Slice steps; Slice predicate_steps; uint32_t start_byte; + bool is_non_local; } QueryPattern; typedef struct { @@ -1455,7 +1456,7 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { if (!pattern->is_rooted) { QueryStep *step = &self->steps.contents[pattern->step_index]; if (step->symbol != WILDCARD_SYMBOL) { - array_push(&non_rooted_pattern_start_steps, pattern->step_index); + array_push(&non_rooted_pattern_start_steps, i); } } } @@ -1868,7 +1869,8 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { // prevent certain optimizations with range restrictions. analysis.did_abort = false; for (uint32_t i = 0; i < non_rooted_pattern_start_steps.size; i++) { - uint16_t step_index = non_rooted_pattern_start_steps.contents[i]; + uint16_t pattern_entry_index = non_rooted_pattern_start_steps.contents[i]; + PatternEntry *pattern_entry = &self->pattern_map.contents[pattern_entry_index]; analysis_state_set__clear(&analysis.states, &analysis.state_pool); analysis_state_set__clear(&analysis.deeper_states, &analysis.state_pool); @@ -1880,7 +1882,7 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { for (uint32_t k = 0; k < subgraph->start_states.size; k++) { TSStateId parse_state = subgraph->start_states.contents[k]; analysis_state_set__push(&analysis.states, &analysis.state_pool, &((AnalysisState) { - .step_index = step_index, + .step_index = pattern_entry->step_index, .stack = { [0] = { .parse_state = parse_state, @@ -1906,6 +1908,10 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { &analysis ); + if 
(analysis.finished_parent_symbols.size > 0) { + self->patterns.contents[pattern_entry->pattern_index].is_non_local = true; + } + for (unsigned k = 0; k < analysis.finished_parent_symbols.size; k++) { TSSymbol symbol = analysis.finished_parent_symbols.contents[k]; array_insert_sorted_by(&self->repeat_symbols_with_rootless_patterns, , symbol); @@ -2697,6 +2703,7 @@ TSQuery *ts_query_new( .steps = (Slice) {.offset = start_step_index}, .predicate_steps = (Slice) {.offset = start_predicate_step_index}, .start_byte = stream_offset(&stream), + .is_non_local = false, })); CaptureQuantifiers capture_quantifiers = capture_quantifiers_new(); *error_type = ts_query__parse_pattern(self, &stream, 0, false, &capture_quantifiers); @@ -2876,6 +2883,17 @@ bool ts_query_is_pattern_rooted( return true; } +bool ts_query_is_pattern_non_local( + const TSQuery *self, + uint32_t pattern_index +) { + if (pattern_index < self->patterns.size) { + return self->patterns.contents[pattern_index].is_non_local; + } else { + return false; + } +} + bool ts_query_is_pattern_guaranteed_at_step( const TSQuery *self, uint32_t byte_offset From 8dcf8517399d83ad7cfd2f046c8ea441827ebde2 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 16 Feb 2023 12:03:51 -0800 Subject: [PATCH 9/9] Add unit test for querying within a range of a long top-level repetition --- cli/src/tests/query_test.rs | 90 ++++++++++++++++++++++++++++++++++++- 1 file changed, 88 insertions(+), 2 deletions(-) diff --git a/cli/src/tests/query_test.rs b/cli/src/tests/query_test.rs index e99fe06e..7d01c26e 100644 --- a/cli/src/tests/query_test.rs +++ b/cli/src/tests/query_test.rs @@ -2219,6 +2219,57 @@ fn test_query_captures_within_byte_range_assigned_after_iterating() { }); } +#[test] +fn test_query_matches_within_range_of_long_repetition() { + allocations::record(|| { + let language = get_language("rust"); + let query = Query::new( + language, + " + (function_item name: (identifier) @fn-name) + ", + ) + .unwrap(); + + let source = " + 
fn zero() {} + fn one() {} + fn two() {} + fn three() {} + fn four() {} + fn five() {} + fn six() {} + fn seven() {} + fn eight() {} + fn nine() {} + fn ten() {} + fn eleven() {} + fn twelve() {} + " + .unindent(); + + let mut parser = Parser::new(); + let mut cursor = QueryCursor::new(); + + parser.set_language(language).unwrap(); + let tree = parser.parse(&source, None).unwrap(); + + let matches = cursor + .set_point_range(Point::new(8, 0)..Point::new(20, 0)) + .matches(&query, tree.root_node(), source.as_bytes()); + assert_eq!( + collect_matches(matches, &query, &source), + &[ + (0, vec![("fn-name", "eight")]), + (0, vec![("fn-name", "nine")]), + (0, vec![("fn-name", "ten")]), + (0, vec![("fn-name", "eleven")]), + (0, vec![("fn-name", "twelve")]), + ] + ); + }); +} + #[test] fn test_query_matches_different_queries_same_cursor() { allocations::record(|| { @@ -4089,6 +4140,7 @@ fn test_query_is_pattern_non_local() { struct Row { description: &'static str, pattern: &'static str, + language: Language, is_non_local: bool, } @@ -4096,26 +4148,61 @@ fn test_query_is_pattern_non_local() { Row { description: "simple token", pattern: r#"(identifier)"#, + language: get_language("python"), is_non_local: false, }, Row { description: "siblings that can occur in an argument list", pattern: r#"((identifier) (identifier))"#, + language: get_language("python"), is_non_local: true, }, Row { description: "siblings that can occur in a statement block", pattern: r#"((return_statement) (return_statement))"#, + language: get_language("python"), is_non_local: true, }, Row { description: "siblings that can occur in a source file", pattern: r#"((function_definition) (class_definition))"#, + language: get_language("python"), is_non_local: true, }, Row { description: "siblings that can't occur in any repetition", pattern: r#"("{" "}")"#, + language: get_language("python"), + is_non_local: false, + }, + Row { + description: "siblings that can't occur in any repetition, wildcard root", + 
pattern: r#"(_ "{" "}") @foo"#, + language: get_language("javascript"), + is_non_local: false, + }, + Row { + description: "siblings that can occur in a class body, wildcard root", + pattern: r#"(_ (method_definition) (method_definition)) @foo"#, + language: get_language("javascript"), + is_non_local: true, + }, + Row { + description: "top-level repetitions that can occur in a class body", + pattern: r#"(method_definition)+ @foo"#, + language: get_language("javascript"), + is_non_local: true, + }, + Row { + description: "top-level repetitions that can occur in a statement block", + pattern: r#"(return_statement)+ @foo"#, + language: get_language("javascript"), + is_non_local: true, + }, + Row { + description: "rooted pattern that can occur in a statement block", + pattern: r#"(return_statement) @foo"#, + language: get_language("javascript"), is_non_local: false, }, ]; @@ -4123,7 +4210,6 @@ fn test_query_is_pattern_non_local() { allocations::record(|| { eprintln!(""); - let language = get_language("python"); for row in &rows { if let Some(filter) = EXAMPLE_FILTER.as_ref() { if !row.description.contains(filter.as_str()) { @@ -4131,7 +4217,7 @@ fn test_query_is_pattern_non_local() { } } eprintln!(" query example: {:?}", row.description); - let query = Query::new(language, row.pattern).unwrap(); + let query = Query::new(row.language, row.pattern).unwrap(); assert_eq!( query.is_pattern_non_local(0), row.is_non_local,