From 186b08381c9fa31128e3228c29bf4f1dfe1ee557 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 18 Sep 2019 11:37:49 -0700 Subject: [PATCH] Terminate failed query matches before descending whenever possible When iterating over captures, this prevents reasonable queries from forcing the tree cursor to buffer matches unnecessarily. --- cli/src/tests/query_test.rs | 61 ++++++++++++++++++ lib/src/query.c | 123 +++++++++++++++++++++--------------- lib/src/tree_cursor.c | 113 ++++++++++++++++++++------------- lib/src/tree_cursor.h | 2 +- 4 files changed, 203 insertions(+), 96 deletions(-) diff --git a/cli/src/tests/query_test.rs b/cli/src/tests/query_test.rs index 6074d0de..c64405e8 100644 --- a/cli/src/tests/query_test.rs +++ b/cli/src/tests/query_test.rs @@ -1,6 +1,7 @@ use super::helpers::allocations; use super::helpers::fixtures::get_language; use tree_sitter::{Node, Parser, Query, QueryCursor, QueryError, QueryMatch}; +use std::fmt::Write; #[test] fn test_query_errors_on_invalid_syntax() { @@ -632,6 +633,66 @@ fn test_query_captures_with_duplicates() { }); } +#[test] +fn test_query_captures_with_many_results() { + allocations::record(|| { + let language = get_language("javascript"); + + // Search for key-value pairs whose values are anonymous functions. + let query = Query::new( + language, + r#" + (pair + key: * @method-def + value: (arrow_function)) + + ":" @colon + "," @comma + "#, + ) + .unwrap(); + + // The `pair` node for key `y` does not match any pattern, but inside of + // its value, it contains many other `pair` nodes that do match the pattern. + // The match for the *outer* pair should be terminated *before* descending into + // the object value, so that we can avoid needing to buffer all of the inner + // matches. 
+ let method_count = 50; + let mut source = "x = { y: {\n".to_owned(); + for i in 0..method_count { + writeln!(&mut source, " method{}: $ => null,", i).unwrap(); + } + source.push_str("}};\n"); + + let mut parser = Parser::new(); + parser.set_language(language).unwrap(); + let tree = parser.parse(&source, None).unwrap(); + let mut cursor = QueryCursor::new(); + + let captures = cursor.captures(&query, tree.root_node(), to_callback(&source)); + let captures = collect_captures(captures, &query, &source); + + assert_eq!(&captures[0..13], &[ + ("colon", ":"), + ("method-def", "method0"), + ("colon", ":"), + ("comma", ","), + ("method-def", "method1"), + ("colon", ":"), + ("comma", ","), + ("method-def", "method2"), + ("colon", ":"), + ("comma", ","), + ("method-def", "method3"), + ("colon", ":"), + ("comma", ","), + ]); + + // Ensure that we don't drop matches because of needing to buffer too many. + assert_eq!(captures.len(), 1 + 3 * method_count); + }); +} + #[test] fn test_query_pattern_after_source_byte() { let language = get_language("javascript"); diff --git a/lib/src/query.c b/lib/src/query.c index a3e69d83..da8c46be 100644 --- a/lib/src/query.c +++ b/lib/src/query.c @@ -138,11 +138,8 @@ static const uint16_t NONE = UINT16_MAX; static const TSSymbol WILDCARD_SYMBOL = 0; static const uint16_t MAX_STATE_COUNT = 32; -#ifdef DEBUG_QUERY -#define LOG printf -#else +// #define LOG printf #define LOG(...) -#endif /********** * Stream @@ -939,6 +936,8 @@ static QueryState *ts_query_cursor_copy_state( static inline bool ts_query_cursor__advance(TSQueryCursor *self) { do { if (self->ascending) { + LOG("leave node %s\n", ts_node_type(ts_tree_cursor_current_node(&self->cursor))); + // When leaving a node, remove any unfinished states whose next step // needed to match something within that node. 
uint32_t deleted_count = 0; @@ -948,7 +947,7 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { if (state->start_depth + step->depth > self->depth) { LOG( - "fail state with pattern: %u, step: %u\n", + " failed to match. pattern:%u, step:%u\n", state->pattern_index, state->step_index ); @@ -963,10 +962,7 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { } } - if (deleted_count) { - LOG("failed %u of %u states\n", deleted_count, self->states.size); - self->states.size -= deleted_count; - } + self->states.size -= deleted_count; if (ts_tree_cursor_goto_next_sibling(&self->cursor)) { self->ascending = false; @@ -976,8 +972,13 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { return false; } } else { - TSFieldId field_id = NONE; - bool field_occurs_in_later_sibling = false; + bool can_have_later_siblings; + bool can_have_later_siblings_with_this_field; + TSFieldId field_id = ts_tree_cursor_current_status( + &self->cursor, + &can_have_later_siblings, + &can_have_later_siblings_with_this_field + ); TSNode node = ts_tree_cursor_current_node(&self->cursor); TSSymbol symbol = ts_node_symbol(node); @@ -999,27 +1000,22 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { point_lte(self->end_point, ts_node_start_point(node)) ) return false; - LOG("enter node %s\n", ts_node_type(node)); + LOG( + "enter node %s. row:%u state_count:%u, finished_state_count: %u\n", + ts_node_type(node), + ts_node_start_point(node).row, + self->states.size, + self->finished_states.size + ); // Add new states for any patterns whose root node is a wildcard. for (unsigned i = 0; i < self->query->wildcard_root_pattern_count; i++) { PatternEntry *slice = &self->query->pattern_map.contents[i]; QueryStep *step = &self->query->steps.contents[slice->step_index]; - if (step->field) { - // Compute the current field id if it is needed and has not yet - // been computed. 
- if (field_id == NONE) { - field_id = ts_tree_cursor_current_field_id_ext( - &self->cursor, - &field_occurs_in_later_sibling - ); - } - if (field_id != step->field) continue; - } - // If this node matches the first step of the pattern, then add a new // state at the start of this pattern. + if (step->field && field_id != step->field) continue; uint32_t capture_list_id = capture_list_pool_acquire( &self->capture_list_pool ); @@ -1039,19 +1035,9 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { PatternEntry *slice = &self->query->pattern_map.contents[i]; QueryStep *step = &self->query->steps.contents[slice->step_index]; do { - if (step->field) { - // Compute the current field id if it is needed and has not yet - // been computed. - if (field_id == NONE) { - field_id = ts_tree_cursor_current_field_id_ext( - &self->cursor, - &field_occurs_in_later_sibling - ); - } - if (field_id != step->field) continue; - } + if (step->field && field_id != step->field) continue; - LOG("start pattern %u\n", slice->pattern_index); + LOG(" start state. pattern:%u\n", slice->pattern_index); // If this node matches the first step of the pattern, then add a // new in-progress state. First, acquire a list to hold the pattern's @@ -1059,7 +1045,10 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { uint32_t capture_list_id = capture_list_pool_acquire( &self->capture_list_pool ); - if (capture_list_id == NONE) break; + if (capture_list_id == NONE) { + LOG(" too many states."); + break; + } array_push(&self->states, ((QueryState) { .pattern_index = slice->pattern_index, @@ -1084,19 +1073,40 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { QueryStep *step = &self->query->steps.contents[state->step_index]; // Check that the node matches all of the criteria for the next - // step of the pattern. 
+ // step of the pattern. if (state->start_depth + step->depth != self->depth) continue; - if (step->symbol && step->symbol != symbol) continue; + + // Determine if this node matches this step of the pattern, and also + // if this node can have later siblings that match this step of the + // pattern. + bool node_does_match = !step->symbol || step->symbol == symbol; + bool later_sibling_can_match = can_have_later_siblings; if (step->field) { - // Compute the current field id if it is needed and has not yet - // been computed. - if (field_id == NONE) { - field_id = ts_tree_cursor_current_field_id_ext( - &self->cursor, - &field_occurs_in_later_sibling - ); + if (step->field == field_id) { + if (!node_does_match && !can_have_later_siblings_with_this_field) { + later_sibling_can_match = false; + } + } else { + node_does_match = false; } - if (field_id != step->field) continue; + } + + if (!node_does_match) { + if (!later_sibling_can_match) { + LOG( + " discard state. pattern:%u, step:%u\n", + state->pattern_index, + state->step_index + ); + capture_list_pool_release( + &self->capture_list_pool, + state->capture_list_id + ); + array_erase(&self->states, i); + i--; + n--; + } + continue; } // Some patterns can match their root node in multiple ways, @@ -1107,17 +1117,30 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { // this node, to preserve the possibility of capturing later // siblings. QueryState *next_state = state; - if (step->depth > 0 && (!step->field || field_occurs_in_later_sibling)) { + if (step->depth > 0 && later_sibling_can_match) { + LOG( + " split state. pattern:%u, step:%u\n", + state->pattern_index, + state->step_index + ); QueryState *copy = ts_query_cursor_copy_state(self, state); if (copy) next_state = copy; } - LOG("advance state for pattern %u\n", next_state->pattern_index); + LOG( + " advance state. 
pattern:%u, step:%u\n", + next_state->pattern_index, + next_state->step_index + ); // If the current node is captured in this pattern, add it to the // capture list. if (step->capture_id != NONE) { - LOG("capture id %u\n", step->capture_id); + LOG( + " capture node. pattern:%u, capture_id:%u\n", + next_state->pattern_index, + step->capture_id + ); TSQueryCapture *capture_list = capture_list_pool_get( &self->capture_list_pool, next_state->capture_list_id diff --git a/lib/src/tree_cursor.c b/lib/src/tree_cursor.c index 2ba3f947..265b460b 100644 --- a/lib/src/tree_cursor.c +++ b/lib/src/tree_cursor.c @@ -244,13 +244,74 @@ TSNode ts_tree_cursor_current_node(const TSTreeCursor *_self) { ); } -static inline TSFieldId ts_tree_cursor__current_field_info( +TSFieldId ts_tree_cursor_current_status( const TSTreeCursor *_self, - const TSFieldMapEntry **field_map, - const TSFieldMapEntry **field_map_end, - uint32_t *child_index + bool *can_have_later_siblings, + bool *can_have_later_siblings_with_this_field ) { const TreeCursor *self = (const TreeCursor *)_self; + TSFieldId result = 0; + *can_have_later_siblings = false; + *can_have_later_siblings_with_this_field = false; + + // Walk up the tree, visiting the current node and its invisible ancestors, + // because fields can refer to nodes through invisible *wrapper* nodes. + for (unsigned i = self->stack.size - 1; i > 0; i--) { + TreeCursorEntry *entry = &self->stack.contents[i]; + TreeCursorEntry *parent_entry = &self->stack.contents[i - 1]; + + // Stop walking up when a visible ancestor is found. 
+ if (i != self->stack.size - 1) { + if (ts_subtree_visible(*entry->subtree)) break; + const TSSymbol *alias_sequence = ts_language_alias_sequence( + self->tree->language, + parent_entry->subtree->ptr->production_id + ); + if (alias_sequence && alias_sequence[entry->structural_child_index]) { + break; + } + } + + if (ts_subtree_child_count(*parent_entry->subtree) > entry->child_index) { + *can_have_later_siblings = true; + } + + if (ts_subtree_extra(*entry->subtree)) break; + + const TSFieldMapEntry *field_map, *field_map_end; + ts_language_field_map( + self->tree->language, + parent_entry->subtree->ptr->production_id, + &field_map, &field_map_end + ); + + // Look for a field name associated with the current node. + if (!result) { + for (const TSFieldMapEntry *i = field_map; i < field_map_end; i++) { + if (!i->inherited && i->child_index == entry->structural_child_index) { + result = i->field_id; + *can_have_later_siblings_with_this_field = false; + break; + } + } + } + + // Determine if there are other later siblings with the same field name. + if (result) { + for (const TSFieldMapEntry *i = field_map; i < field_map_end; i++) { + if (i->field_id == result && i->child_index > entry->structural_child_index) { + *can_have_later_siblings_with_this_field = true; + break; + } + } + } + } + + return result; +} + +TSFieldId ts_tree_cursor_current_field_id(const TSTreeCursor *_self) { + const TreeCursor *self = (const TreeCursor *)_self; // Walk up the tree, visiting the current node and its invisible ancestors. 
for (unsigned i = self->stack.size - 1; i > 0; i--) { @@ -271,14 +332,14 @@ static inline TSFieldId ts_tree_cursor__current_field_info( if (ts_subtree_extra(*entry->subtree)) break; + const TSFieldMapEntry *field_map, *field_map_end; ts_language_field_map( self->tree->language, parent_entry->subtree->ptr->production_id, - field_map, field_map_end + &field_map, &field_map_end ); - for (const TSFieldMapEntry *i = *field_map; i < *field_map_end; i++) { + for (const TSFieldMapEntry *i = field_map; i < field_map_end; i++) { if (!i->inherited && i->child_index == entry->structural_child_index) { - *child_index = entry->structural_child_index; return i->field_id; } } @@ -286,44 +347,6 @@ static inline TSFieldId ts_tree_cursor__current_field_info( return 0; } -TSFieldId ts_tree_cursor_current_field_id_ext( - const TSTreeCursor *self, - bool *field_has_additional -) { - uint32_t child_index; - const TSFieldMapEntry *field_map, *field_map_end; - TSFieldId field_id = ts_tree_cursor__current_field_info( - self, - &field_map, - &field_map_end, - &child_index - ); - - // After finding the field, check if any other later children have - // the same field name. 
- if (field_id) { - for (const TSFieldMapEntry *i = field_map; i < field_map_end; i++) { - if (i->field_id == field_id && i->child_index > child_index) { - *field_has_additional = true; - } - } - } - - return field_id; -} - - -TSFieldId ts_tree_cursor_current_field_id(const TSTreeCursor *self) { - uint32_t child_index; - const TSFieldMapEntry *field_map, *field_map_end; - return ts_tree_cursor__current_field_info( - self, - &field_map, - &field_map_end, - &child_index - ); -} - const char *ts_tree_cursor_current_field_name(const TSTreeCursor *_self) { TSFieldId id = ts_tree_cursor_current_field_id(_self); if (id) { diff --git a/lib/src/tree_cursor.h b/lib/src/tree_cursor.h index 9b438843..5a39dd27 100644 --- a/lib/src/tree_cursor.h +++ b/lib/src/tree_cursor.h @@ -16,6 +16,6 @@ typedef struct { } TreeCursor; void ts_tree_cursor_init(TreeCursor *, TSNode); -TSFieldId ts_tree_cursor_current_field_id_ext(const TSTreeCursor *, bool *); +TSFieldId ts_tree_cursor_current_status(const TSTreeCursor *, bool *, bool *); #endif // TREE_SITTER_TREE_CURSOR_H_