Fix handling of queries with many patterns with leading repetitions

This commit is contained in:
Max Brunsfeld 2020-07-31 12:47:06 -07:00
parent f265e63d48
commit af655547e5
2 changed files with 135 additions and 33 deletions

View file

@ -382,7 +382,7 @@ fn test_query_matches_with_many_overlapping_results() {
)
.unwrap();
let count = 80;
let count = 1024;
// Deeply nested chained function calls:
// a
@ -547,8 +547,8 @@ fn test_query_matches_with_immediate_siblings() {
&[
(0, vec![("parent", "a"), ("child", "b")]),
(0, vec![("parent", "b"), ("child", "c")]),
(1, vec![("last-child", "d")]),
(0, vec![("parent", "c"), ("child", "d")]),
(1, vec![("last-child", "d")]),
(2, vec![("first-element", "w")]),
(2, vec![("first-element", "1")]),
],
@ -732,6 +732,55 @@ fn test_query_matches_with_nested_repetitions() {
});
}
#[test]
fn test_query_matches_with_multiple_repetition_patterns_that_intersect_other_pattern() {
allocations::record(|| {
let language = get_language("javascript");
// When this query sees a comment, it must keep track of several potential
// matches: up to two for each pattern that begins with a comment.
let query = Query::new(
language,
r#"
(call_expression
function: (member_expression
property: (property_identifier) @name)) @ref.method
((comment)* @doc (function_declaration))
((comment)* @doc (generator_function_declaration))
((comment)* @doc (class_declaration))
((comment)* @doc (lexical_declaration))
((comment)* @doc (variable_declaration))
((comment)* @doc (method_definition))
(comment) @comment
"#,
)
.unwrap();
// Here, a series of comments occurs in the middle of a match of the first
// pattern. To avoid exceeding the storage limits and discarding that outer
// match, the comment-related matches need to be managed efficiently.
let source = format!(
"theObject\n{}\n.theMethod()",
" // the comment\n".repeat(64)
);
assert_query_matches(
language,
&query,
&source,
&vec![(7, vec![("comment", "// the comment")]); 64]
.into_iter()
.chain(vec![(
0,
vec![("ref.method", source.as_str()), ("name", "theMethod")],
)])
.collect::<Vec<_>>(),
);
});
}
#[test]
fn test_query_matches_with_leading_zero_or_more_repeated_leaf_nodes() {
allocations::record(|| {

View file

@ -48,7 +48,6 @@ typedef struct {
uint16_t alternative_index;
uint16_t depth;
bool contains_captures: 1;
bool is_pattern_start: 1;
bool is_immediate: 1;
bool is_last_child: 1;
bool is_pass_through: 1;
@ -449,7 +448,6 @@ static QueryStep query_step__new(
.alternative_index = NONE,
.contains_captures = false,
.is_last_child = false,
.is_pattern_start = false,
.is_pass_through = false,
.is_dead_end = false,
.is_immediate = is_immediate,
@ -547,6 +545,23 @@ static inline void ts_query__pattern_map_insert(
) {
uint32_t index;
ts_query__pattern_map_search(self, symbol, &index);
// Ensure that the entries are sorted not only by symbol, but also
// by pattern_index. This way, states for earlier patterns will be
// initiated first, which allows the ordering of the states array
// to be maintained more efficiently.
while (index < self->pattern_map.size) {
PatternEntry *entry = &self->pattern_map.contents[index];
if (
self->steps.contents[entry->step_index].symbol == symbol &&
entry->pattern_index < pattern_index
) {
index++;
} else {
break;
}
}
array_insert(&self->pattern_map, index, ((PatternEntry) {
.step_index = start_step_index,
.pattern_index = pattern_index,
@ -1168,7 +1183,6 @@ TSQuery *ts_query_new(
// Maintain a map that can look up patterns for a given root symbol.
for (;;) {
QueryStep *step = &self->steps.contents[start_step_index];
step->is_pattern_start = true;
ts_query__pattern_map_insert(self, step->symbol, start_step_index, pattern_index);
if (step->symbol == WILDCARD_SYMBOL) {
self->wildcard_root_pattern_count++;
@ -1178,6 +1192,7 @@ TSQuery *ts_query_new(
// then add multiple entries to the pattern map.
if (step->alternative_index != NONE) {
start_step_index = step->alternative_index;
step->alternative_index = NONE;
} else {
break;
}
@ -1460,27 +1475,62 @@ void ts_query_cursor__compare_captures(
}
}
static bool ts_query_cursor__add_state(
static void ts_query_cursor__add_state(
TSQueryCursor *self,
const PatternEntry *pattern
) {
QueryStep *step = &self->query->steps.contents[pattern->step_index];
uint32_t start_depth = self->depth - step->depth;
// Keep the states array in ascending order of start_depth and pattern_index,
// so that it can be processed more efficiently elsewhere. Usually, there is
// no work to do here because of two facts:
// * States with lower start_depth are naturally added first due to the
// order in which nodes are visited.
// * Earlier patterns are naturally added first because of the ordering of the
// pattern_map data structure that's used to initiate matches.
//
// This loop is only needed in cases where two conditions hold:
// * A pattern consists of more than one sibling node, so that its states
// remain in progress after exiting the node that started the match.
// * The first node in the pattern matches against multiple nodes at the
// same depth.
//
// An example of this is the pattern '((comment)* (function))'. If multiple
// `comment` nodes appear in a row, then we may initiate a new state for this
// pattern while another state for the same pattern is already in progress.
// If there are multiple patterns like this in a query, then this loop will
// need to execute in order to keep the states ordered by pattern_index.
uint32_t index = self->states.size;
while (index > 0) {
QueryState *prev_state = &self->states.contents[index - 1];
if (prev_state->start_depth < start_depth) break;
if (prev_state->start_depth == start_depth) {
if (prev_state->pattern_index < pattern->pattern_index) break;
if (prev_state->pattern_index == pattern->pattern_index) {
// Avoid unnecessarily inserting an unnecessary duplicate state,
// which would be immediately pruned by the longest-match criteria.
if (prev_state->step_index == pattern->step_index) return;
}
}
index--;
}
LOG(
" start state. pattern:%u, step:%u\n",
pattern->pattern_index,
pattern->step_index
);
QueryStep *step = &self->query->steps.contents[pattern->step_index];
array_push(&self->states, ((QueryState) {
array_insert(&self->states, index, ((QueryState) {
.capture_list_id = NONE,
.step_index = pattern->step_index,
.pattern_index = pattern->pattern_index,
.start_depth = self->depth - step->depth,
.start_depth = start_depth,
.consumed_capture_count = 0,
.seeking_immediate_match = false,
.seeking_immediate_match = true,
.has_in_progress_alternatives = false,
.dead = false,
}));
return true;
}
// Acquire a capture list for this state. If there are no capture lists left in the
@ -1682,7 +1732,7 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) {
// If this node matches the first step of the pattern, then add a new
// state at the start of this pattern.
if (step->field && field_id != step->field) continue;
if (!ts_query_cursor__add_state(self, pattern)) break;
ts_query_cursor__add_state(self, pattern);
}
// Add new states for any patterns whose root node matches this node.
@ -1694,7 +1744,7 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) {
// If this node matches the first step of the pattern, then add a new
// state at the start of this pattern.
if (step->field && field_id != step->field) continue;
if (!ts_query_cursor__add_state(self, pattern)) break;
ts_query_cursor__add_state(self, pattern);
// Advance to the next pattern whose root node matches this node.
i++;
@ -1762,11 +1812,7 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) {
// parent, then this query state cannot simply be updated in place. It must be
// split into two states: one that matches this node, and one which skips over
// this node, to preserve the possibility of matching later siblings.
if (
later_sibling_can_match &&
!step->is_pattern_start &&
step->contains_captures
) {
if (later_sibling_can_match && step->contains_captures) {
if (ts_query_cursor__copy_state(self, &state)) {
LOG(
" split state for capture. pattern:%u, step:%u\n",
@ -1822,25 +1868,27 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) {
continue;
}
QueryState *copy = ts_query_cursor__copy_state(self, &state);
if (next_step->is_pass_through) {
state->step_index++;
j--;
}
QueryState *copy = ts_query_cursor__copy_state(self, &state);
if (copy) {
copy_count++;
LOG(
" split state for branch. pattern:%u, from_step:%u, to_step:%u, immediate:%d, capture_count: %u\n",
copy->pattern_index,
copy->step_index,
next_step->alternative_index,
next_step->alternative_is_immediate,
capture_list_pool_get(&self->capture_list_pool, copy->capture_list_id)->size
);
end_index++;
copy_count++;
copy->step_index = next_step->alternative_index;
if (next_step->alternative_is_immediate) {
copy->seeking_immediate_match = true;
}
LOG(
" split state for branch. pattern:%u, step:%u, step:%u, immediate:%d\n",
copy->pattern_index,
state->step_index,
copy->step_index,
copy->seeking_immediate_match
);
}
}
}
@ -1860,13 +1908,11 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) {
bool did_remove = false;
for (unsigned j = i + 1; j < self->states.size; j++) {
QueryState *other_state = &self->states.contents[j];
if (other_state->dead) {
array_erase(&self->states, j);
j--;
continue;
}
// When query states are copied in order
// Query states are kept in ascending order of start_depth and pattern_index.
// Since the longest-match criteria is only used for deduping matches of the same
// pattern and root node, we only need to perform pairwise comparisons within a
// small slice of the states array.
if (
other_state->start_depth != state->start_depth ||
other_state->pattern_index != state->pattern_index
@ -1914,6 +1960,13 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) {
// If there the state is at the end of its pattern, remove it from the list
// of in-progress states and add it to the list of finished states.
if (!did_remove) {
LOG(
" keep state. pattern: %u, start_depth: %u, step_index: %u, capture_count: %u\n",
state->pattern_index,
state->start_depth,
state->step_index,
capture_list_pool_get(&self->capture_list_pool, state->capture_list_id)->size
);
QueryStep *next_step = &self->query->steps.contents[state->step_index];
if (next_step->depth == PATTERN_DONE_MARKER) {
if (state->has_in_progress_alternatives) {