Merge pull request #965 from tree-sitter/query-fixes

Fix problems found in investigating dropped query matches
This commit is contained in:
Max Brunsfeld 2021-03-05 15:26:58 -08:00 committed by GitHub
commit 5a964b1b70
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 210 additions and 31 deletions

View file

@ -100,6 +100,7 @@ impl<'a> ParseTableBuilder<'a> {
variable_index: extra_non_terminal.index as u32,
production,
step_index: 1,
has_preceding_inherited_fields: false,
},
&[Symbol::end_of_nonterminal_extra()]
.iter()
@ -197,16 +198,33 @@ impl<'a> ParseTableBuilder<'a> {
// next symbol. Advance the item to its next step and insert the resulting
// item into the successor item set.
if let Some(next_symbol) = item.symbol() {
let successor = item.successor();
let mut successor = item.successor();
if next_symbol.is_non_terminal() {
let variable = &self.syntax_grammar.variables[next_symbol.index];
// Keep track of where auxiliary non-terminals (repeat symbols) are
// used within visible symbols. This information may be needed later
// for conflict resolution.
if self.syntax_grammar.variables[next_symbol.index].is_auxiliary() {
if variable.is_auxiliary() {
preceding_auxiliary_symbols
.push(self.get_auxiliary_node_info(&item_set, next_symbol));
}
// For most parse items, the symbols associated with the preceding children
// don't matter: they have no effect on the REDUCE action that would be
// performed at the end of the item. But the symbols *do* matter for
// children that are hidden and have fields, because those fields are
// "inherited" by the parent node.
//
// If this item has consumed a hidden child with fields, then the symbols
// of its preceding children need to be taken into account when comparing
// it with other items.
if variable.is_hidden()
&& !self.variable_info[next_symbol.index].fields.is_empty()
{
successor.has_preceding_inherited_fields = true;
}
non_terminal_successors
.entry(next_symbol)
.or_insert_with(|| ParseItemSet::default())

View file

@ -22,18 +22,42 @@ lazy_static! {
};
}
/// A ParseItem represents an in-progress match of a single production in a grammar.
// `Copy` is cheap here: an item is just two indices, a flag, and a shared
// reference to its production.
#[derive(Clone, Copy, Debug)]
pub(crate) struct ParseItem<'a> {
    /// The index of the parent rule within the grammar.
    pub variable_index: u32,
    /// The number of symbols that have already been matched.
    pub step_index: u32,
    /// The production being matched.
    pub production: &'a Production,
    /// A boolean indicating whether any of the already-matched children were
    /// hidden nodes and had fields. Ordinarily, a parse item's behavior is not
    /// affected by the symbols of its preceding children; it only needs to
    /// keep track of their fields and aliases.
    ///
    /// Take for example these two items:
    ///   X -> a b • c
    ///   X -> a g • c
    ///
    /// They can be considered equivalent, for the purposes of parse table
    /// generation, because they entail the same actions. But if this flag is
    /// true, then the item's set of inherited fields may depend on the specific
    /// symbols of its preceding children, so those symbols must participate in
    /// item comparison and hashing.
    pub has_preceding_inherited_fields: bool,
}
/// A ParseItemSet represents a set of in-progress matches of productions in a
/// grammar, and for each in-progress match, a set of "lookaheads" - tokens that
/// are allowed to *follow* the in-progress rule. This object corresponds directly
/// to a state in the final parse table.
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) struct ParseItemSet<'a> {
    /// Each entry pairs one in-progress item with the set of tokens that may
    /// follow it.
    pub entries: Vec<(ParseItem<'a>, TokenSet)>,
}
/// A ParseItemSetCore is like a ParseItemSet, but without the lookahead
/// information. Parse states with the same core are candidates for merging.
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) struct ParseItemSetCore<'a> {
pub entries: Vec<ParseItem<'a>>,
@ -64,6 +88,7 @@ impl<'a> ParseItem<'a> {
variable_index: u32::MAX,
production: &START_PRODUCTION,
step_index: 0,
has_preceding_inherited_fields: false,
}
}
@ -100,13 +125,23 @@ impl<'a> ParseItem<'a> {
self.variable_index == u32::MAX
}
/// Create an item like this one, but advanced by one step.
///
/// Every field other than the step index carries over unchanged, including
/// the `has_preceding_inherited_fields` flag.
pub fn successor(&self) -> ParseItem<'a> {
    // `ParseItem` is `Copy`, so struct-update syntax copies all remaining
    // fields from `*self` while overriding only the step index.
    ParseItem {
        step_index: self.step_index + 1,
        ..*self
    }
}
/// Create an item identical to this one, but with a different production.
/// This is used when dynamically "inlining" certain symbols in a production.
pub fn substitute_production(&self, production: &'a Production) -> ParseItem<'a> {
let mut result = self.clone();
result.production = production;
result
}
}
impl<'a> ParseItemSet<'a> {
@ -258,19 +293,18 @@ impl<'a> Hash for ParseItem<'a> {
self.precedence().hash(hasher);
self.associativity().hash(hasher);
// When comparing two parse items, the symbols that were already consumed by
// both items don't matter. Take for example these two items:
// X -> a b • c
// X -> a d • c
// These two items can be considered equivalent, for the purposes of parse
// table generation, because they entail the same actions. However, if the
// productions have different aliases or field names, they *cannot* be
// treated as equivalent, because those details are ultimately stored as
// attributes of the `REDUCE` action that will be performed when the item
// is finished.
// The already-matched children don't play any role in the parse state for
// this item, unless any of the following are true:
// * the children have fields
// * the children have aliases
// * the children are hidden and have fields
// See the docs for `has_preceding_inherited_fields`.
for step in &self.production.steps[0..self.step_index as usize] {
step.alias.hash(hasher);
step.field_name.hash(hasher);
if self.has_preceding_inherited_fields {
step.symbol.hash(hasher);
}
}
for step in &self.production.steps[self.step_index as usize..] {
step.hash(hasher);
@ -286,6 +320,7 @@ impl<'a> PartialEq for ParseItem<'a> {
|| self.production.steps.len() != other.production.steps.len()
|| self.precedence() != other.precedence()
|| self.associativity() != other.associativity()
|| self.has_preceding_inherited_fields != other.has_preceding_inherited_fields
{
return false;
}
@ -300,6 +335,11 @@ impl<'a> PartialEq for ParseItem<'a> {
if step.field_name != other.production.steps[i].field_name {
return false;
}
if self.has_preceding_inherited_fields
&& step.symbol != other.production.steps[i].symbol
{
return false;
}
} else if *step != other.production.steps[i] {
return false;
}

View file

@ -204,6 +204,7 @@ impl<'a> ParseItemSetBuilder<'a> {
variable_index,
production,
step_index: 0,
has_preceding_inherited_fields: false,
};
if let Some(inlined_productions) =
@ -213,11 +214,7 @@ impl<'a> ParseItemSetBuilder<'a> {
find_or_push(
additions_for_non_terminal,
TransitiveClosureAddition {
item: ParseItem {
variable_index,
production,
step_index: item.step_index,
},
item: item.substitute_production(production),
info: follow_set_info.clone(),
},
);
@ -248,11 +245,7 @@ impl<'a> ParseItemSetBuilder<'a> {
for production in productions {
self.add_item(
&mut result,
ParseItem {
variable_index: item.variable_index,
production,
step_index: item.step_index,
},
item.substitute_production(production),
lookaheads,
);
}

View file

@ -238,6 +238,10 @@ impl SyntaxVariable {
/// Whether this variable is an auto-generated auxiliary rule
/// (e.g. a repetition helper).
pub fn is_auxiliary(&self) -> bool {
    matches!(self.kind, VariableType::Auxiliary)
}
/// Whether this variable's nodes are hidden in the syntax tree.
/// Auxiliary variables are always hidden.
pub fn is_hidden(&self) -> bool {
    matches!(self.kind, VariableType::Hidden | VariableType::Auxiliary)
}
}
impl InlinedProductionMap {

View file

@ -1425,6 +1425,7 @@ fn test_query_matches_with_too_many_permutations_to_track() {
collect_matches(matches, &query, source.as_str())[0],
(0, vec![("pre", "hello"), ("post", "hello")]),
);
assert_eq!(cursor.did_exceed_match_limit(), true);
});
}
@ -1462,6 +1463,7 @@ fn test_query_matches_with_alternatives_and_too_many_permutations_to_track() {
collect_matches(matches, &query, source.as_str()),
vec![(1, vec![("method", "b")]); 50],
);
assert_eq!(cursor.did_exceed_match_limit(), true);
});
}
@ -1850,7 +1852,7 @@ fn test_query_matches_with_repeated_fields() {
"
struct S {
int a, b, c;
}
};
",
&[
(0, vec![("field", "a")]),
@ -1861,6 +1863,94 @@ fn test_query_matches_with_repeated_fields() {
});
}
#[test]
fn test_query_matches_with_deeply_nested_patterns_with_fields() {
    allocations::record(|| {
        let language = get_language("python");
        // Four patterns over `call` nodes. Only the first two match the
        // source below; the last two match nothing, but they force the query
        // engine to keep additional in-progress states and captures alive.
        let query = Query::new(
            language,
            "
(call
function: (_) @func
arguments: (_) @args)
(call
function: (attribute
object: (_) @receiver
attribute: (identifier) @method)
arguments: (argument_list))
; These don't match anything, but they require additional
; states to keep track of their captures.
(call
function: (_) @fn
arguments: (argument_list
(keyword_argument
name: (identifier) @name
value: (_) @val) @arg) @args) @call
(call
function: (identifier) @fn
(#eq? @fn \"super\")) @super_call
",
        )
        .unwrap();
        // A long chain of nested method calls: every call yields a match for
        // pattern 0, and every attribute-style call (`x.y(...)`) also yields
        // a match for pattern 1, with the receiver growing at each step.
        assert_query_matches(
            language,
            &query,
            "
a(1).b(2).c(3).d(4).e(5).f(6).g(7).h(8)
",
            &[
                (0, vec![("func", "a"), ("args", "(1)")]),
                (0, vec![("func", "a(1).b"), ("args", "(2)")]),
                (1, vec![("receiver", "a(1)"), ("method", "b")]),
                (0, vec![("func", "a(1).b(2).c"), ("args", "(3)")]),
                (1, vec![("receiver", "a(1).b(2)"), ("method", "c")]),
                (0, vec![("func", "a(1).b(2).c(3).d"), ("args", "(4)")]),
                (1, vec![("receiver", "a(1).b(2).c(3)"), ("method", "d")]),
                (0, vec![("func", "a(1).b(2).c(3).d(4).e"), ("args", "(5)")]),
                (
                    1,
                    vec![("receiver", "a(1).b(2).c(3).d(4)"), ("method", "e")],
                ),
                (
                    0,
                    vec![("func", "a(1).b(2).c(3).d(4).e(5).f"), ("args", "(6)")],
                ),
                (
                    1,
                    vec![("receiver", "a(1).b(2).c(3).d(4).e(5)"), ("method", "f")],
                ),
                (
                    0,
                    vec![("func", "a(1).b(2).c(3).d(4).e(5).f(6).g"), ("args", "(7)")],
                ),
                (
                    1,
                    vec![
                        ("receiver", "a(1).b(2).c(3).d(4).e(5).f(6)"),
                        ("method", "g"),
                    ],
                ),
                (
                    0,
                    vec![
                        ("func", "a(1).b(2).c(3).d(4).e(5).f(6).g(7).h"),
                        ("args", "(8)"),
                    ],
                ),
                (
                    1,
                    vec![
                        ("receiver", "a(1).b(2).c(3).d(4).e(5).f(6).g(7)"),
                        ("method", "h"),
                    ],
                ),
            ],
        );
    });
}
#[test]
fn test_query_matches_with_indefinite_step_containing_no_captures() {
allocations::record(|| {
@ -3031,6 +3121,7 @@ fn assert_query_matches(
let mut cursor = QueryCursor::new();
let matches = cursor.matches(&query, tree.root_node(), to_callback(source));
assert_eq!(collect_matches(matches, &query, source), expected);
assert_eq!(cursor.did_exceed_match_limit(), false);
}
fn collect_matches<'a>(

View file

@ -720,6 +720,16 @@ extern "C" {
#[doc = " Start running a given query on a given node."]
pub fn ts_query_cursor_exec(arg1: *mut TSQueryCursor, arg2: *const TSQuery, arg3: TSNode);
}
// NOTE(review): FFI binding (presumably bindgen-generated) — keep the
// signature and doc text in sync with the declaration in the C header.
extern "C" {
    #[doc = " Check if this cursor has exceeded its maximum number of in-progress"]
    #[doc = " matches."]
    #[doc = ""]
    #[doc = " Currently, query cursors have a fixed capacity for storing lists"]
    #[doc = " of in-progress captures. If this capacity is exceeded, then the"]
    #[doc = " earliest-starting match will silently be dropped to make room for"]
    #[doc = " further matches."]
    pub fn ts_query_cursor_did_exceed_match_limit(arg1: *const TSQueryCursor) -> bool;
}
extern "C" {
#[doc = " Set the range of bytes or (row, column) positions in which the query"]
#[doc = " will be executed."]

View file

@ -1595,6 +1595,12 @@ impl QueryCursor {
QueryCursor(unsafe { NonNull::new_unchecked(ffi::ts_query_cursor_new()) })
}
/// Check if, on its last execution, this cursor exceeded its maximum number of
/// in-progress matches.
///
/// The underlying flag is cleared each time query execution begins, so this
/// reports only on the most recent execution.
pub fn did_exceed_match_limit(&self) -> bool {
    // SAFETY: `self.0` is a `NonNull` pointer obtained from
    // `ts_query_cursor_new`, so it is valid for the lifetime of `self`.
    unsafe { ffi::ts_query_cursor_did_exceed_match_limit(self.0.as_ptr()) }
}
/// Iterate over all of the matches in the order that they were found.
///
/// Each match contains the index of the pattern that matched, and a list of captures.

View file

@ -791,6 +791,17 @@ void ts_query_cursor_delete(TSQueryCursor *);
*/
void ts_query_cursor_exec(TSQueryCursor *, const TSQuery *, TSNode);
/**
 * Check if this cursor has exceeded its maximum number of in-progress
 * matches.
 *
 * Currently, query cursors have a fixed capacity for storing lists
 * of in-progress captures. If this capacity is exceeded, then the
 * earliest-starting match will silently be dropped to make room for
 * further matches.
 *
 * The flag reported here is reset each time `ts_query_cursor_exec` is
 * called, so it describes only the most recent execution.
 */
bool ts_query_cursor_did_exceed_match_limit(const TSQueryCursor *);
/**
* Set the range of bytes or (row, column) positions in which the query
* will be executed.

View file

@ -236,6 +236,7 @@ struct TSQueryCursor {
TSPoint end_point;
bool ascending;
bool halted;
bool did_exceed_match_limit;
};
static const TSQueryError PARENT_DONE = -1;
@ -2103,6 +2104,7 @@ void ts_query_disable_pattern(
TSQueryCursor *ts_query_cursor_new(void) {
TSQueryCursor *self = ts_malloc(sizeof(TSQueryCursor));
*self = (TSQueryCursor) {
.did_exceed_match_limit = false,
.ascending = false,
.halted = false,
.states = array_new(),
@ -2126,6 +2128,10 @@ void ts_query_cursor_delete(TSQueryCursor *self) {
ts_free(self);
}
// Report whether, during the cursor's most recent execution, an in-progress
// match was dropped to make room for further matches. The flag is set in
// `ts_query_cursor__prepare_to_capture` and cleared on each call to
// `ts_query_cursor_exec`.
bool ts_query_cursor_did_exceed_match_limit(const TSQueryCursor *self) {
  return self->did_exceed_match_limit;
}
void ts_query_cursor_exec(
TSQueryCursor *self,
const TSQuery *query,
@ -2140,6 +2146,7 @@ void ts_query_cursor_exec(
self->ascending = false;
self->halted = false;
self->query = query;
self->did_exceed_match_limit = false;
}
void ts_query_cursor_set_byte_range(
@ -2359,6 +2366,7 @@ static CaptureList *ts_query_cursor__prepare_to_capture(
// state has captured the earliest node in the document, and steal its
// capture list.
if (state->capture_list_id == NONE) {
self->did_exceed_match_limit = true;
uint32_t state_index, byte_offset, pattern_index;
if (
ts_query_cursor__first_in_progress_capture(

View file

@ -353,14 +353,12 @@ void ts_tree_cursor_current_status(
// Determine if the current node can have later siblings with the same field name.
if (*field_id) {
for (const TSFieldMapEntry *i = field_map; i < field_map_end; i++) {
if (i->field_id == *field_id) {
if (
i->child_index > entry->structural_child_index ||
(i->child_index == entry->structural_child_index && *has_later_named_siblings)
) {
*can_have_later_siblings_with_this_field = true;
break;
}
if (
i->field_id == *field_id &&
i->child_index > entry->structural_child_index
) {
*can_have_later_siblings_with_this_field = true;
break;
}
}
}