diff --git a/cli/src/generate/build_tables/build_parse_table.rs b/cli/src/generate/build_tables/build_parse_table.rs index 3b4c8cd8..2231d5b0 100644 --- a/cli/src/generate/build_tables/build_parse_table.rs +++ b/cli/src/generate/build_tables/build_parse_table.rs @@ -100,6 +100,7 @@ impl<'a> ParseTableBuilder<'a> { variable_index: extra_non_terminal.index as u32, production, step_index: 1, + has_preceding_inherited_fields: false, }, &[Symbol::end_of_nonterminal_extra()] .iter() @@ -197,16 +198,33 @@ impl<'a> ParseTableBuilder<'a> { // next symbol. Advance the item to its next step and insert the resulting // item into the successor item set. if let Some(next_symbol) = item.symbol() { - let successor = item.successor(); + let mut successor = item.successor(); if next_symbol.is_non_terminal() { + let variable = &self.syntax_grammar.variables[next_symbol.index]; + // Keep track of where auxiliary non-terminals (repeat symbols) are // used within visible symbols. This information may be needed later // for conflict resolution. - if self.syntax_grammar.variables[next_symbol.index].is_auxiliary() { + if variable.is_auxiliary() { preceding_auxiliary_symbols .push(self.get_auxiliary_node_info(&item_set, next_symbol)); } + // For most parse items, the symbols associated with the preceding children + // don't matter: they have no effect on the REDUCE action that would be + // performed at the end of the item. But the symbols *do* matter for + // children that are hidden and have fields, because those fields are + // "inherited" by the parent node. + // + // If this item has consumed a hidden child with fields, then the symbols + // of its preceding children need to be taken into account when comparing + // it with other items. 
+ if variable.is_hidden() + && !self.variable_info[next_symbol.index].fields.is_empty() + { + successor.has_preceding_inherited_fields = true; + } + non_terminal_successors .entry(next_symbol) .or_insert_with(|| ParseItemSet::default()) diff --git a/cli/src/generate/build_tables/item.rs b/cli/src/generate/build_tables/item.rs index e3205cf8..8917f5c5 100644 --- a/cli/src/generate/build_tables/item.rs +++ b/cli/src/generate/build_tables/item.rs @@ -22,18 +22,42 @@ lazy_static! { }; } +/// A ParseItem represents an in-progress match of a single production in a grammar. #[derive(Clone, Copy, Debug)] pub(crate) struct ParseItem<'a> { + /// The index of the parent rule within the grammar. pub variable_index: u32, + /// The number of symbols that have already been matched. pub step_index: u32, + /// The production being matched. pub production: &'a Production, + /// A boolean indicating whether any of the already-matched children were + /// hidden nodes and had fields. Ordinarily, a parse item's behavior is not + /// affected by the symbols of its preceding children; it only needs to + /// keep track of their fields and aliases. + /// + /// Take for example these two items: + /// X -> a b • c + /// X -> a g • c + /// + /// They can be considered equivalent, for the purposes of parse table + /// generation, because they entail the same actions. But if this flag is + /// true, then the item's set of inherited fields may depend on the specific + /// symbols of its preceding children. + pub has_preceding_inherited_fields: bool, } +/// A ParseItemSet represents a set of in-progress matches of productions in a +/// grammar, and for each in-progress match, a set of "lookaheads" - tokens that +/// are allowed to *follow* the in-progress rule. This object corresponds directly +/// to a state in the final parse table. 
#[derive(Clone, Debug, PartialEq, Eq)] pub(crate) struct ParseItemSet<'a> { pub entries: Vec<(ParseItem<'a>, TokenSet)>, } +/// A ParseItemSetCore is like a ParseItemSet, but without the lookahead +/// information. Parse states with the same core are candidates for merging. #[derive(Clone, Debug, PartialEq, Eq)] pub(crate) struct ParseItemSetCore<'a> { pub entries: Vec<ParseItem<'a>>, @@ -64,6 +88,7 @@ impl<'a> ParseItem<'a> { variable_index: u32::MAX, production: &START_PRODUCTION, step_index: 0, + has_preceding_inherited_fields: false, } } @@ -100,13 +125,23 @@ impl<'a> ParseItem<'a> { self.variable_index == u32::MAX } + /// Create an item like this one, but advanced by one step. pub fn successor(&self) -> ParseItem<'a> { ParseItem { variable_index: self.variable_index, production: self.production, step_index: self.step_index + 1, + has_preceding_inherited_fields: self.has_preceding_inherited_fields, } } + + /// Create an item identical to this one, but with a different production. + /// This is used when dynamically "inlining" certain symbols in a production. + pub fn substitute_production(&self, production: &'a Production) -> ParseItem<'a> { + let mut result = self.clone(); + result.production = production; + result + } } impl<'a> ParseItemSet<'a> { @@ -258,19 +293,18 @@ impl<'a> Hash for ParseItem<'a> { self.precedence().hash(hasher); self.associativity().hash(hasher); - // When comparing two parse items, the symbols that were already consumed by - // both items don't matter. Take for example these two items: - // X -> a b • c - // X -> a d • c - // These two items can be considered equivalent, for the purposes of parse - // table generation, because they entail the same actions. However, if the - // productions have different aliases or field names, they *cannot* be - // treated as equivalent, because those details are ultimately stored as - // attributes of the `REDUCE` action that will be performed when the item - // is finished. 
+ // The already-matched children don't play any role in the parse state for + // this item, unless any of the following are true: + // * the children have fields + // * the children have aliases + // * the children are hidden and have fields of their own + // See the docs for `has_preceding_inherited_fields`. for step in &self.production.steps[0..self.step_index as usize] { step.alias.hash(hasher); step.field_name.hash(hasher); + if self.has_preceding_inherited_fields { + step.symbol.hash(hasher); + } } for step in &self.production.steps[self.step_index as usize..] { step.hash(hasher); } @@ -286,6 +320,7 @@ impl<'a> PartialEq for ParseItem<'a> { || self.production.steps.len() != other.production.steps.len() || self.precedence() != other.precedence() || self.associativity() != other.associativity() + || self.has_preceding_inherited_fields != other.has_preceding_inherited_fields { return false; } @@ -300,6 +335,11 @@ impl<'a> PartialEq for ParseItem<'a> { if step.field_name != other.production.steps[i].field_name { return false; } + if self.has_preceding_inherited_fields + && step.symbol != other.production.steps[i].symbol + { + return false; + } } else if *step != other.production.steps[i] { return false; } diff --git a/cli/src/generate/build_tables/item_set_builder.rs b/cli/src/generate/build_tables/item_set_builder.rs index 29690829..18283576 100644 --- a/cli/src/generate/build_tables/item_set_builder.rs +++ b/cli/src/generate/build_tables/item_set_builder.rs @@ -204,6 +204,7 @@ impl<'a> ParseItemSetBuilder<'a> { variable_index, production, step_index: 0, + has_preceding_inherited_fields: false, }; if let Some(inlined_productions) = @@ -213,11 +214,7 @@ find_or_push( additions_for_non_terminal, TransitiveClosureAddition { - item: ParseItem { - variable_index, - production, - step_index: item.step_index, - }, + item: item.substitute_production(production), info: follow_set_info.clone(), }, ); @@ -248,11 +245,7 @@ for 
production in productions { self.add_item( &mut result, - ParseItem { - variable_index: item.variable_index, - production, - step_index: item.step_index, - }, + item.substitute_production(production), lookaheads, ); } diff --git a/cli/src/generate/grammars.rs b/cli/src/generate/grammars.rs index 33a5dcb4..08ce8da8 100644 --- a/cli/src/generate/grammars.rs +++ b/cli/src/generate/grammars.rs @@ -238,6 +238,10 @@ impl SyntaxVariable { pub fn is_auxiliary(&self) -> bool { self.kind == VariableType::Auxiliary } + + pub fn is_hidden(&self) -> bool { + self.kind == VariableType::Hidden || self.kind == VariableType::Auxiliary + } } impl InlinedProductionMap { diff --git a/cli/src/tests/query_test.rs b/cli/src/tests/query_test.rs index 8a44af3c..f41cdf3f 100644 --- a/cli/src/tests/query_test.rs +++ b/cli/src/tests/query_test.rs @@ -1425,6 +1425,7 @@ fn test_query_matches_with_too_many_permutations_to_track() { collect_matches(matches, &query, source.as_str())[0], (0, vec![("pre", "hello"), ("post", "hello")]), ); + assert_eq!(cursor.did_exceed_match_limit(), true); }); } @@ -1462,6 +1463,7 @@ fn test_query_matches_with_alternatives_and_too_many_permutations_to_track() { collect_matches(matches, &query, source.as_str()), vec![(1, vec![("method", "b")]); 50], ); + assert_eq!(cursor.did_exceed_match_limit(), true); }); } @@ -1850,7 +1852,7 @@ fn test_query_matches_with_repeated_fields() { " struct S { int a, b, c; - } + }; ", &[ (0, vec![("field", "a")]), @@ -1861,6 +1863,94 @@ fn test_query_matches_with_repeated_fields() { }); } +#[test] +fn test_query_matches_with_deeply_nested_patterns_with_fields() { + allocations::record(|| { + let language = get_language("python"); + let query = Query::new( + language, + " + (call + function: (_) @func + arguments: (_) @args) + (call + function: (attribute + object: (_) @receiver + attribute: (identifier) @method) + arguments: (argument_list)) + + ; These don't match anything, but they require additional + ; states to keep track of their 
captures. + (call + function: (_) @fn + arguments: (argument_list + (keyword_argument + name: (identifier) @name + value: (_) @val) @arg) @args) @call + (call + function: (identifier) @fn + (#eq? @fn \"super\")) @super_call + ", + ) + .unwrap(); + + assert_query_matches( + language, + &query, + " + a(1).b(2).c(3).d(4).e(5).f(6).g(7).h(8) + ", + &[ + (0, vec![("func", "a"), ("args", "(1)")]), + (0, vec![("func", "a(1).b"), ("args", "(2)")]), + (1, vec![("receiver", "a(1)"), ("method", "b")]), + (0, vec![("func", "a(1).b(2).c"), ("args", "(3)")]), + (1, vec![("receiver", "a(1).b(2)"), ("method", "c")]), + (0, vec![("func", "a(1).b(2).c(3).d"), ("args", "(4)")]), + (1, vec![("receiver", "a(1).b(2).c(3)"), ("method", "d")]), + (0, vec![("func", "a(1).b(2).c(3).d(4).e"), ("args", "(5)")]), + ( + 1, + vec![("receiver", "a(1).b(2).c(3).d(4)"), ("method", "e")], + ), + ( + 0, + vec![("func", "a(1).b(2).c(3).d(4).e(5).f"), ("args", "(6)")], + ), + ( + 1, + vec![("receiver", "a(1).b(2).c(3).d(4).e(5)"), ("method", "f")], + ), + ( + 0, + vec![("func", "a(1).b(2).c(3).d(4).e(5).f(6).g"), ("args", "(7)")], + ), + ( + 1, + vec![ + ("receiver", "a(1).b(2).c(3).d(4).e(5).f(6)"), + ("method", "g"), + ], + ), + ( + 0, + vec![ + ("func", "a(1).b(2).c(3).d(4).e(5).f(6).g(7).h"), + ("args", "(8)"), + ], + ), + ( + 1, + vec![ + ("receiver", "a(1).b(2).c(3).d(4).e(5).f(6).g(7)"), + ("method", "h"), + ], + ), + ], + ); + }); +} + #[test] fn test_query_matches_with_indefinite_step_containing_no_captures() { allocations::record(|| { @@ -3031,6 +3121,7 @@ fn assert_query_matches( let mut cursor = QueryCursor::new(); let matches = cursor.matches(&query, tree.root_node(), to_callback(source)); assert_eq!(collect_matches(matches, &query, source), expected); + assert_eq!(cursor.did_exceed_match_limit(), false); } fn collect_matches<'a>( diff --git a/lib/binding_rust/bindings.rs b/lib/binding_rust/bindings.rs index 154ef826..a749ef98 100644 --- a/lib/binding_rust/bindings.rs +++ 
b/lib/binding_rust/bindings.rs @@ -720,6 +720,16 @@ extern "C" { #[doc = " Start running a given query on a given node."] pub fn ts_query_cursor_exec(arg1: *mut TSQueryCursor, arg2: *const TSQuery, arg3: TSNode); } +extern "C" { + #[doc = " Check if this cursor has exceeded its maximum number of in-progress"] + #[doc = " matches."] + #[doc = ""] + #[doc = " Currently, query cursors have a fixed capacity for storing lists"] + #[doc = " of in-progress captures. If this capacity is exceeded, then the"] + #[doc = " earliest-starting match will silently be dropped to make room for"] + #[doc = " further matches."] + pub fn ts_query_cursor_did_exceed_match_limit(arg1: *const TSQueryCursor) -> bool; +} extern "C" { #[doc = " Set the range of bytes or (row, column) positions in which the query"] #[doc = " will be executed."] diff --git a/lib/binding_rust/lib.rs b/lib/binding_rust/lib.rs index 62a67f33..f7422c67 100644 --- a/lib/binding_rust/lib.rs +++ b/lib/binding_rust/lib.rs @@ -1595,6 +1595,12 @@ impl QueryCursor { QueryCursor(unsafe { NonNull::new_unchecked(ffi::ts_query_cursor_new()) }) } + /// Check if, on its last execution, this cursor exceeded its maximum number of + /// in-progress matches. + pub fn did_exceed_match_limit(&self) -> bool { + unsafe { ffi::ts_query_cursor_did_exceed_match_limit(self.0.as_ptr()) } + } + /// Iterate over all of the matches in the order that they were found. /// /// Each match contains the index of the pattern that matched, and a list of captures. diff --git a/lib/include/tree_sitter/api.h b/lib/include/tree_sitter/api.h index 3299fd20..22837035 100644 --- a/lib/include/tree_sitter/api.h +++ b/lib/include/tree_sitter/api.h @@ -791,6 +791,17 @@ void ts_query_cursor_delete(TSQueryCursor *); */ void ts_query_cursor_exec(TSQueryCursor *, const TSQuery *, TSNode); +/** + * Check if this cursor has exceeded its maximum number of in-progress + * matches. 
+ * + * Currently, query cursors have a fixed capacity for storing lists + * of in-progress captures. If this capacity is exceeded, then the + * earliest-starting match will silently be dropped to make room for + * further matches. + */ +bool ts_query_cursor_did_exceed_match_limit(const TSQueryCursor *); + /** * Set the range of bytes or (row, column) positions in which the query * will be executed. diff --git a/lib/src/query.c b/lib/src/query.c index d28e1b92..5a20603d 100644 --- a/lib/src/query.c +++ b/lib/src/query.c @@ -236,6 +236,7 @@ struct TSQueryCursor { TSPoint end_point; bool ascending; bool halted; + bool did_exceed_match_limit; }; static const TSQueryError PARENT_DONE = -1; @@ -2103,6 +2104,7 @@ void ts_query_disable_pattern( TSQueryCursor *ts_query_cursor_new(void) { TSQueryCursor *self = ts_malloc(sizeof(TSQueryCursor)); *self = (TSQueryCursor) { + .did_exceed_match_limit = false, .ascending = false, .halted = false, .states = array_new(), @@ -2126,6 +2128,10 @@ void ts_query_cursor_delete(TSQueryCursor *self) { ts_free(self); } +bool ts_query_cursor_did_exceed_match_limit(const TSQueryCursor *self) { + return self->did_exceed_match_limit; +} + void ts_query_cursor_exec( TSQueryCursor *self, const TSQuery *query, @@ -2140,6 +2146,7 @@ void ts_query_cursor_exec( self->ascending = false; self->halted = false; self->query = query; + self->did_exceed_match_limit = false; } void ts_query_cursor_set_byte_range( @@ -2359,6 +2366,7 @@ static CaptureList *ts_query_cursor__prepare_to_capture( // state has captured the earliest node in the document, and steal its // capture list. 
if (state->capture_list_id == NONE) { + self->did_exceed_match_limit = true; uint32_t state_index, byte_offset, pattern_index; if ( ts_query_cursor__first_in_progress_capture( diff --git a/lib/src/tree_cursor.c b/lib/src/tree_cursor.c index 4c6f1b82..c4ee7a90 100644 --- a/lib/src/tree_cursor.c +++ b/lib/src/tree_cursor.c @@ -353,14 +353,12 @@ void ts_tree_cursor_current_status( // Determine if the current node can have later siblings with the same field name. if (*field_id) { for (const TSFieldMapEntry *i = field_map; i < field_map_end; i++) { - if (i->field_id == *field_id) { - if ( - i->child_index > entry->structural_child_index || - (i->child_index == entry->structural_child_index && *has_later_named_siblings) - ) { - *can_have_later_siblings_with_this_field = true; - break; - } + if ( + i->field_id == *field_id && + i->child_index > entry->structural_child_index + ) { + *can_have_later_siblings_with_this_field = true; + break; } } }