Merge pull request #965 from tree-sitter/query-fixes

Fix problems found in investigating dropped query matches
This commit is contained in:
Max Brunsfeld 2021-03-05 15:26:58 -08:00 committed by GitHub
commit 5a964b1b70
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 210 additions and 31 deletions

View file

@ -100,6 +100,7 @@ impl<'a> ParseTableBuilder<'a> {
variable_index: extra_non_terminal.index as u32,
production,
step_index: 1,
has_preceding_inherited_fields: false,
},
&[Symbol::end_of_nonterminal_extra()]
.iter()
@ -197,16 +198,33 @@ impl<'a> ParseTableBuilder<'a> {
// next symbol. Advance the item to its next step and insert the resulting
// item into the successor item set.
if let Some(next_symbol) = item.symbol() {
let successor = item.successor();
let mut successor = item.successor();
if next_symbol.is_non_terminal() {
let variable = &self.syntax_grammar.variables[next_symbol.index];
// Keep track of where auxiliary non-terminals (repeat symbols) are
// used within visible symbols. This information may be needed later
// for conflict resolution.
if self.syntax_grammar.variables[next_symbol.index].is_auxiliary() {
if variable.is_auxiliary() {
preceding_auxiliary_symbols
.push(self.get_auxiliary_node_info(&item_set, next_symbol));
}
// For most parse items, the symbols associated with the preceding children
// don't matter: they have no effect on the REDUCE action that would be
// performed at the end of the item. But the symbols *do* matter for
// children that are hidden and have fields, because those fields are
// "inherited" by the parent node.
//
// If this item has consumed a hidden child with fields, then the symbols
// of its preceding children need to be taken into account when comparing
// it with other items.
if variable.is_hidden()
&& !self.variable_info[next_symbol.index].fields.is_empty()
{
successor.has_preceding_inherited_fields = true;
}
non_terminal_successors
.entry(next_symbol)
.or_insert_with(|| ParseItemSet::default())

View file

@ -22,18 +22,42 @@ lazy_static! {
};
}
/// A ParseItem represents an in-progress match of a single production in a grammar.
// `Copy` is cheap here: an item is just two indices, a flag, and a shared
// reference to its production.
#[derive(Clone, Copy, Debug)]
pub(crate) struct ParseItem<'a> {
    /// The index of the parent rule within the grammar.
    pub variable_index: u32,
    /// The number of symbols that have already been matched.
    pub step_index: u32,
    /// The production being matched.
    pub production: &'a Production,
    /// A boolean indicating whether any of the already-matched children were
    /// hidden nodes and had fields. Ordinarily, a parse item's behavior is not
    /// affected by the symbols of its preceding children; it only needs to
    /// keep track of their fields and aliases.
    ///
    /// Take for example these two items:
    ///   X -> a b • c
    ///   X -> a g • c
    ///
    /// They can be considered equivalent, for the purposes of parse table
    /// generation, because they entail the same actions. But if this flag is
    /// true, then the item's set of inherited fields may depend on the specific
    /// symbols of its preceding children, so those symbols must participate in
    /// item comparison and hashing.
    pub has_preceding_inherited_fields: bool,
}
/// A ParseItemSet represents a set of in-progress matches of productions in a
/// grammar, and for each in-progress match, a set of "lookaheads" - tokens that
/// are allowed to *follow* the in-progress rule. This object corresponds directly
/// to a state in the final parse table.
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) struct ParseItemSet<'a> {
    /// Each entry pairs one in-progress item with the set of tokens that may
    /// follow it.
    pub entries: Vec<(ParseItem<'a>, TokenSet)>,
}
/// A ParseItemSetCore is like a ParseItemSet, but without the lookahead
/// information. Parse states with the same core are candidates for merging.
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) struct ParseItemSetCore<'a> {
pub entries: Vec<ParseItem<'a>>,
@ -64,6 +88,7 @@ impl<'a> ParseItem<'a> {
variable_index: u32::MAX,
production: &START_PRODUCTION,
step_index: 0,
has_preceding_inherited_fields: false,
}
}
@ -100,13 +125,23 @@ impl<'a> ParseItem<'a> {
self.variable_index == u32::MAX
}
/// Create an item like this one, but advanced by one step.
///
/// Every field other than the step index carries over unchanged, including
/// the `has_preceding_inherited_fields` flag.
pub fn successor(&self) -> ParseItem<'a> {
    // `ParseItem` is `Copy`, so struct-update syntax copies all remaining
    // fields from `*self` while overriding only the step index.
    ParseItem {
        step_index: self.step_index + 1,
        ..*self
    }
}
/// Create an item identical to this one, but with a different production.
/// This is used when dynamically "inlining" certain symbols in a production.
pub fn substitute_production(&self, production: &'a Production) -> ParseItem<'a> {
let mut result = self.clone();
result.production = production;
result
}
}
impl<'a> ParseItemSet<'a> {
@ -258,19 +293,18 @@ impl<'a> Hash for ParseItem<'a> {
self.precedence().hash(hasher);
self.associativity().hash(hasher);
// When comparing two parse items, the symbols that were already consumed by
// both items don't matter. Take for example these two items:
// X -> a b • c
// X -> a d • c
// These two items can be considered equivalent, for the purposes of parse
// table generation, because they entail the same actions. However, if the
// productions have different aliases or field names, they *cannot* be
// treated as equivalent, because those details are ultimately stored as
// attributes of the `REDUCE` action that will be performed when the item
// is finished.
// The already-matched children don't play any role in the parse state for
// this item, unless any of the following are true:
// * the children have fields
// * the children have aliases
// * the children are hidden and have fields
// See the docs for `has_preceding_inherited_fields`.
for step in &self.production.steps[0..self.step_index as usize] {
step.alias.hash(hasher);
step.field_name.hash(hasher);
if self.has_preceding_inherited_fields {
step.symbol.hash(hasher);
}
}
for step in &self.production.steps[self.step_index as usize..] {
step.hash(hasher);
@ -286,6 +320,7 @@ impl<'a> PartialEq for ParseItem<'a> {
|| self.production.steps.len() != other.production.steps.len()
|| self.precedence() != other.precedence()
|| self.associativity() != other.associativity()
|| self.has_preceding_inherited_fields != other.has_preceding_inherited_fields
{
return false;
}
@ -300,6 +335,11 @@ impl<'a> PartialEq for ParseItem<'a> {
if step.field_name != other.production.steps[i].field_name {
return false;
}
if self.has_preceding_inherited_fields
&& step.symbol != other.production.steps[i].symbol
{
return false;
}
} else if *step != other.production.steps[i] {
return false;
}

View file

@ -204,6 +204,7 @@ impl<'a> ParseItemSetBuilder<'a> {
variable_index,
production,
step_index: 0,
has_preceding_inherited_fields: false,
};
if let Some(inlined_productions) =
@ -213,11 +214,7 @@ impl<'a> ParseItemSetBuilder<'a> {
find_or_push(
additions_for_non_terminal,
TransitiveClosureAddition {
item: ParseItem {
variable_index,
production,
step_index: item.step_index,
},
item: item.substitute_production(production),
info: follow_set_info.clone(),
},
);
@ -248,11 +245,7 @@ impl<'a> ParseItemSetBuilder<'a> {
for production in productions {
self.add_item(
&mut result,
ParseItem {
variable_index: item.variable_index,
production,
step_index: item.step_index,
},
item.substitute_production(production),
lookaheads,
);
}

View file

@ -238,6 +238,10 @@ impl SyntaxVariable {
/// Whether this variable is an auto-generated auxiliary rule
/// (e.g. a repetition helper).
pub fn is_auxiliary(&self) -> bool {
    matches!(self.kind, VariableType::Auxiliary)
}
/// Whether this variable's nodes are hidden in the syntax tree.
/// Auxiliary variables are always hidden.
pub fn is_hidden(&self) -> bool {
    matches!(self.kind, VariableType::Hidden | VariableType::Auxiliary)
}
}
impl InlinedProductionMap {

View file

@ -1425,6 +1425,7 @@ fn test_query_matches_with_too_many_permutations_to_track() {
collect_matches(matches, &query, source.as_str())[0],
(0, vec![("pre", "hello"), ("post", "hello")]),
);
assert_eq!(cursor.did_exceed_match_limit(), true);
});
}
@ -1462,6 +1463,7 @@ fn test_query_matches_with_alternatives_and_too_many_permutations_to_track() {
collect_matches(matches, &query, source.as_str()),
vec![(1, vec![("method", "b")]); 50],
);
assert_eq!(cursor.did_exceed_match_limit(), true);
});
}
@ -1850,7 +1852,7 @@ fn test_query_matches_with_repeated_fields() {
"
struct S {
int a, b, c;
}
};
",
&[
(0, vec![("field", "a")]),
@ -1861,6 +1863,94 @@ fn test_query_matches_with_repeated_fields() {
});
}
#[test]
fn test_query_matches_with_deeply_nested_patterns_with_fields() {
    allocations::record(|| {
        let language = get_language("python");
        // Four patterns over `call` nodes. Only the first two match the
        // source below; the last two match nothing, but they force the query
        // engine to keep additional in-progress states and captures alive.
        let query = Query::new(
            language,
            "
(call
function: (_) @func
arguments: (_) @args)
(call
function: (attribute
object: (_) @receiver
attribute: (identifier) @method)
arguments: (argument_list))
; These don't match anything, but they require additional
; states to keep track of their captures.
(call
function: (_) @fn
arguments: (argument_list
(keyword_argument
name: (identifier) @name
value: (_) @val) @arg) @args) @call
(call
function: (identifier) @fn
(#eq? @fn \"super\")) @super_call
",
        )
        .unwrap();
        // A long chain of nested method calls: every call yields a match for
        // pattern 0, and every attribute-style call (`x.y(...)`) also yields
        // a match for pattern 1, with the receiver growing at each step.
        assert_query_matches(
            language,
            &query,
            "
a(1).b(2).c(3).d(4).e(5).f(6).g(7).h(8)
",
            &[
                (0, vec![("func", "a"), ("args", "(1)")]),
                (0, vec![("func", "a(1).b"), ("args", "(2)")]),
                (1, vec![("receiver", "a(1)"), ("method", "b")]),
                (0, vec![("func", "a(1).b(2).c"), ("args", "(3)")]),
                (1, vec![("receiver", "a(1).b(2)"), ("method", "c")]),
                (0, vec![("func", "a(1).b(2).c(3).d"), ("args", "(4)")]),
                (1, vec![("receiver", "a(1).b(2).c(3)"), ("method", "d")]),
                (0, vec![("func", "a(1).b(2).c(3).d(4).e"), ("args", "(5)")]),
                (
                    1,
                    vec![("receiver", "a(1).b(2).c(3).d(4)"), ("method", "e")],
                ),
                (
                    0,
                    vec![("func", "a(1).b(2).c(3).d(4).e(5).f"), ("args", "(6)")],
                ),
                (
                    1,
                    vec![("receiver", "a(1).b(2).c(3).d(4).e(5)"), ("method", "f")],
                ),
                (
                    0,
                    vec![("func", "a(1).b(2).c(3).d(4).e(5).f(6).g"), ("args", "(7)")],
                ),
                (
                    1,
                    vec![
                        ("receiver", "a(1).b(2).c(3).d(4).e(5).f(6)"),
                        ("method", "g"),
                    ],
                ),
                (
                    0,
                    vec![
                        ("func", "a(1).b(2).c(3).d(4).e(5).f(6).g(7).h"),
                        ("args", "(8)"),
                    ],
                ),
                (
                    1,
                    vec![
                        ("receiver", "a(1).b(2).c(3).d(4).e(5).f(6).g(7)"),
                        ("method", "h"),
                    ],
                ),
            ],
        );
    });
}
#[test]
fn test_query_matches_with_indefinite_step_containing_no_captures() {
allocations::record(|| {
@ -3031,6 +3121,7 @@ fn assert_query_matches(
let mut cursor = QueryCursor::new();
let matches = cursor.matches(&query, tree.root_node(), to_callback(source));
assert_eq!(collect_matches(matches, &query, source), expected);
assert_eq!(cursor.did_exceed_match_limit(), false);
}
fn collect_matches<'a>(

View file

@ -720,6 +720,16 @@ extern "C" {
#[doc = " Start running a given query on a given node."]
pub fn ts_query_cursor_exec(arg1: *mut TSQueryCursor, arg2: *const TSQuery, arg3: TSNode);
}
// NOTE(review): FFI binding (presumably bindgen-generated) — keep the
// signature and doc text in sync with the declaration in the C header.
extern "C" {
    #[doc = " Check if this cursor has exceeded its maximum number of in-progress"]
    #[doc = " matches."]
    #[doc = ""]
    #[doc = " Currently, query cursors have a fixed capacity for storing lists"]
    #[doc = " of in-progress captures. If this capacity is exceeded, then the"]
    #[doc = " earliest-starting match will silently be dropped to make room for"]
    #[doc = " further matches."]
    pub fn ts_query_cursor_did_exceed_match_limit(arg1: *const TSQueryCursor) -> bool;
}
extern "C" {
#[doc = " Set the range of bytes or (row, column) positions in which the query"]
#[doc = " will be executed."]

View file

@ -1595,6 +1595,12 @@ impl QueryCursor {
QueryCursor(unsafe { NonNull::new_unchecked(ffi::ts_query_cursor_new()) })
}
/// Check if, on its last execution, this cursor exceeded its maximum number of
/// in-progress matches.
///
/// The underlying flag is cleared each time query execution begins, so this
/// reports only on the most recent execution.
pub fn did_exceed_match_limit(&self) -> bool {
    // SAFETY: `self.0` is a `NonNull` pointer obtained from
    // `ts_query_cursor_new`, so it is valid for the lifetime of `self`.
    unsafe { ffi::ts_query_cursor_did_exceed_match_limit(self.0.as_ptr()) }
}
/// Iterate over all of the matches in the order that they were found.
///
/// Each match contains the index of the pattern that matched, and a list of captures.

View file

@ -791,6 +791,17 @@ void ts_query_cursor_delete(TSQueryCursor *);
*/
void ts_query_cursor_exec(TSQueryCursor *, const TSQuery *, TSNode);
/**
 * Check if this cursor has exceeded its maximum number of in-progress
 * matches.
 *
 * Currently, query cursors have a fixed capacity for storing lists
 * of in-progress captures. If this capacity is exceeded, then the
 * earliest-starting match will silently be dropped to make room for
 * further matches.
 *
 * The flag reported here is reset each time `ts_query_cursor_exec` is
 * called, so it describes only the most recent execution.
 */
bool ts_query_cursor_did_exceed_match_limit(const TSQueryCursor *);
/**
* Set the range of bytes or (row, column) positions in which the query
* will be executed.

View file

@ -236,6 +236,7 @@ struct TSQueryCursor {
TSPoint end_point;
bool ascending;
bool halted;
bool did_exceed_match_limit;
};
static const TSQueryError PARENT_DONE = -1;
@ -2103,6 +2104,7 @@ void ts_query_disable_pattern(
TSQueryCursor *ts_query_cursor_new(void) {
TSQueryCursor *self = ts_malloc(sizeof(TSQueryCursor));
*self = (TSQueryCursor) {
.did_exceed_match_limit = false,
.ascending = false,
.halted = false,
.states = array_new(),
@ -2126,6 +2128,10 @@ void ts_query_cursor_delete(TSQueryCursor *self) {
ts_free(self);
}
// Report whether, during the cursor's most recent execution, an in-progress
// match was dropped to make room for further matches. The flag is set in
// `ts_query_cursor__prepare_to_capture` and cleared on each call to
// `ts_query_cursor_exec`.
bool ts_query_cursor_did_exceed_match_limit(const TSQueryCursor *self) {
  return self->did_exceed_match_limit;
}
void ts_query_cursor_exec(
TSQueryCursor *self,
const TSQuery *query,
@ -2140,6 +2146,7 @@ void ts_query_cursor_exec(
self->ascending = false;
self->halted = false;
self->query = query;
self->did_exceed_match_limit = false;
}
void ts_query_cursor_set_byte_range(
@ -2359,6 +2366,7 @@ static CaptureList *ts_query_cursor__prepare_to_capture(
// state has captured the earliest node in the document, and steal its
// capture list.
if (state->capture_list_id == NONE) {
self->did_exceed_match_limit = true;
uint32_t state_index, byte_offset, pattern_index;
if (
ts_query_cursor__first_in_progress_capture(

View file

@ -353,14 +353,12 @@ void ts_tree_cursor_current_status(
// Determine if the current node can have later siblings with the same field name.
if (*field_id) {
for (const TSFieldMapEntry *i = field_map; i < field_map_end; i++) {
if (i->field_id == *field_id) {
if (
i->child_index > entry->structural_child_index ||
(i->child_index == entry->structural_child_index && *has_later_named_siblings)
) {
*can_have_later_siblings_with_this_field = true;
break;
}
if (
i->field_id == *field_id &&
i->child_index > entry->structural_child_index
) {
*can_have_later_siblings_with_this_field = true;
break;
}
}
}