query: Allow configurable match limit

The default is now a whopping 64K matches, which "should be enough for everyone". You can use the new `ts_query_cursor_set_match_limit` function to set this to a lower limit, such as the previous default of 32.
2021-06-02 11:24:58 -04:00 · 2021-06-02 11:24:58 -04:00 · cd96552448
commit cd96552448
parent 78010722a4
5 changed files with 106 additions and 23 deletions
--- a/cli/src/tests/query_test.rs
+++ b/cli/src/tests/query_test.rs
@ -1625,7 +1625,6 @@ fn test_query_matches_with_different_tokens_with_the_same_string_value() {
    });
 }

-/*
 #[test]
 fn test_query_matches_with_too_many_permutations_to_track() {
    allocations::record(|| {
@ -1646,6 +1645,7 @@ fn test_query_matches_with_too_many_permutations_to_track() {
        parser.set_language(language).unwrap();
        let tree = parser.parse(&source, None).unwrap();
        let mut cursor = QueryCursor::new();
+        cursor.set_match_limit(32);
        let matches = cursor.matches(&query, tree.root_node(), to_callback(&source));

        // For this pathological query, some match permutations will be dropped.
@ -1687,6 +1687,7 @@ fn test_query_matches_with_alternatives_and_too_many_permutations_to_track() {
        parser.set_language(language).unwrap();
        let tree = parser.parse(&source, None).unwrap();
        let mut cursor = QueryCursor::new();
+        cursor.set_match_limit(32);
        let matches = cursor.matches(&query, tree.root_node(), to_callback(&source));

        assert_eq!(
@ -1696,7 +1697,6 @@ fn test_query_matches_with_alternatives_and_too_many_permutations_to_track() {
        assert_eq!(cursor.did_exceed_match_limit(), true);
    });
 }
-*/

 #[test]
 fn test_query_matches_with_anonymous_tokens() {
@ -2704,7 +2704,6 @@ fn test_query_captures_with_many_nested_results_with_fields() {
    });
 }

-/*
 #[test]
 fn test_query_captures_with_too_many_nested_results() {
    allocations::record(|| {
@ -2768,6 +2767,7 @@ fn test_query_captures_with_too_many_nested_results() {
        parser.set_language(language).unwrap();
        let tree = parser.parse(&source, None).unwrap();
        let mut cursor = QueryCursor::new();
+        cursor.set_match_limit(32);
        let captures = cursor.captures(&query, tree.root_node(), to_callback(&source));
        let captures = collect_captures(captures, &query, &source);

@ -2795,7 +2795,6 @@ fn test_query_captures_with_too_many_nested_results() {
        );
    });
 }
-*/

 #[test]
 fn test_query_captures_with_definite_pattern_containing_many_nested_matches() {
--- a/lib/binding_rust/bindings.rs
+++ b/lib/binding_rust/bindings.rs
@ -726,15 +726,27 @@ extern "C" {
    pub fn ts_query_cursor_exec(arg1: *mut TSQueryCursor, arg2: *const TSQuery, arg3: TSNode);
 }
 extern "C" {
-    #[doc = " Check if this cursor has exceeded its maximum number of in-progress"]
-    #[doc = " matches."]
+    #[doc = " Manage the maximum number of in-progress matches allowed by this query"]
+    #[doc = " cursor."]
    #[doc = ""]
-    #[doc = " Currently, query cursors have a fixed capacity for storing lists"]
-    #[doc = " of in-progress captures. If this capacity is exceeded, then the"]
-    #[doc = " earliest-starting match will silently be dropped to make room for"]
-    #[doc = " further matches."]
+    #[doc = " Query cursors have a maximum capacity for storing lists of in-progress"]
+    #[doc = " captures. If this capacity is exceeded, then the earliest-starting match will"]
+    #[doc = " silently be dropped to make room for further matches."]
+    #[doc = ""]
+    #[doc = " By default, this limit is 65,536 pending matches, which is effectively"]
+    #[doc = " unlimited for most queries and syntax trees. You can optionally set this to a"]
+    #[doc = " lower number if you want to have (and check) a tighter bound on query"]
+    #[doc = " complexity."]
+    #[doc = ""]
+    #[doc = " If you update the match limit, it must be > 0 and <= 65536."]
    pub fn ts_query_cursor_did_exceed_match_limit(arg1: *const TSQueryCursor) -> bool;
 }
+extern "C" {
+    pub fn ts_query_cursor_match_limit(arg1: *const TSQueryCursor) -> u32;
+}
+extern "C" {
+    pub fn ts_query_cursor_set_match_limit(arg1: *mut TSQueryCursor, arg2: u32);
+}
 extern "C" {
    #[doc = " Set the range of bytes or (row, column) positions in which the query"]
    #[doc = " will be executed."]
--- a/lib/binding_rust/lib.rs
+++ b/lib/binding_rust/lib.rs
@ -1598,6 +1598,19 @@ impl<'a> QueryCursor {
        QueryCursor(unsafe { NonNull::new_unchecked(ffi::ts_query_cursor_new()) })
    }

+    /// Return the maximum number of in-progress matches for this cursor.
+    pub fn match_limit(&self) -> u32 {
+        unsafe { ffi::ts_query_cursor_match_limit(self.0.as_ptr()) }
+    }
+
+    /// Set the maximum number of in-progress matches for this cursor. Panics unless the limit
+    /// is > 0 and <= 65536; the C library only checks this range with a C `assert`.
+    pub fn set_match_limit(&mut self, limit: u32) {
+        assert!((1..=65536).contains(&limit), "match limit must be > 0 and <= 65536");
+        // SAFETY: `self.0` is a `NonNull` cursor pointer and `limit` was range-checked above.
+        unsafe { ffi::ts_query_cursor_set_match_limit(self.0.as_ptr(), limit) }
+    }
+
    /// Check if, on its last execution, this cursor exceeded its maximum number of
    /// in-progress matches.
    pub fn did_exceed_match_limit(&self) -> bool {
--- a/lib/include/tree_sitter/api.h
+++ b/lib/include/tree_sitter/api.h
@ -798,15 +798,23 @@ void ts_query_cursor_delete(TSQueryCursor *);
 void ts_query_cursor_exec(TSQueryCursor *, const TSQuery *, TSNode);

 /**
- * Check if this cursor has exceeded its maximum number of in-progress
- * matches.
+ * Manage the maximum number of in-progress matches allowed by this query
+ * cursor.
 *
- * Currently, query cursors have a fixed capacity for storing lists
- * of in-progress captures. If this capacity is exceeded, then the
- * earliest-starting match will silently be dropped to make room for
- * further matches.
+ * Query cursors have a maximum capacity for storing lists of in-progress
+ * captures. If this capacity is exceeded, then the earliest-starting match will
+ * silently be dropped to make room for further matches.
+ *
+ * By default, this limit is 65,536 pending matches, which is effectively
+ * unlimited for most queries and syntax trees. You can optionally set this to a
+ * lower number if you want to have (and check) a tighter bound on query
+ * complexity.
+ *
+ * If you update the match limit, it must be > 0 and <= 65536.
 */
 bool ts_query_cursor_did_exceed_match_limit(const TSQueryCursor *);
+uint32_t ts_query_cursor_match_limit(const TSQueryCursor *);
+void ts_query_cursor_set_match_limit(TSQueryCursor *, uint32_t);

 /**
 * Set the range of bytes or (row, column) positions in which the query
--- a/lib/src/query.c
+++ b/lib/src/query.c
@ -175,6 +175,15 @@ typedef Array(CaptureList) CaptureListPoolEntry;
 typedef struct {
   CaptureListPoolEntry list;
   CaptureList empty_list;
+  // The maximum number of capture lists that we are allowed to allocate. We
+  // never allow `list` to allocate more entries than this, dropping pending
+  // matches if needed to stay under the limit.
+  uint32_t max_capture_list_count;
+  // The number of capture lists allocated in `list` that are not currently in
+  // use; those are reused before allocating new ones (a length of UINT32_MAX
+  // marks a list as unused). This is 32 bits wide because `list` may hold up
+  // to 65536 entries, a count that does not fit in a `uint16_t`.
+  uint32_t free_capture_list_count;
 } CaptureListPool;

 /*
@ -358,6 +367,10 @@ static CaptureListPool capture_list_pool_new(void) {
  return (CaptureListPool) {
    .list = array_new(),
    .empty_list = array_new(),
+    // The upper bound on the maximum is 64K, since we use `uint16_t` as our
+    // capture list index type.
+    .max_capture_list_count = 65536,
+    .free_capture_list_count = 0,
  };
 }

@ -366,6 +379,7 @@ static void capture_list_pool_reset(CaptureListPool *self) {
    // This invalid size means that the list is not in use.
    self->list.contents[i].size = UINT32_MAX;
  }
+  self->free_capture_list_count = self->list.size;
 }

 static void capture_list_pool_delete(CaptureListPool *self) {
@ -385,17 +399,30 @@ static CaptureList *capture_list_pool_get_mut(CaptureListPool *self, uint16_t id
  return &self->list.contents[id];
 }

+static bool capture_list_pool_is_empty(const CaptureListPool *self) {
+  // The capture list pool is empty if all allocated lists are in use, and we
+  // have reached the maximum allowed number of allocated lists.
+  return self->free_capture_list_count == 0 && self->list.size >= self->max_capture_list_count;
+}
+
 static uint16_t capture_list_pool_acquire(CaptureListPool *self) {
-  // First see if any already allocated capture lists are currently unused.
-  for (uint16_t i = 0; i < self->list.size; i++) {
-    if (self->list.contents[i].size == UINT32_MAX) {
-      array_clear(&self->list.contents[i]);
-      return i;
+  // First see if any already allocated capture list is currently unused.
+  if (self->free_capture_list_count > 0) {
+    for (uint16_t i = 0; i < self->list.size; i++) {
+      if (self->list.contents[i].size == UINT32_MAX) {
+        array_clear(&self->list.contents[i]);
+        self->free_capture_list_count--;
+        return i;
+      }
    }
  }

-  // Otherwise allocate and initialize a new capture list.
-  uint16_t i = self->list.size;
+  // Otherwise allocate and initialize a new capture list, as long as that
+  // doesn't put us over the requested maximum.
+  uint32_t i = self->list.size;
+  if (i >= self->max_capture_list_count) {
+    return NONE;
+  }
  CaptureList list;
  array_init(&list);
  array_push(&self->list, list);
@ -405,6 +432,7 @@ static uint16_t capture_list_pool_acquire(CaptureListPool *self) {
 static void capture_list_pool_release(CaptureListPool *self, uint16_t id) {
  if (id >= self->list.size) return;
  self->list.contents[id].size = UINT32_MAX;
+  self->free_capture_list_count++;
 }

 /**************
@ -2285,6 +2313,15 @@ bool ts_query_cursor_did_exceed_match_limit(const TSQueryCursor *self) {
  return self->did_exceed_match_limit;
 }

+uint32_t ts_query_cursor_match_limit(const TSQueryCursor *self) {
+  return self->capture_list_pool.max_capture_list_count;
+}
+
+void ts_query_cursor_set_match_limit(TSQueryCursor *self, uint32_t limit) {
+  assert(limit > 0 && limit <= 65536);
+  self->capture_list_pool.max_capture_list_count = limit;
+}
+
 void ts_query_cursor_exec(
  TSQueryCursor *self,
  const TSQuery *query,
@ -3186,6 +3223,20 @@ bool ts_query_cursor_next_capture(
      return true;
    }

+    if (capture_list_pool_is_empty(&self->capture_list_pool)) {
+      LOG(
+        "  abandon state. index:%u, pattern:%u, offset:%u.\n",
+        first_unfinished_state_index,
+        first_unfinished_pattern_index,
+        first_unfinished_capture_byte
+      );
+      capture_list_pool_release(
+        &self->capture_list_pool,
+        self->states.contents[first_unfinished_state_index].capture_list_id
+      );
+      array_erase(&self->states, first_unfinished_state_index);
+    }
+
    // If there are no finished matches that are ready to be returned, then
    // continue finding more matches.
    if (