Merge branch 'master' into query-cursor-api

2021-06-02 11:40:48 -07:00 · 2021-06-02 11:40:48 -07:00 · f3ea60e23f
commit f3ea60e23f
parent 8416894923 82f3d3232b
11 changed files with 150 additions and 119 deletions
--- a/cli/src/tests/query_test.rs
+++ b/cli/src/tests/query_test.rs
@ -1645,6 +1645,7 @@ fn test_query_matches_with_too_many_permutations_to_track() {
        parser.set_language(language).unwrap();
        let tree = parser.parse(&source, None).unwrap();
        let mut cursor = QueryCursor::new();
+        cursor.set_match_limit(32);
        let matches = cursor.matches(&query, tree.root_node(), source.as_bytes());

        // For this pathological query, some match permutations will be dropped.
@ -1686,6 +1687,7 @@ fn test_query_matches_with_alternatives_and_too_many_permutations_to_track() {
        parser.set_language(language).unwrap();
        let tree = parser.parse(&source, None).unwrap();
        let mut cursor = QueryCursor::new();
+        cursor.set_match_limit(32);
        let matches = cursor.matches(&query, tree.root_node(), source.as_bytes());

        assert_eq!(
@ -2905,6 +2907,7 @@ fn test_query_captures_with_too_many_nested_results() {
        parser.set_language(language).unwrap();
        let tree = parser.parse(&source, None).unwrap();
        let mut cursor = QueryCursor::new();
+        cursor.set_match_limit(32);
        let captures = cursor.captures(&query, tree.root_node(), source.as_bytes());
        let captures = collect_captures(captures, &query, &source);

--- a/cli/src/wasm.rs
+++ b/cli/src/wasm.rs
@ -3,7 +3,6 @@ use super::generate::parse_grammar::GrammarJSON;
 use std::ffi::{OsStr, OsString};
 use std::fs;
 use std::path::Path;
-use std::path::PathBuf;
 use std::process::Command;
 use which::which;

@ -23,15 +22,15 @@ pub fn compile_language_to_wasm(language_dir: &Path, force_docker: bool) -> Resu
    let grammar_name = get_grammar_name(&src_dir)?;
    let output_filename = format!("tree-sitter-{}.wasm", grammar_name);

+    let emcc_bin = if cfg!(windows) { "emcc.bat" } else { "emcc" };
+    let emcc_path = which(emcc_bin)
+        .ok()
+        .and_then(|p| Command::new(&p).output().and(Ok(p)).ok());
+
    let mut command;
-    if !force_docker {
-        let emcc_path = get_emcc_path();
-        if emcc_path.is_ok() {
-            command = Command::new(emcc_path.unwrap());
-            command.current_dir(&language_dir);
-        } else {
-            return Err(emcc_path.unwrap_err());
-        }
+    if !force_docker && emcc_path.is_some() {
+        command = Command::new(emcc_path.unwrap());
+        command.current_dir(&language_dir);
    } else if Command::new("docker").output().is_ok() {
        command = Command::new("docker");
        command.args(&["run", "--rm"]);
@ -123,23 +122,3 @@ pub fn compile_language_to_wasm(language_dir: &Path, force_docker: bool) -> Resu

    Ok(())
 }
-
-fn get_emcc_path() -> Result<PathBuf> {
-    let emcc_bin;
-    if cfg!(windows) {
-        emcc_bin = "emcc.bat";
-    } else {
-        emcc_bin = "emcc";
-    };
-    let emcc_which = which(emcc_bin);
-    let emcc_path;
-    if emcc_which.is_ok() {
-        emcc_path = emcc_which.unwrap();
-    } else {
-        return Error::err("emcc was not found on PATH".to_string());
-    }
-    if Command::new(&emcc_path).output().is_ok() {
-        return Ok(emcc_path);
-    }
-    return Error::err("emcc binary doesn't work properly".to_string());
-}
--- a/docs/section-6-contributing.md
+++ b/docs/section-6-contributing.md
@ -106,7 +106,7 @@ The main [`tree-sitter/tree-sitter`](https://github.com/tree-sitter/tree-sitter)

 There are also several other dependent repositories that contain other published packages:

- [`tree-sitter/node-tree-sitter`](https://github.com/tree-sitter/py-tree-sitter) - Node.js bindings to the core library, published as [`tree-sitter`](https://www.npmjs.com/package/tree-sitter) on npmjs.com
+- [`tree-sitter/node-tree-sitter`](https://github.com/tree-sitter/node-tree-sitter) - Node.js bindings to the core library, published as [`tree-sitter`](https://www.npmjs.com/package/tree-sitter) on npmjs.com
 - [`tree-sitter/py-tree-sitter`](https://github.com/tree-sitter/py-tree-sitter) - Python bindings to the core library, published as [`tree-sitter`](https://pypi.org/project/tree-sitter) on [PyPI.org](https://pypi.org).

 ## Publishing New Releases
--- a/lib/binding_rust/bindings.rs
+++ b/lib/binding_rust/bindings.rs
@ -730,15 +730,27 @@ extern "C" {
    pub fn ts_query_cursor_exec(arg1: *mut TSQueryCursor, arg2: *const TSQuery, arg3: TSNode);
 }
 extern "C" {
-    #[doc = " Check if this cursor has exceeded its maximum number of in-progress"]
-    #[doc = " matches."]
+    #[doc = " Manage the maximum number of in-progress matches allowed by this query"]
+    #[doc = " cursor."]
    #[doc = ""]
-    #[doc = " Currently, query cursors have a fixed capacity for storing lists"]
-    #[doc = " of in-progress captures. If this capacity is exceeded, then the"]
-    #[doc = " earliest-starting match will silently be dropped to make room for"]
-    #[doc = " further matches."]
+    #[doc = " Query cursors have a maximum capacity for storing lists of in-progress"]
+    #[doc = " captures. If this capacity is exceeded, then the earliest-starting match will"]
+    #[doc = " silently be dropped to make room for further matches."]
+    #[doc = ""]
+    #[doc = " This maximum capacity is optional: by default, query cursors allow any"]
+    #[doc = " number of pending matches, allocating new space for them as needed. You can"]
+    #[doc = " optionally set a lower limit if you want to have (and check) a tighter bound"]
+    #[doc = " on query complexity."]
+    #[doc = ""]
+    #[doc = " If you update the match limit, it must be > 0 and <= 65536."]
    pub fn ts_query_cursor_did_exceed_match_limit(arg1: *const TSQueryCursor) -> bool;
 }
+extern "C" {
+    pub fn ts_query_cursor_match_limit(arg1: *const TSQueryCursor) -> u32;
+}
+extern "C" {
+    pub fn ts_query_cursor_set_match_limit(arg1: *mut TSQueryCursor, arg2: u32);
+}
 extern "C" {
    #[doc = " Set the range of bytes or (row, column) positions in which the query"]
    #[doc = " will be executed."]
--- a/lib/binding_rust/lib.rs
+++ b/lib/binding_rust/lib.rs
@ -1649,6 +1649,19 @@ impl QueryCursor {
        }
    }

+    /// Return the maximum number of in-progress matches for this cursor.
+    pub fn match_limit(&self) -> u32 {
+        unsafe { ffi::ts_query_cursor_match_limit(self.ptr.as_ptr()) }
+    }
+
+    /// Set the maximum number of in-progress matches for this cursor. The limit must be > 0 and
+    /// <= 65536; the value is passed to the C library unchecked, so callers must ensure this.
+    pub fn set_match_limit(&mut self, limit: u32) {
+        unsafe {
+            ffi::ts_query_cursor_set_match_limit(self.ptr.as_ptr(), limit);
+        }
+    }
+
    /// Check if, on its last execution, this cursor exceeded its maximum number of
    /// in-progress matches.
    pub fn did_exceed_match_limit(&self) -> bool {
--- a/lib/binding_web/binding.c
+++ b/lib/binding_web/binding.c
@ -594,9 +594,15 @@ void ts_query_matches_wasm(
  uint32_t start_row,
  uint32_t start_column,
  uint32_t end_row,
-  uint32_t end_column
+  uint32_t end_column,
+  uint32_t match_limit
 ) {
  if (!scratch_query_cursor) scratch_query_cursor = ts_query_cursor_new();
+  if (match_limit == 0) {
+    ts_query_cursor_set_match_limit(scratch_query_cursor, UINT32_MAX);
+  } else {
+    ts_query_cursor_set_match_limit(scratch_query_cursor, match_limit);
+  }

  TSNode node = unmarshal_node(tree);
  TSPoint start_point = {start_row, code_unit_to_byte(start_column)};
@ -635,9 +641,15 @@ void ts_query_captures_wasm(
  uint32_t start_row,
  uint32_t start_column,
  uint32_t end_row,
-  uint32_t end_column
+  uint32_t end_column,
+  uint32_t match_limit
 ) {
  if (!scratch_query_cursor) scratch_query_cursor = ts_query_cursor_new();
+  if (match_limit == 0) {
+    ts_query_cursor_set_match_limit(scratch_query_cursor, UINT32_MAX);
+  } else {
+    ts_query_cursor_set_match_limit(scratch_query_cursor, match_limit);
+  }

  TSNode node = unmarshal_node(tree);
  TSPoint start_point = {start_row, code_unit_to_byte(start_column)};
--- a/lib/binding_web/binding.js
+++ b/lib/binding_web/binding.js
@ -953,9 +953,17 @@ class Query {
    this[0] = 0;
  }

-  matches(node, startPosition, endPosition) {
+  matches(node, startPosition, endPosition, options) {
    if (!startPosition) startPosition = ZERO_POINT;
    if (!endPosition) endPosition = ZERO_POINT;
+    if (!options) options = {};
+
+    let matchLimit = options.matchLimit;
+    if (typeof matchLimit === 'undefined') {
+      matchLimit = 0;
+    } else if (typeof matchLimit !== 'number') {
+      throw new Error('Arguments must be numbers');
+    }

    marshalNode(node);

@ -965,7 +973,8 @@ class Query {
      startPosition.row,
      startPosition.column,
      endPosition.row,
-      endPosition.column
+      endPosition.column,
+      matchLimit
    );

    const rawCount = getValue(TRANSFER_BUFFER, 'i32');
@ -1000,9 +1009,17 @@ class Query {
    return result;
  }

-  captures(node, startPosition, endPosition) {
+  captures(node, startPosition, endPosition, options) {
    if (!startPosition) startPosition = ZERO_POINT;
    if (!endPosition) endPosition = ZERO_POINT;
+    if (!options) options = {};
+
+    let matchLimit = options.matchLimit;
+    if (typeof matchLimit === 'undefined') {
+      matchLimit = 0;
+    } else if (typeof matchLimit !== 'number') {
+      throw new Error('Arguments must be numbers');
+    }

    marshalNode(node);

@ -1012,7 +1029,8 @@ class Query {
      startPosition.row,
      startPosition.column,
      endPosition.row,
-      endPosition.column
+      endPosition.column,
+      matchLimit
    );

    const count = getValue(TRANSFER_BUFFER, 'i32');
--- a/lib/binding_web/test/query-test.js
+++ b/lib/binding_web/test/query-test.js
@ -256,7 +256,7 @@ describe("Query", () => {
        (array (identifier) @pre (identifier) @post)
      `);

-      const captures = query.captures(tree.rootNode);
+      const captures = query.captures(tree.rootNode, null, null, {matchLimit: 32});
      assert.ok(query.didExceedMatchLimit());
    });
  });
--- a/lib/include/tree_sitter/api.h
+++ b/lib/include/tree_sitter/api.h
@ -799,15 +799,19 @@ void ts_query_cursor_delete(TSQueryCursor *);
 void ts_query_cursor_exec(TSQueryCursor *, const TSQuery *, TSNode);

 /**
- * Check if this cursor has exceeded its maximum number of in-progress
- * matches.
+ * Manage the maximum number of in-progress matches allowed by this query
+ * cursor.
 *
- * Currently, query cursors have a fixed capacity for storing lists
- * of in-progress captures. If this capacity is exceeded, then the
- * earliest-starting match will silently be dropped to make room for
- * further matches.
+ * Query cursors have an optional maximum capacity for storing lists of
+ * in-progress captures. If this capacity is exceeded, then the
+ * earliest-starting match will silently be dropped to make room for further
+ * matches. This maximum capacity is optional — by default, query cursors allow
+ * any number of pending matches, dynamically allocating new space for them as
+ * needed as the query is executed.
 */
 bool ts_query_cursor_did_exceed_match_limit(const TSQueryCursor *);
+uint32_t ts_query_cursor_match_limit(const TSQueryCursor *);
+void ts_query_cursor_set_match_limit(TSQueryCursor *, uint32_t);

 /**
 * Set the range of bytes or (row, column) positions in which the query
--- a/lib/src/bits.h
+++ b/lib/src/bits.h
@ -1,42 +0,0 @@
-#ifndef TREE_SITTER_BITS_H_
-#define TREE_SITTER_BITS_H_
-
-#include <stdint.h>
-
-static inline uint32_t bitmask_for_index(uint16_t id) {
-  return (1u << (31 - id));
-}
-
-#ifdef __TINYC__
-
-// Algorithm taken from the Hacker's Delight book
-// See also https://graphics.stanford.edu/~seander/bithacks.html
-static inline uint32_t count_leading_zeros(uint32_t x) {
-  int count = 0;
-  if (x == 0) return 32;
-  x = x - ((x >> 1) & 0x55555555);
-  x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
-  count = (((x + (x >> 4)) & 0x0f0f0f0f) * 0x01010101) >> 24;
-  return count;
-}
-
-#elif defined _WIN32 && !defined __GNUC__
-
-#include <intrin.h>
-
-static inline uint32_t count_leading_zeros(uint32_t x) {
-  if (x == 0) return 32;
-  uint32_t result;
-  _BitScanReverse(&result, x);
-  return 31 - result;
-}
-
-#else
-
-static inline uint32_t count_leading_zeros(uint32_t x) {
-  if (x == 0) return 32;
-  return __builtin_clz(x);
-}
-
-#endif
-#endif  // TREE_SITTER_BITS_H_
--- a/lib/src/query.c
+++ b/lib/src/query.c
@ -1,7 +1,6 @@
 #include "tree_sitter/api.h"
 #include "./alloc.h"
 #include "./array.h"
-#include "./bits.h"
 #include "./language.h"
 #include "./point.h"
 #include "./tree_cursor.h"
@ -12,7 +11,6 @@
 // #define LOG(...) fprintf(stderr, __VA_ARGS__)
 #define LOG(...)

-#define MAX_CAPTURE_LIST_COUNT 32
 #define MAX_STEP_CAPTURE_COUNT 3
 #define MAX_STATE_PREDECESSOR_COUNT 100
 #define MAX_ANALYSIS_STATE_DEPTH 12
@ -157,10 +155,10 @@ typedef struct {
 */
 typedef struct {
  uint32_t id;
+  uint32_t capture_list_id;
  uint16_t start_depth;
  uint16_t step_index;
  uint16_t pattern_index;
-  uint16_t capture_list_id;
  uint16_t consumed_capture_count: 12;
  bool seeking_immediate_match: 1;
  bool has_in_progress_alternatives: 1;
@ -177,9 +175,17 @@ typedef Array(TSQueryCapture) CaptureList;
 * currently in use by a query state.
 */
 typedef struct {
-  CaptureList list[MAX_CAPTURE_LIST_COUNT];
+  Array(CaptureList) list;
  CaptureList empty_list;
-  uint32_t usage_map;
+  // The maximum number of capture lists that we are allowed to allocate. We
+  // never allow `list` to allocate more entries than this, dropping pending
+  // matches if needed to stay under the limit.
+  uint32_t max_capture_list_count;
+  // The number of capture lists allocated in `list` that are not currently in
+  // use. We reuse those existing-but-unused capture lists before trying to
+  // allocate any new ones. We use an invalid value (UINT32_MAX) for a capture
+  // list's length to indicate that it's not in use.
+  uint32_t free_capture_list_count;
 } CaptureListPool;

 /*
@ -361,54 +367,72 @@ static uint32_t stream_offset(Stream *self) {

 static CaptureListPool capture_list_pool_new(void) {
  return (CaptureListPool) {
+    .list = array_new(),
    .empty_list = array_new(),
-    .usage_map = UINT32_MAX,
+    .max_capture_list_count = UINT32_MAX,
+    .free_capture_list_count = 0,
  };
 }

 static void capture_list_pool_reset(CaptureListPool *self) {
-  self->usage_map = UINT32_MAX;
-  for (unsigned i = 0; i < MAX_CAPTURE_LIST_COUNT; i++) {
-    array_clear(&self->list[i]);
+  for (uint32_t i = 0; i < self->list.size; i++) {
+    // An invalid size marks the list as unused. (32-bit index: uint16_t could wrap.)
+    self->list.contents[i].size = UINT32_MAX;
  }
+  self->free_capture_list_count = self->list.size;
 }

 static void capture_list_pool_delete(CaptureListPool *self) {
-  for (unsigned i = 0; i < MAX_CAPTURE_LIST_COUNT; i++) {
-    array_delete(&self->list[i]);
+  for (uint32_t i = 0; i < self->list.size; i++) {  // 32-bit index: uint16_t could wrap
+    array_delete(&self->list.contents[i]);
  }
+  array_delete(&self->list);  // finally free the outer array of lists
 }

 static const CaptureList *capture_list_pool_get(const CaptureListPool *self, uint16_t id) {
-  if (id >= MAX_CAPTURE_LIST_COUNT) return &self->empty_list;
-  return &self->list[id];
+  if (id >= self->list.size) return &self->empty_list;
+  return &self->list.contents[id];
 }

 static CaptureList *capture_list_pool_get_mut(CaptureListPool *self, uint16_t id) {
-  assert(id < MAX_CAPTURE_LIST_COUNT);
-  return &self->list[id];
+  assert(id < self->list.size);
+  return &self->list.contents[id];
 }

 static bool capture_list_pool_is_empty(const CaptureListPool *self) {
-  return self->usage_map == 0;
+  // The capture list pool is empty if all allocated lists are in use, and we
+  // have reached the maximum allowed number of allocated lists.
+  return self->free_capture_list_count == 0 && self->list.size >= self->max_capture_list_count;
 }

 static uint16_t capture_list_pool_acquire(CaptureListPool *self) {
-  // In the usage_map bitmask, ones represent free lists, and zeros represent
-  // lists that are in use. A free list id can quickly be found by counting
-  // the leading zeros in the usage map. An id of zero corresponds to the
-  // highest-order bit in the bitmask.
-  uint16_t id = count_leading_zeros(self->usage_map);
-  if (id >= MAX_CAPTURE_LIST_COUNT) return NONE;
-  self->usage_map &= ~bitmask_for_index(id);
-  array_clear(&self->list[id]);
-  return id;
+  // First see if any already allocated capture list is currently unused.
+  if (self->free_capture_list_count > 0) {
+    for (uint16_t i = 0; i < self->list.size; i++) {
+      if (self->list.contents[i].size == UINT32_MAX) {
+        array_clear(&self->list.contents[i]);
+        self->free_capture_list_count--;
+        return i;
+      }
+    }
+  }
+
+  // Otherwise allocate a new capture list, unless that would exceed the requested
+  // maximum or yield an id that cannot be returned as a uint16_t below UINT16_MAX.
+  uint32_t i = self->list.size;
+  if (i >= self->max_capture_list_count || i >= UINT16_MAX) {
+    return NONE;
+  }
+  CaptureList list;
+  array_init(&list);
+  array_push(&self->list, list);
+  return (uint16_t)i;
 }

 static void capture_list_pool_release(CaptureListPool *self, uint16_t id) {
-  if (id >= MAX_CAPTURE_LIST_COUNT) return;
-  array_clear(&self->list[id]);
-  self->usage_map |= bitmask_for_index(id);
+  if (id >= self->list.size) return;
+  self->list.contents[id].size = UINT32_MAX;
+  self->free_capture_list_count++;
 }

 /**************
@ -2302,6 +2326,14 @@ bool ts_query_cursor_did_exceed_match_limit(const TSQueryCursor *self) {
  return self->did_exceed_match_limit;
 }

+uint32_t ts_query_cursor_match_limit(const TSQueryCursor *self) {
+  return self->capture_list_pool.max_capture_list_count;
+}
+
+void ts_query_cursor_set_match_limit(TSQueryCursor *self, uint32_t limit) {
+  self->capture_list_pool.max_capture_list_count = limit;  // stored unvalidated; 0 lets no new list be allocated
+}
+
 void ts_query_cursor_exec(
  TSQueryCursor *self,
  const TSQuery *query,