Merge pull request #183 from tree-sitter/detect-included-range-boundaries

Add lexer API for detecting boundaries of included ranges
2018-07-18 09:37:16 -07:00 · 2018-07-18 09:37:16 -07:00 · 16376c43f5
commit 16376c43f5
parent d54412266e 9ecb20650b
11 changed files with 95 additions and 50 deletions
--- a/include/tree_sitter/parser.h
+++ b/include/tree_sitter/parser.h
@ -25,13 +25,16 @@ typedef struct {
  bool named : 1;
 } TSSymbolMetadata;

-typedef struct {
-  void (*advance)(void *, bool);
-  void (*mark_end)(void *);
-  uint32_t (*get_column)(void *);
+typedef struct TSLexer TSLexer;
+
+struct TSLexer {
  int32_t lookahead;
  TSSymbol result_symbol;
-} TSLexer;
+  void (*advance)(TSLexer *, bool);
+  void (*mark_end)(TSLexer *);
+  uint32_t (*get_column)(TSLexer *);
+  bool (*is_at_included_range_start)(TSLexer *);
+};

 typedef enum {
  TSParseActionTypeShift,
--- a/include/tree_sitter/runtime.h
+++ b/include/tree_sitter/runtime.h
@ -10,7 +10,7 @@ extern "C" {
 #include <stdint.h>
 #include <stdbool.h>

-#define TREE_SITTER_LANGUAGE_VERSION 8
+#define TREE_SITTER_LANGUAGE_VERSION 9

 typedef uint16_t TSSymbol;
 typedef struct TSLanguage TSLanguage;
--- a/script/fetch-fixtures
+++ b/script/fetch-fixtures
@ -15,13 +15,13 @@ fetch_grammar() {
  fi

  (
-    cd $grammar_dir;
+    cd $grammar_dir
    git fetch origin $ref --depth=1
-    git reset --hard origin/$ref;
+    git reset --hard FETCH_HEAD
  )
 }

-fetch_grammar javascript master
+fetch_grammar javascript included-range-boundaries
 fetch_grammar json       master
 fetch_grammar c          master
 fetch_grammar cpp        master
--- a/script/fetch-fixtures.cmd
+++ b/script/fetch-fixtures.cmd
@ -1,6 +1,6 @@
@echo off

-call:fetch_grammar javascript master
+call:fetch_grammar javascript included-range-boundaries
 call:fetch_grammar json       master
 call:fetch_grammar c          master
 call:fetch_grammar cpp        master
@ -22,6 +22,6 @@ SET grammar_branch=%~2
 )
 pushd %grammar_dir%
 git fetch origin %2 --depth=1
-git reset --hard origin/%grammar_branch%
+git reset --hard FETCH_HEAD
 popd
 EXIT /B 0
--- a/src/runtime/lexer.c
+++ b/src/runtime/lexer.c
@ -50,7 +50,7 @@ static void ts_lexer__get_lookahead(Lexer *self) {
  }
 }

-static void ts_lexer__advance(void *payload, bool skip) {
+static void ts_lexer__advance(TSLexer *payload, bool skip) {
  Lexer *self = (Lexer *)payload;
  if (self->chunk == empty_chunk)
    return;
@ -95,7 +95,7 @@ static void ts_lexer__advance(void *payload, bool skip) {
  ts_lexer__get_lookahead(self);
 }

-static void ts_lexer__mark_end(void *payload) {
+static void ts_lexer__mark_end(TSLexer *payload) {
  Lexer *self = (Lexer *)payload;
  TSRange *current_included_range = &self->included_ranges[self->current_included_range_index];
  if (self->current_included_range_index > 0 &&
@ -110,7 +110,7 @@ static void ts_lexer__mark_end(void *payload) {
  }
 }

-static uint32_t ts_lexer__get_column(void *payload) {
+static uint32_t ts_lexer__get_column(TSLexer *payload) {
  Lexer *self = (Lexer *)payload;
  uint32_t goal_byte = self->current_position.bytes;

@ -123,13 +123,19 @@ static uint32_t ts_lexer__get_column(void *payload) {

  uint32_t result = 0;
  while (self->current_position.bytes < goal_byte) {
-    ts_lexer__advance(self, false);
+    ts_lexer__advance(payload, false);
    result++;
  }

  return result;
 }

+// Reports whether the lexer's current byte offset is exactly the start byte
+// of the included range selected by `current_included_range_index`.
+// NOTE(review): the index is used without a bounds check here; confirm it can
+// never point past the last included range when this is called.
+static bool ts_lexer__is_at_included_range_start(TSLexer *payload) {
+  const Lexer *lexer = (const Lexer *)payload;
+  const TSRange *range = lexer->included_ranges + lexer->current_included_range_index;
+  return range->start_byte == lexer->current_position.bytes;
+}
+
 // The lexer's methods are stored as a struct field so that generated
 // parsers can call them without needing to be linked against this library.

@ -139,6 +145,7 @@ void ts_lexer_init(Lexer *self) {
      .advance = ts_lexer__advance,
      .mark_end = ts_lexer__mark_end,
      .get_column = ts_lexer__get_column,
+      .is_at_included_range_start = ts_lexer__is_at_included_range_start,
      .lookahead = 0,
      .result_symbol = 0,
    },
@ -227,7 +234,9 @@ void ts_lexer_start(Lexer *self) {
 }

 void ts_lexer_advance_to_end(Lexer *self) {
-  while (self->data.lookahead != 0) ts_lexer__advance(self, false);
+  while (self->data.lookahead != 0) {
+    ts_lexer__advance((TSLexer *)self, false);
+  }
 }

 static const TSRange DEFAULT_RANGES[] = {
--- a/src/runtime/parser.c
+++ b/src/runtime/parser.c
@ -327,7 +327,7 @@ static const Subtree *ts_parser__lex(TSParser *self, StackVersion version, TSSta
        valid_external_tokens
      )) {
        if (length_is_undefined(self->lexer.token_end_position)) {
-          self->lexer.token_end_position = self->lexer.current_position;
+          self->lexer.data.mark_end(&self->lexer.data);
        }

        if (!error_mode || self->lexer.token_end_position.bytes > current_position.bytes) {
@ -380,7 +380,7 @@ static const Subtree *ts_parser__lex(TSParser *self, StackVersion version, TSSta
        self->lexer.data.result_symbol = ts_builtin_sym_error;
        break;
      }
-      self->lexer.data.advance(&self->lexer, false);
+      self->lexer.data.advance(&self->lexer.data, false);
    }

    error_end_position = self->lexer.current_position;
--- a/test/helpers/point_helpers.cc
+++ b/test/helpers/point_helpers.cc
@ -1,6 +1,7 @@
 #include "./point_helpers.h"
 #include <string>
 #include <ostream>
+#include <cassert>
 #include "runtime/length.h"
 #include "tree_sitter/runtime.h"

@ -45,3 +46,29 @@ std::ostream &operator<<(std::ostream &stream, const TSRange &range) {
 ostream &operator<<(ostream &stream, const Length &length) {
  return stream << "{bytes:" << length.bytes << ", extent:" << length.extent << "}";
 }
+
+// Computes the row/column extent of the first `end_index` bytes of `text`.
+// An `end_index` beyond the end of the string is clamped to the string's
+// size. Every '\n' starts a new row and resets the column; every other byte
+// advances the column by one.
+TSPoint extent_for_string(const string &text, size_t end_index) {
+  const size_t limit = end_index < text.size() ? end_index : text.size();
+  TSPoint extent = {0, 0};
+  for (size_t offset = 0; offset != limit; ++offset) {
+    if (text[offset] != '\n') {
+      extent.column++;
+      continue;
+    }
+    extent.row++;
+    extent.column = 0;
+  }
+  return extent;
+}
+
+// Builds the TSRange (start/end points and start/end bytes) that covers the
+// first occurrence of `substring` inside `text`. The substring must be
+// present; this is enforced with an assert.
+TSRange range_for_substring(const string &text, const string &substring) {
+  size_t start = text.find(substring);
+  assert(start != string::npos);
+  size_t end = start + substring.size();
+  return TSRange {
+    extent_for_string(text, start),
+    extent_for_string(text, end),
+    static_cast<uint32_t>(start),
+    static_cast<uint32_t>(end),
+  };
+}
--- a/test/helpers/point_helpers.h
+++ b/test/helpers/point_helpers.h
@ -20,4 +20,8 @@ std::ostream &operator<<(std::ostream &stream, const TSRange &range);

 std::ostream &operator<<(std::ostream &stream, const Length &length);

+TSPoint extent_for_string(const std::string &text, size_t end_index = std::string::npos);
+
+TSRange range_for_substring(const std::string &text, const std::string &substring);
+
 #endif  // HELPERS_POINT_HELPERS_H_
--- a/test/helpers/spy_input.cc
+++ b/test/helpers/spy_input.cc
@ -1,5 +1,6 @@
 #include "helpers/spy_input.h"
 #include "helpers/encoding_helpers.h"
+#include "helpers/point_helpers.h"
 #include "runtime/point.h"
 #include <string.h>
 #include <algorithm>
@ -95,19 +96,6 @@ TSInput SpyInput::input() {
  return result;
 }

-static TSPoint get_extent(string text) {
-  TSPoint result = {0, 0};
-  for (auto i = text.begin(); i != text.end(); i++) {
-    if (*i == '\n') {
-      result.row++;
-      result.column = 0;
-    } else {
-      result.column++;
-    }
-  }
-  return result;
-}
-
 TSInputEdit SpyInput::replace(size_t start_byte, size_t bytes_removed, string text) {
  auto swap = swap_substr(start_byte, bytes_removed, text);
  size_t bytes_added = text.size();
@ -117,8 +105,8 @@ TSInputEdit SpyInput::replace(size_t start_byte, size_t bytes_removed, string te
  result.old_end_byte = start_byte + bytes_removed;
  result.new_end_byte = start_byte + bytes_added;
  result.start_point = swap.second;
-  result.old_end_point = result.start_point + get_extent(swap.first);
-  result.new_end_point = result.start_point + get_extent(text);
+  result.old_end_point = result.start_point + extent_for_string(swap.first);
+  result.new_end_point = result.start_point + extent_for_string(text);
  return result;
 }

@ -131,8 +119,8 @@ TSInputEdit SpyInput::undo() {
  result.old_end_byte = entry.start_byte + entry.bytes_removed;
  result.new_end_byte = entry.start_byte + entry.text_inserted.size();
  result.start_point = swap.second;
-  result.old_end_point = result.start_point + get_extent(swap.first);
-  result.new_end_point = result.start_point + get_extent(entry.text_inserted);
+  result.old_end_point = result.start_point + extent_for_string(swap.first);
+  result.new_end_point = result.start_point + extent_for_string(entry.text_inserted);
  return result;
 }

--- a/test/runtime/parser_test.cc
+++ b/test/runtime/parser_test.cc
@ -918,6 +918,29 @@ describe("Parser", [&]() {

      assert_root_node("(program (ERROR (identifier)))");
    });
+
+    it("allows external scanners to detect the boundaries of included ranges", [&]() {
+      // Only the two call expressions are handed to the parser; the text
+      // surrounding them lies outside every included range.
+      string template_text = "a <%= b() %> c <% d() %>";
+      TSRange code_ranges[] = {
+        range_for_substring(template_text, "b()"),
+        range_for_substring(template_text, "d()"),
+      };

+      ts_parser_set_included_ranges(parser, code_ranges, 2);
+      ts_parser_set_language(parser, load_real_language("javascript"));
+      tree = ts_parser_parse_string(parser, nullptr, template_text.c_str(), template_text.size());

+      // Each included range is expected to produce its own statement.
+      assert_root_node("(program "
+        "(expression_statement (call_expression (identifier) (arguments))) "
+        "(expression_statement (call_expression (identifier) (arguments))))");

+      // Each statement is expected to end where its included range ends.
+      TSNode root = ts_tree_root_node(tree);
+      TSNode first_statement = ts_node_child(root, 0);
+      TSNode second_statement = ts_node_child(root, 1);
+      AssertThat(ts_node_end_point(first_statement), Equals(extent_for_string("a <%= b()")));
+      AssertThat(ts_node_end_point(second_statement), Equals(extent_for_string("a <%= b() %> c <% d()")));
+    });
  });
 });

--- a/test/runtime/tree_test.cc
+++ b/test/runtime/tree_test.cc
@ -131,22 +131,13 @@ describe("Tree", [&]() {
      return result;
    };

-    auto range_for_text = [&](string start_text, string end_text) {
-      return TSRange {
-        point(0, input->content.find(start_text)),
-        point(0, input->content.find(end_text)),
-        static_cast<uint32_t>(input->content.find(start_text)),
-        static_cast<uint32_t>(input->content.find(end_text)),
-      };
-    };
-
    it("reports changes when one token has been updated", [&]() {
      // Replace `null` with `nothing`
      auto ranges = get_changed_ranges_for_edit([&]() {
-        return input->replace(input->content.find("ull"), 1, "othing");
+        return input->replace(input->content.find("ull"), 3, "othing");
      });
      AssertThat(ranges, Equals(vector<TSRange>({
-        range_for_text("nothing", "}"),
+        range_for_substring(input->content, "nothing"),
      })));

      // Replace `nothing` with `null` again
@ -154,7 +145,7 @@ describe("Tree", [&]() {
        return input->undo();
      });
      AssertThat(ranges, Equals(vector<TSRange>({
-        range_for_text("null", "}"),
+        range_for_substring(input->content, "null"),
      })));
    });

@ -195,7 +186,7 @@ describe("Tree", [&]() {
        return input->replace(input->content.find("}"), 0, ", b: false");
      });
      AssertThat(ranges, Equals(vector<TSRange>({
-        range_for_text(",", "}"),
+        range_for_substring(input->content, ", b: false"),
      })));

      // Add a third key-value pair in between the first two
@ -209,7 +200,7 @@ describe("Tree", [&]() {
          "(pair (property_identifier) (false)))))"
      );
      AssertThat(ranges, Equals(vector<TSRange>({
-        range_for_text(", c", ", b"),
+        range_for_substring(input->content, ", c: 1"),
      })));

      // Delete the middle pair.
@ -244,7 +235,7 @@ describe("Tree", [&]() {
          "(pair (property_identifier) (binary_expression (identifier) (null))))))"
      );
      AssertThat(ranges, Equals(vector<TSRange>({
-        range_for_text("b ===", "}"),
+        range_for_substring(input->content, "b === null"),
      })));
    });
  });