Allow lexer to accept tokens that ended at previous positions

* Track lookahead in each tree
* Add 'mark_end' API that external scanners can use
2017-03-13 17:03:47 -07:00 · 2017-03-13 17:03:47 -07:00 · d222dbb9fd
commit d222dbb9fd
parent 12d2a9d93f
12 changed files with 96 additions and 71 deletions
--- a/include/tree_sitter/parser.h
+++ b/include/tree_sitter/parser.h
@@ -26,6 +26,7 @@ typedef struct {

 typedef struct {
  void (*advance)(void *, bool);
+  void (*mark_end)(void *);
  int32_t lookahead;
  TSSymbol result_symbol;
 } TSLexer;
@@ -91,32 +92,32 @@ typedef struct TSLanguage {
 *  Lexer Macros
 */

-#define START_LEXER() \
-  int32_t lookahead;  \
-  next_state:         \
+#define START_LEXER()           \
+  bool result = false;          \
+  int32_t lookahead;            \
+  next_state:                   \
  lookahead = lexer->lookahead;

-#define ADVANCE(state_value)                   \
-  {                                            \
+#define ADVANCE(state_value)      \
+  {                               \
    lexer->advance(lexer, false); \
-    state = state_value;                       \
-    goto next_state;                           \
+    state = state_value;          \
+    goto next_state;              \
  }

-#define SKIP(state_value)                     \
-  {                                           \
+#define SKIP(state_value)        \
+  {                              \
    lexer->advance(lexer, true); \
-    state = state_value;                      \
-    goto next_state;                          \
+    state = state_value;         \
+    goto next_state;             \
  }

-#define ACCEPT_TOKEN(symbol_value)       \
-  {                                      \
-    lexer->result_symbol = symbol_value; \
-    return true;                         \
-  }
+#define ACCEPT_TOKEN(symbol_value)     \
+  result = true;                       \
+  lexer->result_symbol = symbol_value; \
+  lexer->mark_end(lexer);

-#define LEX_ERROR() return false
+#define END_STATE() return result;

 /*
 *  Parse Table Macros
--- a/include/tree_sitter/runtime.h
+++ b/include/tree_sitter/runtime.h
@@ -9,7 +9,7 @@ extern "C" {
 #include <stdint.h>
 #include <stdbool.h>

-#define TREE_SITTER_LANGUAGE_VERSION 1
+#define TREE_SITTER_LANGUAGE_VERSION 2

 typedef unsigned short TSSymbol;
 typedef struct TSLanguage TSLanguage;
--- a/src/compiler/generate_code/c_code.cc
+++ b/src/compiler/generate_code/c_code.cc
@@ -217,9 +217,10 @@ class CCodeGenerator {
      line("START_LEXER();");
      _switch("state", [&]() {
        size_t i = 0;
-        for (const LexState &state : lex_table.states)
+        for (const LexState &state : lex_table.states) {
          _case(to_string(i++), [&]() { add_lex_state(state); });
-        _default([&]() { line("LEX_ERROR();"); });
+        }
+        _default([&]() { line("return false;"); });
      });
    });
    line("}");
@@ -396,18 +397,18 @@ class CCodeGenerator {
  }

  void add_lex_state(const LexState &lex_state) {
-    if (lex_state.is_token_start)
-      line("START_TOKEN();");
+    if (lex_state.accept_action.is_present()) {
+      add_accept_token_action(lex_state.accept_action);
+    }

-    for (const auto &pair : lex_state.advance_actions)
-      if (!pair.first.is_empty())
+    for (const auto &pair : lex_state.advance_actions) {
+      if (!pair.first.is_empty()) {
        _if([&]() { add_character_set_condition(pair.first); },
            [&]() { add_advance_action(pair.second); });
+      }
+    }

-    if (lex_state.accept_action.is_present())
-      add_accept_token_action(lex_state.accept_action);
-    else
-      line("LEX_ERROR();");
+    line("END_STATE();");
  }

  void add_character_set_condition(const rules::CharacterSet &rule) {
@@ -428,8 +429,7 @@ class CCodeGenerator {
      for (const auto &range : ranges) {
        if (!first) {
          add(" ||");
-          line();
-          add_padding();
+          line("  ");
        }

        add("(");
@@ -442,20 +442,20 @@ class CCodeGenerator {
  }

  void add_character_range_condition(const rules::CharacterRange &range) {
-    string lookahead("lookahead");
    if (range.min == range.max) {
-      add(lookahead + " == " + escape_char(range.min));
+      add("lookahead == " + escape_char(range.min));
    } else {
-      add(escape_char(range.min) + string(" <= ") + lookahead + " && " +
-          lookahead + " <= " + escape_char(range.max));
+      add(escape_char(range.min) + string(" <= lookahead && lookahead <= ") +
+          escape_char(range.max));
    }
  }

  void add_advance_action(const AdvanceAction &action) {
-    if (action.in_main_token)
+    if (action.in_main_token) {
      line("ADVANCE(" + to_string(action.state_index) + ");");
-    else
+    } else {
      line("SKIP(" + to_string(action.state_index) + ");");
+    }
  }

  void add_accept_token_action(const AcceptTokenAction &action) {
@@ -669,7 +669,7 @@ class CCodeGenerator {

  void add_padding() {
    for (size_t i = 0; i < indent_level; i++)
-      add("    ");
+      add("  ");
  }

  void indent(function<void()> body) {
--- a/src/compiler/lex_table.cc
+++ b/src/compiler/lex_table.cc
@@ -42,12 +42,9 @@ bool AcceptTokenAction::operator==(const AcceptTokenAction &other) const {
         (is_string == other.is_string);
 }

-LexState::LexState() : is_token_start(false) {}
-
 bool LexState::operator==(const LexState &other) const {
  return advance_actions == other.advance_actions &&
-         accept_action == other.accept_action &&
-         is_token_start == other.is_token_start;
+         accept_action == other.accept_action;
 }

 }  // namespace tree_sitter
--- a/src/compiler/lex_table.h
+++ b/src/compiler/lex_table.h
@@ -35,12 +35,10 @@ struct AcceptTokenAction {
 };

 struct LexState {
-  LexState();
  bool operator==(const LexState &) const;

  std::map<rules::CharacterSet, AdvanceAction> advance_actions;
  AcceptTokenAction accept_action;
-  bool is_token_start;
 };

 struct LexTable {
--- a/src/runtime/lexer.c
+++ b/src/runtime/lexer.c
@@ -16,6 +16,8 @@

 static const char empty_chunk[2] = { 0, 0 };

+static Length unknown_length = {UINT32_MAX, 0, {0, 0}};
+
 static void ts_lexer__get_chunk(Lexer *self) {
  TSInput input = self->input;
  if (!self->chunk ||
@@ -70,6 +72,11 @@ static void ts_lexer__advance(void *payload, bool skip) {
  ts_lexer__get_lookahead(self);
 }

+static void ts_lexer__mark_end(void *payload) {
+  Lexer *self = (Lexer *)payload;
+  self->token_end_position = self->current_position;
+}
+
 /*
 *  The lexer's advance method is stored as a struct field so that generated
 *  parsers can call it without needing to be linked against this library.
@@ -79,6 +86,7 @@ void ts_lexer_init(Lexer *self) {
  *self = (Lexer){
    .data = {
      .advance = ts_lexer__advance,
+      .mark_end = ts_lexer__mark_end,
      .lookahead = 0,
      .result_symbol = 0,
    },
@@ -95,6 +103,7 @@ void ts_lexer_init(Lexer *self) {

 static inline void ts_lexer__reset(Lexer *self, Length position) {
  self->token_start_position = position;
+  self->token_end_position = unknown_length;
  self->current_position = position;

  if (self->chunk && (position.bytes < self->chunk_start ||
@@ -122,6 +131,7 @@ void ts_lexer_reset(Lexer *self, Length position) {

 void ts_lexer_start(Lexer *self) {
  self->token_start_position = self->current_position;
+  self->token_end_position = unknown_length;
  self->data.result_symbol = 0;

  if (!self->chunk)
--- a/src/runtime/lexer.h
+++ b/src/runtime/lexer.h
@@ -15,11 +15,11 @@ typedef struct {
  TSLexer data;
  Length current_position;
  Length token_start_position;
+  Length token_end_position;

  const char *chunk;
  uint32_t chunk_start;
  uint32_t chunk_size;
-
  uint32_t lookahead_size;

  TSInput input;
--- a/src/runtime/parser.c
+++ b/src/runtime/parser.c
@@ -279,7 +279,6 @@ static Tree *parser__lex(Parser *self, StackVersion version) {
  if (skipped_error) {
    Length padding = length_sub(error_start_position, start_position);
    Length size = length_sub(error_end_position, error_start_position);
-    ts_lexer_reset(&self->lexer, error_end_position);
    result = ts_tree_make_error(size, padding, first_error_character);
  } else {
    TSSymbol symbol = self->lexer.data.result_symbol;
@@ -287,8 +286,11 @@ static Tree *parser__lex(Parser *self, StackVersion version) {
      symbol = self->language->external_scanner.symbol_map[symbol];
    }

+    if (length_has_unknown_chars(self->lexer.token_end_position)) {
+      self->lexer.token_end_position = self->lexer.current_position;
+    }
    Length padding = length_sub(self->lexer.token_start_position, start_position);
-    Length size = length_sub(self->lexer.current_position, self->lexer.token_start_position);
+    Length size = length_sub(self->lexer.token_end_position, self->lexer.token_start_position);
    TSSymbolMetadata metadata = ts_language_symbol_metadata(self->language, symbol);
    result = ts_tree_make_leaf(symbol, padding, size, metadata);

@@ -301,6 +303,7 @@ static Tree *parser__lex(Parser *self, StackVersion version) {
    }
  }

+  result->bytes_scanned = self->lexer.current_position.bytes - start_position.bytes + 1;
  result->parse_state = parse_state;
  result->first_leaf.lex_mode = lex_mode;

--- a/src/runtime/tree.c
+++ b/src/runtime/tree.c
@@ -155,7 +155,10 @@ void ts_tree_set_children(Tree *self, uint32_t child_count, Tree **children) {
    if (i == 0) {
      self->padding = child->padding;
      self->size = child->size;
+      self->bytes_scanned = child->bytes_scanned;
    } else {
+      uint32_t bytes_scanned = ts_tree_total_bytes(self) + child->bytes_scanned;
+      if (bytes_scanned > self->bytes_scanned) self->bytes_scanned = bytes_scanned;
      self->size = length_add(self->size, ts_tree_total_size(child));
    }

@@ -344,6 +347,21 @@ static inline long min(long a, long b) {
  return a <= b ? a : b;
 }

+bool ts_tree_invalidate_lookahead(Tree *self, uint32_t edit_byte_offset) {
+  if (edit_byte_offset >= self->bytes_scanned) return false;
+  self->has_changes = true;
+  if (self->child_count > 0) {
+    uint32_t child_start_byte = 0;
+    for (uint32_t i = 0; i < self->child_count; i++) {
+      Tree *child = self->children[i];
+      if (child_start_byte > edit_byte_offset) break;
+      ts_tree_invalidate_lookahead(child, edit_byte_offset - child_start_byte);
+      child_start_byte += ts_tree_total_bytes(child);
+    }
+  }
+  return true;
+}
+

 void ts_tree_edit(Tree *self, const TSInputEdit *edit) {
  uint32_t old_end_byte = edit->start_byte + edit->bytes_removed;
@@ -390,29 +408,27 @@ void ts_tree_edit(Tree *self, const TSInputEdit *edit) {
  for (uint32_t i = 0; i < self->child_count; i++) {
    Tree *child = self->children[i];
    child_left = child_right;
+    child_right = length_add(child_left, ts_tree_total_size(child));

-    if (!found_first_child) {
-      child_right = length_add(child_left, ts_tree_total_size(child));
-      if (child_right.bytes >= edit->start_byte) {
-        found_first_child = true;
-        TSInputEdit child_edit = {
-          .start_byte = edit->start_byte - child_left.bytes,
-          .bytes_added = edit->bytes_added,
-          .bytes_removed = edit->bytes_removed,
-          .start_point = point_sub(edit->start_point, child_left.extent),
-          .extent_added = edit->extent_added,
-          .extent_removed = edit->extent_removed,
-        };
+    if (!found_first_child && child_right.bytes >= edit->start_byte) {
+      found_first_child = true;
+      TSInputEdit child_edit = {
+        .start_byte = edit->start_byte - child_left.bytes,
+        .bytes_added = edit->bytes_added,
+        .bytes_removed = edit->bytes_removed,
+        .start_point = point_sub(edit->start_point, child_left.extent),
+        .extent_added = edit->extent_added,
+        .extent_removed = edit->extent_removed,
+      };

-        if (old_end_byte > child_right.bytes) {
-          child_edit.bytes_removed = child_right.bytes - edit->start_byte;
-          child_edit.extent_removed = point_sub(child_right.extent, edit->start_point);
-          remaining_bytes_to_delete = old_end_byte - child_right.bytes;
-          remaining_extent_to_delete = point_sub(old_end_point, child_right.extent);
-        }
-
-        ts_tree_edit(child, &child_edit);
+      if (old_end_byte > child_right.bytes) {
+        child_edit.bytes_removed = child_right.bytes - edit->start_byte;
+        child_edit.extent_removed = point_sub(child_right.extent, edit->start_point);
+        remaining_bytes_to_delete = old_end_byte - child_right.bytes;
+        remaining_extent_to_delete = point_sub(old_end_point, child_right.extent);
      }
+
+      ts_tree_edit(child, &child_edit);
    } else if (remaining_bytes_to_delete > 0) {
      TSInputEdit child_edit = {
        .start_byte = 0,
@@ -425,6 +441,8 @@ void ts_tree_edit(Tree *self, const TSInputEdit *edit) {
      remaining_bytes_to_delete -= child_edit.bytes_removed;
      remaining_extent_to_delete = point_sub(remaining_extent_to_delete, child_edit.extent_removed);
      ts_tree_edit(child, &child_edit);
+    } else {
+      ts_tree_invalidate_lookahead(child, edit->start_byte - child_left.bytes);
    }

    child_right = length_add(child_left, ts_tree_total_size(child));
--- a/src/runtime/tree.h
+++ b/src/runtime/tree.h
@@ -34,6 +34,7 @@ typedef struct Tree {

  Length padding;
  Length size;
+  uint32_t bytes_scanned;

  TSSymbol symbol;
  TSStateId parse_state;
--- a/test/compiler/build_tables/lex_conflict_manager_test.cc
+++ b/test/compiler/build_tables/lex_conflict_manager_test.cc
@@ -69,18 +69,15 @@ describe("LexConflictManager::resolve(new_action, old_action)", []() {
  describe("advance/accept-token conflicts", [&]() {
    describe("when the token to accept has higher precedence", [&]() {
      it("prefers the accept-token action", [&]() {
-        AssertThat(conflict_manager.possible_extensions, IsEmpty());
        update = conflict_manager.resolve(item_set, AdvanceAction(1, { 1, 2 }, true), AcceptTokenAction(sym3, 3, true));
        AssertThat(update, IsFalse());
-        AssertThat(conflict_manager.possible_extensions, IsEmpty());
      });
    });

    describe("when the token to accept does not have a higher precedence", [&]() {
-      it("favors the advance action and adds the in-progress tokens as possible extensions of the discarded token", [&]() {
+      it("favors the advance action", [&]() {
        update = conflict_manager.resolve(item_set, AdvanceAction(1, { 1, 2 }, true), AcceptTokenAction(sym3, 2, true));
        AssertThat(update, IsTrue());
-        AssertThat(conflict_manager.possible_extensions[sym3.index], Contains(sym4.index));
      });
    });
  });
--- a/test/runtime/parser_test.cc
+++ b/test/runtime/parser_test.cc
@@ -164,7 +164,7 @@ describe("Parser", [&]() {
    describe("when there is an unterminated error", [&]() {
      it("maintains a consistent tree", [&]() {
        ts_document_set_language(document, load_real_language("javascript"));
-        set_text("a; /* b");
+        set_text("a; ' this string never ends");
        assert_root_node(
          "(ERROR (program (expression_statement (identifier))) (UNEXPECTED EOF))");
      });