diff --git a/include/tree_sitter/parser.h b/include/tree_sitter/parser.h index 197015f4..6e68a9f4 100644 --- a/include/tree_sitter/parser.h +++ b/include/tree_sitter/parser.h @@ -26,6 +26,7 @@ typedef struct { typedef struct { void (*advance)(void *, bool); + void (*mark_end)(void *); int32_t lookahead; TSSymbol result_symbol; } TSLexer; @@ -91,32 +92,32 @@ typedef struct TSLanguage { * Lexer Macros */ -#define START_LEXER() \ - int32_t lookahead; \ - next_state: \ +#define START_LEXER() \ + bool result = false; \ + int32_t lookahead; \ + next_state: \ lookahead = lexer->lookahead; -#define ADVANCE(state_value) \ - { \ +#define ADVANCE(state_value) \ + { \ lexer->advance(lexer, false); \ - state = state_value; \ - goto next_state; \ + state = state_value; \ + goto next_state; \ } -#define SKIP(state_value) \ - { \ +#define SKIP(state_value) \ + { \ lexer->advance(lexer, true); \ - state = state_value; \ - goto next_state; \ + state = state_value; \ + goto next_state; \ } -#define ACCEPT_TOKEN(symbol_value) \ - { \ - lexer->result_symbol = symbol_value; \ - return true; \ - } +#define ACCEPT_TOKEN(symbol_value) \ + result = true; \ + lexer->result_symbol = symbol_value; \ + lexer->mark_end(lexer); -#define LEX_ERROR() return false +#define END_STATE() return result; /* * Parse Table Macros diff --git a/include/tree_sitter/runtime.h b/include/tree_sitter/runtime.h index 69334ad0..cc551ebb 100644 --- a/include/tree_sitter/runtime.h +++ b/include/tree_sitter/runtime.h @@ -9,7 +9,7 @@ extern "C" { #include #include -#define TREE_SITTER_LANGUAGE_VERSION 1 +#define TREE_SITTER_LANGUAGE_VERSION 2 typedef unsigned short TSSymbol; typedef struct TSLanguage TSLanguage; diff --git a/src/compiler/generate_code/c_code.cc b/src/compiler/generate_code/c_code.cc index 111340c1..d592966a 100644 --- a/src/compiler/generate_code/c_code.cc +++ b/src/compiler/generate_code/c_code.cc @@ -217,9 +217,10 @@ class CCodeGenerator { line("START_LEXER();"); _switch("state", [&]() { size_t i = 0; - for (const LexState &state : lex_table.states) + for (const LexState &state : lex_table.states) { _case(to_string(i++), [&]() { add_lex_state(state); }); - _default([&]() { line("LEX_ERROR();"); }); + } + _default([&]() { line("return false;"); }); }); }); line("}"); @@ -396,18 +397,18 @@ class CCodeGenerator { } void add_lex_state(const LexState &lex_state) { - if (lex_state.is_token_start) - line("START_TOKEN();"); + if (lex_state.accept_action.is_present()) { + add_accept_token_action(lex_state.accept_action); + } - for (const auto &pair : lex_state.advance_actions) - if (!pair.first.is_empty()) + for (const auto &pair : lex_state.advance_actions) { + if (!pair.first.is_empty()) { _if([&]() { add_character_set_condition(pair.first); }, [&]() { add_advance_action(pair.second); }); + } + } - if (lex_state.accept_action.is_present()) - add_accept_token_action(lex_state.accept_action); - else - line("LEX_ERROR();"); + line("END_STATE();"); } void add_character_set_condition(const rules::CharacterSet &rule) { @@ -428,8 +429,7 @@ class CCodeGenerator { for (const auto &range : ranges) { if (!first) { add(" ||"); - line(); - add_padding(); + line(" "); } add("("); @@ -442,20 +442,20 @@ class CCodeGenerator { } void add_character_range_condition(const rules::CharacterRange &range) { - string lookahead("lookahead"); if (range.min == range.max) { - add(lookahead + " == " + escape_char(range.min)); + add("lookahead == " + escape_char(range.min)); } else { - add(escape_char(range.min) + string(" <= ") + lookahead + " && " + - lookahead + " <= " + escape_char(range.max)); + add(escape_char(range.min) + string(" <= lookahead && lookahead <= ") + + escape_char(range.max)); } } void add_advance_action(const AdvanceAction &action) { - if (action.in_main_token) + if (action.in_main_token) { line("ADVANCE(" + to_string(action.state_index) + ");"); - else + } else { line("SKIP(" + to_string(action.state_index) + ");"); + } } void add_accept_token_action(const AcceptTokenAction &action) { @@ -669,7 +669,7 @@ class CCodeGenerator { void add_padding() { for (size_t i = 0; i < indent_level; i++) - add(" "); + add(" "); } void indent(function body) { diff --git a/src/compiler/lex_table.cc b/src/compiler/lex_table.cc index 74af0900..ccca250d 100644 --- a/src/compiler/lex_table.cc +++ b/src/compiler/lex_table.cc @@ -42,12 +42,9 @@ bool AcceptTokenAction::operator==(const AcceptTokenAction &other) const { (is_string == other.is_string); } -LexState::LexState() : is_token_start(false) {} - bool LexState::operator==(const LexState &other) const { return advance_actions == other.advance_actions && - accept_action == other.accept_action && - is_token_start == other.is_token_start; + accept_action == other.accept_action; } } // namespace tree_sitter diff --git a/src/compiler/lex_table.h b/src/compiler/lex_table.h index e669739e..7b87079a 100644 --- a/src/compiler/lex_table.h +++ b/src/compiler/lex_table.h @@ -35,12 +35,10 @@ struct AcceptTokenAction { }; struct LexState { - LexState(); bool operator==(const LexState &) const; std::map advance_actions; AcceptTokenAction accept_action; - bool is_token_start; }; struct LexTable { diff --git a/src/runtime/lexer.c b/src/runtime/lexer.c index 902c2d3b..123a29fd 100644 --- a/src/runtime/lexer.c +++ b/src/runtime/lexer.c @@ -16,6 +16,8 @@ static const char empty_chunk[2] = { 0, 0 }; +static Length unknown_length = {UINT32_MAX, 0, {0, 0}}; + static void ts_lexer__get_chunk(Lexer *self) { TSInput input = self->input; if (!self->chunk || @@ -70,6 +72,11 @@ static void ts_lexer__advance(void *payload, bool skip) { ts_lexer__get_lookahead(self); } +static void ts_lexer__mark_end(void *payload) { + Lexer *self = (Lexer *)payload; + self->token_end_position = self->current_position; +} + /* * The lexer's advance method is stored as a struct field so that generated * parsers can call it without needing to be linked against this library. @@ -79,6 +86,7 @@ void ts_lexer_init(Lexer *self) { *self = (Lexer){ .data = { .advance = ts_lexer__advance, + .mark_end = ts_lexer__mark_end, .lookahead = 0, .result_symbol = 0, }, @@ -95,6 +103,7 @@ void ts_lexer_init(Lexer *self) { static inline void ts_lexer__reset(Lexer *self, Length position) { self->token_start_position = position; + self->token_end_position = unknown_length; self->current_position = position; if (self->chunk && (position.bytes < self->chunk_start || @@ -122,6 +131,7 @@ void ts_lexer_reset(Lexer *self, Length position) { void ts_lexer_start(Lexer *self) { self->token_start_position = self->current_position; + self->token_end_position = unknown_length; self->data.result_symbol = 0; if (!self->chunk) diff --git a/src/runtime/lexer.h b/src/runtime/lexer.h index 67470f6f..0cf6c252 100644 --- a/src/runtime/lexer.h +++ b/src/runtime/lexer.h @@ -15,11 +15,11 @@ typedef struct { TSLexer data; Length current_position; Length token_start_position; + Length token_end_position; const char *chunk; uint32_t chunk_start; uint32_t chunk_size; - uint32_t lookahead_size; TSInput input; diff --git a/src/runtime/parser.c b/src/runtime/parser.c index 09fa6b13..af65c7ea 100644 --- a/src/runtime/parser.c +++ b/src/runtime/parser.c @@ -279,7 +279,6 @@ static Tree *parser__lex(Parser *self, StackVersion version) { if (skipped_error) { Length padding = length_sub(error_start_position, start_position); Length size = length_sub(error_end_position, error_start_position); - ts_lexer_reset(&self->lexer, error_end_position); result = ts_tree_make_error(size, padding, first_error_character); } else { TSSymbol symbol = self->lexer.data.result_symbol; @@ -287,8 +286,11 @@ static Tree *parser__lex(Parser *self, StackVersion version) { symbol = self->language->external_scanner.symbol_map[symbol]; } + if (length_has_unknown_chars(self->lexer.token_end_position)) { + self->lexer.token_end_position = self->lexer.current_position; + } Length padding = length_sub(self->lexer.token_start_position, start_position); - Length size = length_sub(self->lexer.current_position, self->lexer.token_start_position); + Length size = length_sub(self->lexer.token_end_position, self->lexer.token_start_position); TSSymbolMetadata metadata = ts_language_symbol_metadata(self->language, symbol); result = ts_tree_make_leaf(symbol, padding, size, metadata); @@ -301,6 +303,7 @@ static Tree *parser__lex(Parser *self, StackVersion version) { } } + result->bytes_scanned = self->lexer.current_position.bytes - start_position.bytes + 1; result->parse_state = parse_state; result->first_leaf.lex_mode = lex_mode; diff --git a/src/runtime/tree.c b/src/runtime/tree.c index ac9f42f4..49f81e9c 100644 --- a/src/runtime/tree.c +++ b/src/runtime/tree.c @@ -155,7 +155,10 @@ void ts_tree_set_children(Tree *self, uint32_t child_count, Tree **children) { if (i == 0) { self->padding = child->padding; self->size = child->size; + self->bytes_scanned = child->bytes_scanned; } else { + uint32_t bytes_scanned = ts_tree_total_bytes(self) + child->bytes_scanned; + if (bytes_scanned > self->bytes_scanned) self->bytes_scanned = bytes_scanned; self->size = length_add(self->size, ts_tree_total_size(child)); } @@ -344,6 +347,21 @@ static inline long min(long a, long b) { return a <= b ? a : b; } +bool ts_tree_invalidate_lookahead(Tree *self, uint32_t edit_byte_offset) { + if (edit_byte_offset >= self->bytes_scanned) return false; + self->has_changes = true; + if (self->child_count > 0) { + uint32_t child_start_byte = 0; + for (uint32_t i = 0; i < self->child_count; i++) { + Tree *child = self->children[i]; + if (child_start_byte > edit_byte_offset) break; + ts_tree_invalidate_lookahead(child, edit_byte_offset - child_start_byte); + child_start_byte += ts_tree_total_bytes(child); + } + } + return true; +} + void ts_tree_edit(Tree *self, const TSInputEdit *edit) { uint32_t old_end_byte = edit->start_byte + edit->bytes_removed; @@ -390,29 +408,27 @@ void ts_tree_edit(Tree *self, const TSInputEdit *edit) { for (uint32_t i = 0; i < self->child_count; i++) { Tree *child = self->children[i]; child_left = child_right; + child_right = length_add(child_left, ts_tree_total_size(child)); - if (!found_first_child) { - child_right = length_add(child_left, ts_tree_total_size(child)); - if (child_right.bytes >= edit->start_byte) { - found_first_child = true; - TSInputEdit child_edit = { - .start_byte = edit->start_byte - child_left.bytes, - .bytes_added = edit->bytes_added, - .bytes_removed = edit->bytes_removed, - .start_point = point_sub(edit->start_point, child_left.extent), - .extent_added = edit->extent_added, - .extent_removed = edit->extent_removed, - }; + if (!found_first_child && child_right.bytes >= edit->start_byte) { + found_first_child = true; + TSInputEdit child_edit = { + .start_byte = edit->start_byte - child_left.bytes, + .bytes_added = edit->bytes_added, + .bytes_removed = edit->bytes_removed, + .start_point = point_sub(edit->start_point, child_left.extent), + .extent_added = edit->extent_added, + .extent_removed = edit->extent_removed, + }; - if (old_end_byte > child_right.bytes) { - child_edit.bytes_removed = child_right.bytes - edit->start_byte; - child_edit.extent_removed = point_sub(child_right.extent, edit->start_point); - remaining_bytes_to_delete = old_end_byte - child_right.bytes; - remaining_extent_to_delete = point_sub(old_end_point, child_right.extent); - } - - ts_tree_edit(child, &child_edit); + if (old_end_byte > child_right.bytes) { + child_edit.bytes_removed = child_right.bytes - edit->start_byte; + child_edit.extent_removed = point_sub(child_right.extent, edit->start_point); + remaining_bytes_to_delete = old_end_byte - child_right.bytes; + remaining_extent_to_delete = point_sub(old_end_point, child_right.extent); } + + ts_tree_edit(child, &child_edit); } else if (remaining_bytes_to_delete > 0) { TSInputEdit child_edit = { .start_byte = 0, @@ -425,6 +441,8 @@ void ts_tree_edit(Tree *self, const TSInputEdit *edit) { remaining_bytes_to_delete -= child_edit.bytes_removed; remaining_extent_to_delete = point_sub(remaining_extent_to_delete, child_edit.extent_removed); ts_tree_edit(child, &child_edit); + } else { + ts_tree_invalidate_lookahead(child, edit->start_byte - child_left.bytes); } child_right = length_add(child_left, ts_tree_total_size(child)); diff --git a/src/runtime/tree.h b/src/runtime/tree.h index de88a913..c08ba24b 100644 --- a/src/runtime/tree.h +++ b/src/runtime/tree.h @@ -34,6 +34,7 @@ typedef struct Tree { Length padding; Length size; + uint32_t bytes_scanned; TSSymbol symbol; TSStateId parse_state; diff --git a/test/compiler/build_tables/lex_conflict_manager_test.cc b/test/compiler/build_tables/lex_conflict_manager_test.cc index e360d78a..ca05a32c 100644 --- a/test/compiler/build_tables/lex_conflict_manager_test.cc +++ b/test/compiler/build_tables/lex_conflict_manager_test.cc @@ -69,18 +69,15 @@ describe("LexConflictManager::resolve(new_action, old_action)", []() { describe("advance/accept-token conflicts", [&]() { describe("when the token to accept has higher precedence", [&]() { it("prefers the accept-token action", [&]() { - AssertThat(conflict_manager.possible_extensions, IsEmpty()); update = conflict_manager.resolve(item_set, AdvanceAction(1, { 1, 2 }, true), AcceptTokenAction(sym3, 3, true)); AssertThat(update, IsFalse()); - AssertThat(conflict_manager.possible_extensions, IsEmpty()); }); }); describe("when the token to accept does not have a higher precedence", [&]() { - it("favors the advance action and adds the in-progress tokens as possible extensions of the discarded token", [&]() { + it("favors the advance action", [&]() { update = conflict_manager.resolve(item_set, AdvanceAction(1, { 1, 2 }, true), AcceptTokenAction(sym3, 2, true)); AssertThat(update, IsTrue()); - AssertThat(conflict_manager.possible_extensions[sym3.index], Contains(sym4.index)); }); }); }); diff --git a/test/runtime/parser_test.cc b/test/runtime/parser_test.cc index 0052c4ab..7dfcf26b 100644 --- a/test/runtime/parser_test.cc +++ b/test/runtime/parser_test.cc @@ -164,7 +164,7 @@ describe("Parser", [&]() { describe("when there is an unterminated error", [&]() { it("maintains a consistent tree", [&]() { ts_document_set_language(document, load_real_language("javascript")); - set_text("a; /* b"); + set_text("a; ' this string never ends"); assert_root_node( "(ERROR (program (expression_statement (identifier))) (UNEXPECTED EOF))"); });