From 0e3d9c2c58c689d25f3b027dc0a026aba8bd5e34 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 7 Nov 2018 12:56:44 -0800 Subject: [PATCH] Handle changes in included ranges when parsing incrementally --- src/runtime/get_changed_ranges.c | 116 +++++++++++++++++++--- src/runtime/get_changed_ranges.h | 25 ++++- src/runtime/length.h | 1 + src/runtime/parser.c | 61 +++++++++++- src/runtime/point.h | 2 + src/runtime/tree.c | 49 +++++++++- src/runtime/tree.h | 4 +- test/runtime/parser_test.cc | 159 +++++++++++++++++++++++++++++++ 8 files changed, 396 insertions(+), 21 deletions(-) diff --git a/src/runtime/get_changed_ranges.c b/src/runtime/get_changed_ranges.c index cb531b2d..578f0059 100644 --- a/src/runtime/get_changed_ranges.c +++ b/src/runtime/get_changed_ranges.c @@ -7,11 +7,9 @@ // #define DEBUG_GET_CHANGED_RANGES -typedef Array(TSRange) RangeArray; - -static void range_array_add(RangeArray *results, Length start, Length end) { - if (results->size > 0) { - TSRange *last_range = array_back(results); +static void ts_range_array_add(TSRangeArray *self, Length start, Length end) { + if (self->size > 0) { + TSRange *last_range = array_back(self); if (start.bytes <= last_range->end_byte) { last_range->end_byte = end.bytes; last_range->end_point = end.extent; @@ -21,7 +19,79 @@ static void range_array_add(RangeArray *results, Length start, Length end) { if (start.bytes < end.bytes) { TSRange range = { start.extent, end.extent, start.bytes, end.bytes }; - array_push(results, range); + array_push(self, range); + } +} + +bool ts_range_array_intersects(const TSRangeArray *self, unsigned start_index, + uint32_t start_byte, uint32_t end_byte) { + for (unsigned i = start_index; i < self->size; i++) { + TSRange *range = &self->contents[i]; + if (range->end_byte > start_byte) { + if (range->start_byte >= end_byte) break; + return true; + } + } + return false; +} + +void ts_range_array_get_changed_ranges( + const TSRange *old_ranges, unsigned old_range_count, + const TSRange *new_ranges, unsigned new_range_count, + TSRangeArray *differences +) { + unsigned new_index = 0; + unsigned old_index = 0; + Length current_position = length_zero(); + bool in_old_range = false; + bool in_new_range = false; + + while (old_index < old_range_count || new_index < new_range_count) { + const TSRange *old_range = &old_ranges[old_index]; + const TSRange *new_range = &new_ranges[new_index]; + + Length next_old_position; + if (in_old_range) { + next_old_position = (Length) {old_range->end_byte, old_range->end_point}; + } else if (old_index < old_range_count) { + next_old_position = (Length) {old_range->start_byte, old_range->start_point}; + } else { + next_old_position = LENGTH_MAX; + } + + Length next_new_position; + if (in_new_range) { + next_new_position = (Length) {new_range->end_byte, new_range->end_point}; + } else if (new_index < new_range_count) { + next_new_position = (Length) {new_range->start_byte, new_range->start_point}; + } else { + next_new_position = LENGTH_MAX; + } + + if (next_old_position.bytes < next_new_position.bytes) { + if (in_old_range != in_new_range) { + ts_range_array_add(differences, current_position, next_old_position); + } + if (in_old_range) old_index++; + current_position = next_old_position; + in_old_range = !in_old_range; + } else if (next_new_position.bytes < next_old_position.bytes) { + if (in_old_range != in_new_range) { + ts_range_array_add(differences, current_position, next_new_position); + } + if (in_new_range) new_index++; + current_position = next_new_position; + in_new_range = !in_new_range; + } else { + if (in_old_range != in_new_range) { + ts_range_array_add(differences, current_position, next_new_position); + } + if (in_old_range) old_index++; + if (in_new_range) new_index++; + in_old_range = !in_old_range; + in_new_range = !in_new_range; + current_position = next_new_position; + } } } @@ -267,19 +337,23 @@ static inline void iterator_print_state(Iterator *self) { unsigned ts_subtree_get_changed_ranges(const Subtree *old_tree, const Subtree *new_tree, TreeCursor *cursor1, TreeCursor *cursor2, - const TSLanguage *language, TSRange **ranges) { - RangeArray results = array_new(); + const TSLanguage *language, + const TSRangeArray *included_range_differences, + TSRange **ranges) { + TSRangeArray results = array_new(); Iterator old_iter = iterator_new(cursor1, old_tree, language); Iterator new_iter = iterator_new(cursor2, new_tree, language); + unsigned included_range_difference_index = 0; + Length position = iterator_start_position(&old_iter); Length next_position = iterator_start_position(&new_iter); if (position.bytes < next_position.bytes) { - range_array_add(&results, position, next_position); + ts_range_array_add(&results, position, next_position); position = next_position; } else if (position.bytes > next_position.bytes) { - range_array_add(&results, next_position, position); + ts_range_array_add(&results, next_position, position); next_position = position; } @@ -296,7 +370,16 @@ unsigned ts_subtree_get_changed_ranges(const Subtree *old_tree, const Subtree *n switch (iterator_compare(&old_iter, &new_iter)) { case IteratorMatches: next_position = iterator_end_position(&old_iter); - break; + if (ts_range_array_intersects( + included_range_differences, + included_range_difference_index, + position.bytes, next_position.bytes + )) { + next_position = position; + // fall through + } else { + break; + } case IteratorMayDiffer: if (iterator_descend(&old_iter, position.bytes)) { @@ -347,10 +430,19 @@ unsigned ts_subtree_get_changed_ranges(const Subtree *old_tree, const Subtree *n ); #endif - range_array_add(&results, position, next_position); + ts_range_array_add(&results, position, next_position); } position = next_position; + + while (included_range_difference_index < included_range_differences->size) { + const TSRange *range = &included_range_differences->contents[included_range_difference_index]; + if (range->end_byte <= position.bytes) { + included_range_difference_index++; + } else { + break; + } + } } while (!iterator_done(&old_iter) && !iterator_done(&new_iter)); *cursor1 = old_iter.cursor; diff --git a/src/runtime/get_changed_ranges.h b/src/runtime/get_changed_ranges.h index 9daeb919..e7fcead1 100644 --- a/src/runtime/get_changed_ranges.h +++ b/src/runtime/get_changed_ranges.h @@ -1,13 +1,36 @@ #ifndef RUNTIME_GET_CHANGED_RANGES_H_ #define RUNTIME_GET_CHANGED_RANGES_H_ +#ifdef __cplusplus +extern "C" { +#endif + #include "runtime/tree_cursor.h" #include "runtime/subtree.h" +typedef Array(TSRange) TSRangeArray; + +void ts_range_array_get_changed_ranges( + const TSRange *old_ranges, unsigned old_range_count, + const TSRange *new_ranges, unsigned new_range_count, + TSRangeArray *differences +); + +bool ts_range_array_intersects( + const TSRangeArray *self, unsigned start_index, + uint32_t start_byte, uint32_t end_byte +); + unsigned ts_subtree_get_changed_ranges( const Subtree *old_tree, const Subtree *new_tree, TreeCursor *cursor1, TreeCursor *cursor2, - const TSLanguage *language, TSRange **ranges + const TSLanguage *language, + const TSRangeArray *included_range_differences, + TSRange **ranges ); +#ifdef __cplusplus +} +#endif + #endif // RUNTIME_GET_CHANGED_RANGES_H_ diff --git a/src/runtime/length.h b/src/runtime/length.h index 2afbc42b..8dd1715e 100644 --- a/src/runtime/length.h +++ b/src/runtime/length.h @@ -12,6 +12,7 @@ typedef struct { } Length; static const Length LENGTH_UNDEFINED = {0, {0, 1}}; +static const Length LENGTH_MAX = {UINT32_MAX, {UINT32_MAX, UINT32_MAX}}; static inline bool length_is_undefined(Length length) { return length.bytes == 0 && length.extent.column != 0; diff --git a/src/runtime/parser.c b/src/runtime/parser.c index 8af62ea3..c4fbfe5c 100644 --- a/src/runtime/parser.c +++ b/src/runtime/parser.c @@ -13,6 +13,7 @@ #include "runtime/reusable_node.h" #include "runtime/reduce_action.h" #include "runtime/error_costs.h" +#include "runtime/get_changed_ranges.h" #include "runtime/tree.h" #define LOG(...) \ @@ -66,6 +67,8 @@ struct TSParser { volatile bool enabled; bool halt_on_error; Subtree old_tree; + TSRangeArray included_range_differences; + unsigned included_range_difference_index; }; typedef struct { @@ -398,6 +401,8 @@ static Subtree ts_parser__lex(TSParser *self, StackVersion version, TSStateId pa last_byte_scanned = self->lexer.current_position.bytes; } + if (self->lexer.data.lookahead != 0) last_byte_scanned++; + Subtree result; if (skipped_error) { Length padding = length_sub(error_start_position, start_position); @@ -407,7 +412,7 @@ static Subtree ts_parser__lex(TSParser *self, StackVersion version, TSStateId pa first_error_character, padding, size, - last_byte_scanned + 1 - error_end_position.bytes, + last_byte_scanned - error_end_position.bytes, parse_state, self->language ); @@ -442,7 +447,7 @@ static Subtree ts_parser__lex(TSParser *self, StackVersion version, TSStateId pa symbol, padding, size, - last_byte_scanned + 1 - self->lexer.token_end_position.bytes, + last_byte_scanned - self->lexer.token_end_position.bytes, parse_state, found_external_token, is_keyword, @@ -500,12 +505,25 @@ static void ts_parser__set_cached_token(TSParser *self, size_t byte_index, cache->last_external_token = last_external_token; } +static bool ts_parser__has_included_range_difference(const TSParser *self, + uint32_t start_position, + uint32_t end_position) { + return ts_range_array_intersects( + &self->included_range_differences, + self->included_range_difference_index, + start_position, + end_position + ); +} + static Subtree ts_parser__reuse_node(TSParser *self, StackVersion version, TSStateId *state, uint32_t position, Subtree last_external_token, TableEntry *table_entry) { Subtree result; while ((result = reusable_node_tree(&self->reusable_node)).ptr) { uint32_t byte_offset = reusable_node_byte_offset(&self->reusable_node); + uint32_t end_byte_offset = byte_offset + ts_subtree_total_bytes(result); + if (byte_offset > position) { LOG("before_reusable_node symbol:%s", TREE_NAME(result)); break; @@ -513,7 +531,9 @@ static Subtree ts_parser__reuse_node(TSParser *self, StackVersion version, if (byte_offset < position) { LOG("past_reusable_node symbol:%s", TREE_NAME(result)); - reusable_node_advance(&self->reusable_node); + if (end_byte_offset <= position || !reusable_node_descend(&self->reusable_node)) { + reusable_node_advance(&self->reusable_node); + } continue; } @@ -532,6 +552,8 @@ static Subtree ts_parser__reuse_node(TSParser *self, StackVersion version, reason = "is_missing"; } else if (ts_subtree_is_fragile(result)) { reason = "is_fragile"; + } else if (ts_parser__has_included_range_difference(self, byte_offset, end_byte_offset)) { + reason = "contains_different_included_range"; } if (reason) { @@ -1418,6 +1440,8 @@ TSParser *ts_parser_new() { self->operation_limit = SIZE_MAX; self->old_tree = NULL_SUBTREE; self->scratch_tree.ptr = &self->scratch_tree_data; + self->included_range_differences = (TSRangeArray) array_new(); + self->included_range_difference_index = 0; ts_parser__set_cached_token(self, 0, NULL_SUBTREE, NULL_SUBTREE); return self; } @@ -1427,6 +1451,9 @@ void ts_parser_delete(TSParser *self) { if (self->reduce_actions.contents) { array_delete(&self->reduce_actions); } + if (self->included_range_differences.contents) { + array_delete(&self->included_range_differences); + } if (self->old_tree.ptr) { ts_subtree_release(&self->tree_pool, self->old_tree); self->old_tree = NULL_SUBTREE; @@ -1526,13 +1553,25 @@ TSTree *ts_parser_parse(TSParser *self, const TSTree *old_tree, TSInput input) { ts_lexer_set_input(&self->lexer, input); + array_clear(&self->included_range_differences); + self->included_range_difference_index = 0; + if (ts_parser_has_outstanding_parse(self)) { LOG("resume_parsing"); } else if (old_tree) { ts_subtree_retain(old_tree->root); self->old_tree = old_tree->root; + ts_range_array_get_changed_ranges( + old_tree->included_ranges, old_tree->included_range_count, + self->lexer.included_ranges, self->lexer.included_range_count, + &self->included_range_differences + ); reusable_node_reset(&self->reusable_node, old_tree->root); LOG("parse_after_edit"); + for (unsigned i = 0; i < self->included_range_differences.size; i++) { + TSRange *range = &self->included_range_differences.contents[i]; + LOG("different_included_range %u - %u", range->start_byte, range->end_byte); + } } else { reusable_node_clear(&self->reusable_node); LOG("new_parse"); @@ -1574,13 +1613,27 @@ TSTree *ts_parser_parse(TSParser *self, const TSTree *old_tree, TSInput input) { ts_parser__halt_parse(self); break; } + + while (self->included_range_difference_index < self->included_range_differences.size) { + TSRange *range = &self->included_range_differences.contents[self->included_range_difference_index]; + if (range->end_byte <= position) { + self->included_range_difference_index++; + } else { + break; + } + } } while (version_count != 0); ts_subtree_balance(self->finished_tree, &self->tree_pool, self->language); LOG("done"); LOG_TREE(); - TSTree *result = ts_tree_new(self->finished_tree, self->language); + TSTree *result = ts_tree_new( + self->finished_tree, + self->language, + self->lexer.included_ranges, + self->lexer.included_range_count + ); self->finished_tree = NULL_SUBTREE; ts_parser_reset(self); return result; diff --git a/src/runtime/point.h b/src/runtime/point.h index f431438c..0c4941d5 100644 --- a/src/runtime/point.h +++ b/src/runtime/point.h @@ -3,6 +3,8 @@ #include "tree_sitter/runtime.h" +#define POINT_MAX ((TSPoint) {UINT32_MAX, UINT32_MAX}) + static inline TSPoint point__new(unsigned row, unsigned column) { TSPoint result = {row, column}; return result; diff --git a/src/runtime/tree.c b/src/runtime/tree.c index 9943b799..75680e91 100644 --- a/src/runtime/tree.c +++ b/src/runtime/tree.c @@ -7,25 +7,32 @@ static const unsigned PARENT_CACHE_CAPACITY = 32; -TSTree *ts_tree_new(Subtree root, const TSLanguage *language) { +TSTree *ts_tree_new( + Subtree root, const TSLanguage *language, + const TSRange *included_ranges, unsigned included_range_count +) { TSTree *result = ts_malloc(sizeof(TSTree)); result->root = root; result->language = language; result->parent_cache = NULL; result->parent_cache_start = 0; result->parent_cache_size = 0; + result->included_ranges = ts_calloc(included_range_count, sizeof(TSRange)); + memcpy(result->included_ranges, included_ranges, included_range_count * sizeof(TSRange)); + result->included_range_count = included_range_count; return result; } TSTree *ts_tree_copy(const TSTree *self) { ts_subtree_retain(self->root); - return ts_tree_new(self->root, self->language); + return ts_tree_new(self->root, self->language, self->included_ranges, self->included_range_count); } void ts_tree_delete(TSTree *self) { SubtreePool pool = ts_subtree_pool_new(0); ts_subtree_release(&pool, self->root); ts_subtree_pool_delete(&pool); + ts_free(self->included_ranges); if (self->parent_cache) ts_free(self->parent_cache); ts_free(self); } @@ -39,6 +46,32 @@ const TSLanguage *ts_tree_language(const TSTree *self) { } void ts_tree_edit(TSTree *self, const TSInputEdit *edit) { + for (unsigned i = 0; i < self->included_range_count; i++) { + TSRange *range = &self->included_ranges[i]; + if (range->end_byte >= edit->old_end_byte) { + range->end_byte = edit->new_end_byte + (range->end_byte - edit->old_end_byte); + range->end_point = point_add( + edit->new_end_point, + point_sub(range->end_point, edit->old_end_point) + ); + if (range->end_byte < edit->new_end_byte) { + range->end_byte = UINT32_MAX; + range->end_point = POINT_MAX; + } + if (range->start_byte >= edit->old_end_byte) { + range->start_byte = edit->new_end_byte + (range->start_byte - edit->old_end_byte); + range->start_point = point_add( + edit->new_end_point, + point_sub(range->start_point, edit->old_end_point) + ); + if (range->start_byte < edit->new_end_byte) { + range->start_byte = UINT32_MAX; + range->start_point = POINT_MAX; + } + } + } + } + SubtreePool pool = ts_subtree_pool_new(0); self->root = ts_subtree_edit(self->root, edit, &pool); self->parent_cache_start = 0; @@ -53,10 +86,20 @@ TSRange *ts_tree_get_changed_ranges(const TSTree *self, const TSTree *other, uin TSNode root = ts_tree_root_node(self); ts_tree_cursor_init(&cursor1, root); ts_tree_cursor_init(&cursor2, root); + + TSRangeArray included_range_differences = array_new(); + ts_range_array_get_changed_ranges( + self->included_ranges, self->included_range_count, + other->included_ranges, other->included_range_count, + &included_range_differences + ); + *count = ts_subtree_get_changed_ranges( &self->root, &other->root, &cursor1, &cursor2, - self->language, &result + self->language, &included_range_differences, &result ); + + array_delete(&included_range_differences); array_delete(&cursor1.stack); array_delete(&cursor2.stack); return result; diff --git a/src/runtime/tree.h b/src/runtime/tree.h index 5a7e6547..dd4f3184 100644 --- a/src/runtime/tree.h +++ b/src/runtime/tree.h @@ -18,9 +18,11 @@ struct TSTree { ParentCacheEntry *parent_cache; uint32_t parent_cache_start; uint32_t parent_cache_size; + TSRange *included_ranges; + unsigned included_range_count; }; -TSTree *ts_tree_new(Subtree root, const TSLanguage *language); +TSTree *ts_tree_new(Subtree root, const TSLanguage *language, const TSRange *, unsigned); TSNode ts_node_new(const TSTree *, const Subtree *, Length, TSSymbol); TSNode ts_tree_get_cached_parent(const TSTree *, const TSNode *); void ts_tree_set_cached_parent(const TSTree *, const TSNode *, const TSNode *); diff --git a/test/runtime/parser_test.cc b/test/runtime/parser_test.cc index 7c87bd02..2d154637 100644 --- a/test/runtime/parser_test.cc +++ b/test/runtime/parser_test.cc @@ -2,6 +2,7 @@ #include #include "runtime/alloc.h" #include "runtime/language.h" +#include "runtime/get_changed_ranges.h" #include "helpers/record_alloc.h" #include "helpers/spy_input.h" #include "helpers/load_language.h" @@ -1002,6 +1003,164 @@ describe("Parser", [&]() { AssertThat(ts_node_end_point(statement_node1), Equals(extent_for_string("a <%= b()"))); AssertThat(ts_node_end_point(statement_node2), Equals(extent_for_string("a <%= b() %> c <% d()"))); }); + + it("does not reuse nodes that were parsed in ranges that are now excluded", [&]() { + string source_code = "
<%= something %>
"; + + // Parse HTML including the template directive, which will cause an error + ts_parser_set_language(parser, load_real_language("html")); + TSTree *first_tree = ts_parser_parse_string(parser, nullptr, source_code.c_str(), source_code.size()); + + // Insert code at the beginning of the document. + string prefix = "a very very long line of plain text. "; + unsigned prefix_length = prefix.size(); + TSInputEdit edit = { + 0, 0, prefix_length, + {0, 0}, {0, 0}, {0, prefix_length} + }; + ts_tree_edit(first_tree, &edit); + source_code = prefix + source_code; + + // Parse the HTML again, this time *excluding* the template directive + // (which has moved since the previous parse). + unsigned directive_start = source_code.find("<%="); + unsigned directive_end = source_code.find(""); + unsigned source_code_end = source_code.size(); + + TSRange included_ranges[] = { + { + {0, 0}, + {0, directive_start}, + 0, + directive_start + }, + { + {0, directive_end}, + {0, source_code_end}, + directive_end, + source_code_end + } + }; + + ts_parser_set_included_ranges(parser, included_ranges, 2); + tree = ts_parser_parse_string(parser, first_tree, source_code.c_str(), source_code.size()); + + // The error should not have been reused, because the included ranges were different. + assert_root_node("(fragment " + "(text) " + "(element " + "(start_tag (tag_name)) " + "(element " + "(start_tag (tag_name)) " + "(end_tag (tag_name))) " + "(end_tag (tag_name))))"); + + unsigned range_count; + const TSRange *ranges = ts_tree_get_changed_ranges(first_tree, tree, &range_count); + + // The first range that's changed syntax is the range of the + // newly-inserted text. + AssertThat(range_count, Equals(2u)); + AssertThat(ranges[0], Equals({ + {0, 0}, {0, prefix_length}, + 0, prefix_length, + })); + + // Even though no edits were applied to the outer `div` element, + // its contents have changed syntax because a range of text that + // was previously included is now excluded. + AssertThat(ranges[1], Equals({ + {0, directive_start}, {0, directive_end}, + directive_start, directive_end, + })); + + ts_free((void *)ranges); + ts_tree_delete(first_tree); + }); + }); + + describe("ts_range_array_get_changed_ranges()", [&]() { + auto get_changed_ranges = [&]( + const vector &old_ranges, + const vector &new_ranges + ) { + TSRangeArray result = array_new(); + ts_range_array_get_changed_ranges( + old_ranges.data(), old_ranges.size(), + new_ranges.data(), new_ranges.size(), + &result + ); + vector result_vector; + for (unsigned i = 0; i < result.size; i++) { + result_vector.push_back(result.contents[i]); + } + array_delete(&result); + return result_vector; + }; + + auto range = [&](unsigned start, unsigned end) { + TSRange result; + result.start_byte = start; + result.end_byte = end; + result.start_point = {0, start}; + if (end == UINT32_MAX) { + result.end_point = {UINT32_MAX, UINT32_MAX}; + } else { + result.end_point = {0, end}; + } + return result; + }; + + it("returns an array of ranges that are newly included excluded", [&]() { + AssertThat(get_changed_ranges( + { + range(0, UINT32_MAX), + }, + { + range(0, 5), + range(8, UINT32_MAX), + } + ), Equals>( + { + range(5, 8) + } + )); + + AssertThat(get_changed_ranges( + { + range(0, 3), + range(7, 10), + range(13, 30), + }, + { + range(0, 4), + range(8, 11), + range(14, 30), + } + ), Equals>( + { + range(3, 4), + range(7, 8), + range(10, 11), + range(13, 14), + } + )); + + AssertThat(get_changed_ranges( + { + range(0, UINT32_MAX), + }, + { + range(0, 4), + range(5, 64), + } + ), Equals>( + { + range(4, 5), + range(64, UINT32_MAX), + } + )); + }); }); });