Handle changes in included ranges when parsing incrementally

This commit is contained in:
Max Brunsfeld 2018-11-07 12:56:44 -08:00
parent 93a7395c19
commit 0e3d9c2c58
8 changed files with 396 additions and 21 deletions

View file

@ -7,11 +7,9 @@
// #define DEBUG_GET_CHANGED_RANGES
typedef Array(TSRange) RangeArray;
static void range_array_add(RangeArray *results, Length start, Length end) {
if (results->size > 0) {
TSRange *last_range = array_back(results);
static void ts_range_array_add(TSRangeArray *self, Length start, Length end) {
if (self->size > 0) {
TSRange *last_range = array_back(self);
if (start.bytes <= last_range->end_byte) {
last_range->end_byte = end.bytes;
last_range->end_point = end.extent;
@ -21,7 +19,79 @@ static void range_array_add(RangeArray *results, Length start, Length end) {
if (start.bytes < end.bytes) {
TSRange range = { start.extent, end.extent, start.bytes, end.bytes };
array_push(results, range);
array_push(self, range);
}
}
// Decide whether the byte span [start_byte, end_byte) touches a range stored
// in `self`, looking only at entries from `start_index` onward.
//
// Entries that end at or before `start_byte` are passed over. The first entry
// that ends after `start_byte` settles the answer: the result is true exactly
// when that entry starts before `end_byte`. If no such entry exists, the
// result is false.
bool ts_range_array_intersects(const TSRangeArray *self, unsigned start_index,
                               uint32_t start_byte, uint32_t end_byte) {
  unsigned index = start_index;
  while (index < self->size) {
    const TSRange *candidate = &self->contents[index];
    index++;
    if (candidate->end_byte <= start_byte) continue;
    return candidate->start_byte < end_byte;
  }
  return false;
}
// Compute the spans whose "included" status differs between two range lists.
//
// The start and end boundaries of `old_ranges` and `new_ranges` are visited
// in increasing byte order while tracking whether the current position lies
// inside an old range and/or inside a new range. Every span that lies inside
// exactly one of the two lists is handed to ts_range_array_add, which appends
// it to `differences`.
//
// NOTE(review): the walk presumes each list is ordered by byte offset and
// that its ranges do not overlap -- confirm against callers.
void ts_range_array_get_changed_ranges(
  const TSRange *old_ranges, unsigned old_range_count,
  const TSRange *new_ranges, unsigned new_range_count,
  TSRangeArray *differences
) {
  unsigned new_index = 0;
  unsigned old_index = 0;
  Length current_position = length_zero();
  bool in_old_range = false;
  bool in_new_range = false;

  while (old_index < old_range_count || new_index < new_range_count) {
    // These pointers are only dereferenced below when the matching list
    // still has a range at the current index.
    const TSRange *old_range = &old_ranges[old_index];
    const TSRange *new_range = &new_ranges[new_index];

    // Next boundary of the old list: the end of the range we are inside, the
    // start of the upcoming range, or LENGTH_MAX once the list is exhausted.
    Length next_old_position;
    if (in_old_range) {
      next_old_position = (Length) {old_range->end_byte, old_range->end_point};
    } else if (old_index < old_range_count) {
      next_old_position = (Length) {old_range->start_byte, old_range->start_point};
    } else {
      next_old_position = LENGTH_MAX;
    }

    // Same computation for the new list.
    Length next_new_position;
    if (in_new_range) {
      next_new_position = (Length) {new_range->end_byte, new_range->end_point};
    } else if (new_index < new_range_count) {
      next_new_position = (Length) {new_range->start_byte, new_range->start_point};
    } else {
      next_new_position = LENGTH_MAX;
    }

    if (next_old_position.bytes < next_new_position.bytes) {
      // The old list reaches its boundary first.
      if (in_old_range != in_new_range) {
        ts_range_array_add(differences, current_position, next_old_position);
      }
      if (in_old_range) old_index++;
      current_position = next_old_position;
      in_old_range = !in_old_range;
    } else if (next_new_position.bytes < next_old_position.bytes) {
      // The new list reaches its boundary first.
      if (in_old_range != in_new_range) {
        ts_range_array_add(differences, current_position, next_new_position);
      }
      if (in_new_range) new_index++;
      current_position = next_new_position;
      in_new_range = !in_new_range;
    } else {
      // Both lists report a boundary at the same byte offset.
      if (in_old_range != in_new_range) {
        ts_range_array_add(differences, current_position, next_new_position);
      }

      // Only cross a boundary that really exists. An exhausted list reports
      // LENGTH_MAX, which can tie with a genuine boundary at UINT32_MAX in
      // the other list. The exhausted side must not be flipped to "inside a
      // range": if the loop ran again, it would read one element past the
      // end of that list's array.
      if (in_old_range) {
        old_index++;
        in_old_range = false;
      } else if (old_index < old_range_count) {
        in_old_range = true;
      }

      if (in_new_range) {
        new_index++;
        in_new_range = false;
      } else if (new_index < new_range_count) {
        in_new_range = true;
      }

      current_position = next_new_position;
    }
  }
}
@ -267,19 +337,23 @@ static inline void iterator_print_state(Iterator *self) {
unsigned ts_subtree_get_changed_ranges(const Subtree *old_tree, const Subtree *new_tree,
TreeCursor *cursor1, TreeCursor *cursor2,
const TSLanguage *language, TSRange **ranges) {
RangeArray results = array_new();
const TSLanguage *language,
const TSRangeArray *included_range_differences,
TSRange **ranges) {
TSRangeArray results = array_new();
Iterator old_iter = iterator_new(cursor1, old_tree, language);
Iterator new_iter = iterator_new(cursor2, new_tree, language);
unsigned included_range_difference_index = 0;
Length position = iterator_start_position(&old_iter);
Length next_position = iterator_start_position(&new_iter);
if (position.bytes < next_position.bytes) {
range_array_add(&results, position, next_position);
ts_range_array_add(&results, position, next_position);
position = next_position;
} else if (position.bytes > next_position.bytes) {
range_array_add(&results, next_position, position);
ts_range_array_add(&results, next_position, position);
next_position = position;
}
@ -296,7 +370,16 @@ unsigned ts_subtree_get_changed_ranges(const Subtree *old_tree, const Subtree *n
switch (iterator_compare(&old_iter, &new_iter)) {
case IteratorMatches:
next_position = iterator_end_position(&old_iter);
break;
if (ts_range_array_intersects(
included_range_differences,
included_range_difference_index,
position.bytes, next_position.bytes
)) {
next_position = position;
// fall through
} else {
break;
}
case IteratorMayDiffer:
if (iterator_descend(&old_iter, position.bytes)) {
@ -347,10 +430,19 @@ unsigned ts_subtree_get_changed_ranges(const Subtree *old_tree, const Subtree *n
);
#endif
range_array_add(&results, position, next_position);
ts_range_array_add(&results, position, next_position);
}
position = next_position;
while (included_range_difference_index < included_range_differences->size) {
const TSRange *range = &included_range_differences->contents[included_range_difference_index];
if (range->end_byte <= position.bytes) {
included_range_difference_index++;
} else {
break;
}
}
} while (!iterator_done(&old_iter) && !iterator_done(&new_iter));
*cursor1 = old_iter.cursor;

View file

@ -1,13 +1,36 @@
#ifndef RUNTIME_GET_CHANGED_RANGES_H_
#define RUNTIME_GET_CHANGED_RANGES_H_
#ifdef __cplusplus
extern "C" {
#endif
#include "runtime/tree_cursor.h"
#include "runtime/subtree.h"
typedef Array(TSRange) TSRangeArray;
// Compare two lists of included ranges and append to `differences` every
// span that lies inside a range of exactly one of the two lists.
//
// `old_ranges` / `new_ranges` point to `old_range_count` / `new_range_count`
// entries respectively.
void ts_range_array_get_changed_ranges(
const TSRange *old_ranges, unsigned old_range_count,
const TSRange *new_ranges, unsigned new_range_count,
TSRangeArray *differences
);
// Scan `self` from `start_index` for the first range that ends after
// `start_byte`, and return true if that range starts before `end_byte`.
// Returns false when no range from `start_index` onward ends after
// `start_byte`.
bool ts_range_array_intersects(
const TSRangeArray *self, unsigned start_index,
uint32_t start_byte, uint32_t end_byte
);
unsigned ts_subtree_get_changed_ranges(
const Subtree *old_tree, const Subtree *new_tree,
TreeCursor *cursor1, TreeCursor *cursor2,
const TSLanguage *language, TSRange **ranges
const TSLanguage *language,
const TSRangeArray *included_range_differences,
TSRange **ranges
);
#ifdef __cplusplus
}
#endif
#endif // RUNTIME_GET_CHANGED_RANGES_H_

View file

@ -12,6 +12,7 @@ typedef struct {
} Length;
static const Length LENGTH_UNDEFINED = {0, {0, 1}};
static const Length LENGTH_MAX = {UINT32_MAX, {UINT32_MAX, UINT32_MAX}};
static inline bool length_is_undefined(Length length) {
return length.bytes == 0 && length.extent.column != 0;

View file

@ -13,6 +13,7 @@
#include "runtime/reusable_node.h"
#include "runtime/reduce_action.h"
#include "runtime/error_costs.h"
#include "runtime/get_changed_ranges.h"
#include "runtime/tree.h"
#define LOG(...) \
@ -66,6 +67,8 @@ struct TSParser {
volatile bool enabled;
bool halt_on_error;
Subtree old_tree;
TSRangeArray included_range_differences;
unsigned included_range_difference_index;
};
typedef struct {
@ -398,6 +401,8 @@ static Subtree ts_parser__lex(TSParser *self, StackVersion version, TSStateId pa
last_byte_scanned = self->lexer.current_position.bytes;
}
if (self->lexer.data.lookahead != 0) last_byte_scanned++;
Subtree result;
if (skipped_error) {
Length padding = length_sub(error_start_position, start_position);
@ -407,7 +412,7 @@ static Subtree ts_parser__lex(TSParser *self, StackVersion version, TSStateId pa
first_error_character,
padding,
size,
last_byte_scanned + 1 - error_end_position.bytes,
last_byte_scanned - error_end_position.bytes,
parse_state,
self->language
);
@ -442,7 +447,7 @@ static Subtree ts_parser__lex(TSParser *self, StackVersion version, TSStateId pa
symbol,
padding,
size,
last_byte_scanned + 1 - self->lexer.token_end_position.bytes,
last_byte_scanned - self->lexer.token_end_position.bytes,
parse_state,
found_external_token,
is_keyword,
@ -500,12 +505,25 @@ static void ts_parser__set_cached_token(TSParser *self, size_t byte_index,
cache->last_external_token = last_external_token;
}
static bool ts_parser__has_included_range_difference(const TSParser *self,
uint32_t start_position,
uint32_t end_position) {
return ts_range_array_intersects(
&self->included_range_differences,
self->included_range_difference_index,
start_position,
end_position
);
}
static Subtree ts_parser__reuse_node(TSParser *self, StackVersion version,
TSStateId *state, uint32_t position,
Subtree last_external_token, TableEntry *table_entry) {
Subtree result;
while ((result = reusable_node_tree(&self->reusable_node)).ptr) {
uint32_t byte_offset = reusable_node_byte_offset(&self->reusable_node);
uint32_t end_byte_offset = byte_offset + ts_subtree_total_bytes(result);
if (byte_offset > position) {
LOG("before_reusable_node symbol:%s", TREE_NAME(result));
break;
@ -513,7 +531,9 @@ static Subtree ts_parser__reuse_node(TSParser *self, StackVersion version,
if (byte_offset < position) {
LOG("past_reusable_node symbol:%s", TREE_NAME(result));
reusable_node_advance(&self->reusable_node);
if (end_byte_offset <= position || !reusable_node_descend(&self->reusable_node)) {
reusable_node_advance(&self->reusable_node);
}
continue;
}
@ -532,6 +552,8 @@ static Subtree ts_parser__reuse_node(TSParser *self, StackVersion version,
reason = "is_missing";
} else if (ts_subtree_is_fragile(result)) {
reason = "is_fragile";
} else if (ts_parser__has_included_range_difference(self, byte_offset, end_byte_offset)) {
reason = "contains_different_included_range";
}
if (reason) {
@ -1418,6 +1440,8 @@ TSParser *ts_parser_new() {
self->operation_limit = SIZE_MAX;
self->old_tree = NULL_SUBTREE;
self->scratch_tree.ptr = &self->scratch_tree_data;
self->included_range_differences = (TSRangeArray) array_new();
self->included_range_difference_index = 0;
ts_parser__set_cached_token(self, 0, NULL_SUBTREE, NULL_SUBTREE);
return self;
}
@ -1427,6 +1451,9 @@ void ts_parser_delete(TSParser *self) {
if (self->reduce_actions.contents) {
array_delete(&self->reduce_actions);
}
if (self->included_range_differences.contents) {
array_delete(&self->included_range_differences);
}
if (self->old_tree.ptr) {
ts_subtree_release(&self->tree_pool, self->old_tree);
self->old_tree = NULL_SUBTREE;
@ -1526,13 +1553,25 @@ TSTree *ts_parser_parse(TSParser *self, const TSTree *old_tree, TSInput input) {
ts_lexer_set_input(&self->lexer, input);
array_clear(&self->included_range_differences);
self->included_range_difference_index = 0;
if (ts_parser_has_outstanding_parse(self)) {
LOG("resume_parsing");
} else if (old_tree) {
ts_subtree_retain(old_tree->root);
self->old_tree = old_tree->root;
ts_range_array_get_changed_ranges(
old_tree->included_ranges, old_tree->included_range_count,
self->lexer.included_ranges, self->lexer.included_range_count,
&self->included_range_differences
);
reusable_node_reset(&self->reusable_node, old_tree->root);
LOG("parse_after_edit");
for (unsigned i = 0; i < self->included_range_differences.size; i++) {
TSRange *range = &self->included_range_differences.contents[i];
LOG("different_included_range %u - %u", range->start_byte, range->end_byte);
}
} else {
reusable_node_clear(&self->reusable_node);
LOG("new_parse");
@ -1574,13 +1613,27 @@ TSTree *ts_parser_parse(TSParser *self, const TSTree *old_tree, TSInput input) {
ts_parser__halt_parse(self);
break;
}
while (self->included_range_difference_index < self->included_range_differences.size) {
TSRange *range = &self->included_range_differences.contents[self->included_range_difference_index];
if (range->end_byte <= position) {
self->included_range_difference_index++;
} else {
break;
}
}
} while (version_count != 0);
ts_subtree_balance(self->finished_tree, &self->tree_pool, self->language);
LOG("done");
LOG_TREE();
TSTree *result = ts_tree_new(self->finished_tree, self->language);
TSTree *result = ts_tree_new(
self->finished_tree,
self->language,
self->lexer.included_ranges,
self->lexer.included_range_count
);
self->finished_tree = NULL_SUBTREE;
ts_parser_reset(self);
return result;

View file

@ -3,6 +3,8 @@
#include "tree_sitter/runtime.h"
#define POINT_MAX ((TSPoint) {UINT32_MAX, UINT32_MAX})
static inline TSPoint point__new(unsigned row, unsigned column) {
TSPoint result = {row, column};
return result;

View file

@ -7,25 +7,32 @@
static const unsigned PARENT_CACHE_CAPACITY = 32;
TSTree *ts_tree_new(Subtree root, const TSLanguage *language) {
// Allocate a tree that wraps `root` for `language` and stores its own copy
// of the `included_range_count` ranges found at `included_ranges`.
//
// The parent cache starts out empty. This function does not retain `root`
// itself; ts_tree_copy, for example, retains the root before calling it.
TSTree *ts_tree_new(
  Subtree root, const TSLanguage *language,
  const TSRange *included_ranges, unsigned included_range_count
) {
  TSTree *result = ts_malloc(sizeof(TSTree));
  result->root = root;
  result->language = language;
  result->parent_cache = NULL;
  result->parent_cache_start = 0;
  result->parent_cache_size = 0;
  result->included_ranges = ts_calloc(included_range_count, sizeof(TSRange));
  // memcpy requires valid source and destination pointers even when the
  // length is zero. With no ranges, the caller's pointer may be NULL and a
  // zero-sized allocation may be NULL too, so skip the copy entirely.
  if (included_range_count > 0) {
    memcpy(result->included_ranges, included_ranges, included_range_count * sizeof(TSRange));
  }
  result->included_range_count = included_range_count;
  return result;
}
TSTree *ts_tree_copy(const TSTree *self) {
ts_subtree_retain(self->root);
return ts_tree_new(self->root, self->language);
return ts_tree_new(self->root, self->language, self->included_ranges, self->included_range_count);
}
// Destroy a tree: drop its reference to the root subtree, then free the
// included-range copy, the parent cache (if one was allocated) and the tree
// struct itself.
void ts_tree_delete(TSTree *self) {
  // A temporary pool is created just for releasing the root.
  SubtreePool scratch_pool = ts_subtree_pool_new(0);
  ts_subtree_release(&scratch_pool, self->root);
  ts_subtree_pool_delete(&scratch_pool);

  ts_free(self->included_ranges);
  if (self->parent_cache != NULL) {
    ts_free(self->parent_cache);
  }
  ts_free(self);
}
@ -39,6 +46,32 @@ const TSLanguage *ts_tree_language(const TSTree *self) {
}
void ts_tree_edit(TSTree *self, const TSInputEdit *edit) {
for (unsigned i = 0; i < self->included_range_count; i++) {
TSRange *range = &self->included_ranges[i];
if (range->end_byte >= edit->old_end_byte) {
range->end_byte = edit->new_end_byte + (range->end_byte - edit->old_end_byte);
range->end_point = point_add(
edit->new_end_point,
point_sub(range->end_point, edit->old_end_point)
);
if (range->end_byte < edit->new_end_byte) {
range->end_byte = UINT32_MAX;
range->end_point = POINT_MAX;
}
if (range->start_byte >= edit->old_end_byte) {
range->start_byte = edit->new_end_byte + (range->start_byte - edit->old_end_byte);
range->start_point = point_add(
edit->new_end_point,
point_sub(range->start_point, edit->old_end_point)
);
if (range->start_byte < edit->new_end_byte) {
range->start_byte = UINT32_MAX;
range->start_point = POINT_MAX;
}
}
}
}
SubtreePool pool = ts_subtree_pool_new(0);
self->root = ts_subtree_edit(self->root, edit, &pool);
self->parent_cache_start = 0;
@ -53,10 +86,20 @@ TSRange *ts_tree_get_changed_ranges(const TSTree *self, const TSTree *other, uin
TSNode root = ts_tree_root_node(self);
ts_tree_cursor_init(&cursor1, root);
ts_tree_cursor_init(&cursor2, root);
TSRangeArray included_range_differences = array_new();
ts_range_array_get_changed_ranges(
self->included_ranges, self->included_range_count,
other->included_ranges, other->included_range_count,
&included_range_differences
);
*count = ts_subtree_get_changed_ranges(
&self->root, &other->root, &cursor1, &cursor2,
self->language, &result
self->language, &included_range_differences, &result
);
array_delete(&included_range_differences);
array_delete(&cursor1.stack);
array_delete(&cursor2.stack);
return result;

View file

@ -18,9 +18,11 @@ struct TSTree {
ParentCacheEntry *parent_cache;
uint32_t parent_cache_start;
uint32_t parent_cache_size;
TSRange *included_ranges;
unsigned included_range_count;
};
TSTree *ts_tree_new(Subtree root, const TSLanguage *language);
TSTree *ts_tree_new(Subtree root, const TSLanguage *language, const TSRange *, unsigned);
TSNode ts_node_new(const TSTree *, const Subtree *, Length, TSSymbol);
TSNode ts_tree_get_cached_parent(const TSTree *, const TSNode *);
void ts_tree_set_cached_parent(const TSTree *, const TSNode *, const TSNode *);

View file

@ -2,6 +2,7 @@
#include <future>
#include "runtime/alloc.h"
#include "runtime/language.h"
#include "runtime/get_changed_ranges.h"
#include "helpers/record_alloc.h"
#include "helpers/spy_input.h"
#include "helpers/load_language.h"
@ -1002,6 +1003,164 @@ describe("Parser", [&]() {
AssertThat(ts_node_end_point(statement_node1), Equals(extent_for_string("a <%= b()")));
AssertThat(ts_node_end_point(statement_node2), Equals(extent_for_string("a <%= b() %> c <% d()")));
});
it("does not reuse nodes that were parsed in ranges that are now excluded", [&]() {
string source_code = "<div><span><%= something %></span></div>";
// Parse HTML including the template directive, which will cause an error
ts_parser_set_language(parser, load_real_language("html"));
TSTree *first_tree = ts_parser_parse_string(parser, nullptr, source_code.c_str(), source_code.size());
// Insert code at the beginning of the document.
string prefix = "a very very long line of plain text. ";
unsigned prefix_length = prefix.size();
TSInputEdit edit = {
0, 0, prefix_length,
{0, 0}, {0, 0}, {0, prefix_length}
};
ts_tree_edit(first_tree, &edit);
source_code = prefix + source_code;
// Parse the HTML again, this time *excluding* the template directive
// (which has moved since the previous parse).
unsigned directive_start = source_code.find("<%=");
unsigned directive_end = source_code.find("</span>");
unsigned source_code_end = source_code.size();
TSRange included_ranges[] = {
{
{0, 0},
{0, directive_start},
0,
directive_start
},
{
{0, directive_end},
{0, source_code_end},
directive_end,
source_code_end
}
};
ts_parser_set_included_ranges(parser, included_ranges, 2);
tree = ts_parser_parse_string(parser, first_tree, source_code.c_str(), source_code.size());
// The error should not have been reused, because the included ranges were different.
assert_root_node("(fragment "
"(text) "
"(element "
"(start_tag (tag_name)) "
"(element "
"(start_tag (tag_name)) "
"(end_tag (tag_name))) "
"(end_tag (tag_name))))");
unsigned range_count;
const TSRange *ranges = ts_tree_get_changed_ranges(first_tree, tree, &range_count);
// The first range that's changed syntax is the range of the
// newly-inserted text.
AssertThat(range_count, Equals(2u));
AssertThat(ranges[0], Equals<TSRange>({
{0, 0}, {0, prefix_length},
0, prefix_length,
}));
// Even though no edits were applied to the outer `div` element,
// its contents have changed syntax because a range of text that
// was previously included is now excluded.
AssertThat(ranges[1], Equals<TSRange>({
{0, directive_start}, {0, directive_end},
directive_start, directive_end,
}));
ts_free((void *)ranges);
ts_tree_delete(first_tree);
});
});
describe("ts_range_array_get_changed_ranges()", [&]() {
auto get_changed_ranges = [&](
const vector<TSRange> &old_ranges,
const vector<TSRange> &new_ranges
) {
TSRangeArray result = array_new();
ts_range_array_get_changed_ranges(
old_ranges.data(), old_ranges.size(),
new_ranges.data(), new_ranges.size(),
&result
);
vector<TSRange> result_vector;
for (unsigned i = 0; i < result.size; i++) {
result_vector.push_back(result.contents[i]);
}
array_delete(&result);
return result_vector;
};
auto range = [&](unsigned start, unsigned end) {
TSRange result;
result.start_byte = start;
result.end_byte = end;
result.start_point = {0, start};
if (end == UINT32_MAX) {
result.end_point = {UINT32_MAX, UINT32_MAX};
} else {
result.end_point = {0, end};
}
return result;
};
it("returns an array of ranges that are newly included excluded", [&]() {
AssertThat(get_changed_ranges(
{
range(0, UINT32_MAX),
},
{
range(0, 5),
range(8, UINT32_MAX),
}
), Equals<vector<TSRange>>(
{
range(5, 8)
}
));
AssertThat(get_changed_ranges(
{
range(0, 3),
range(7, 10),
range(13, 30),
},
{
range(0, 4),
range(8, 11),
range(14, 30),
}
), Equals<vector<TSRange>>(
{
range(3, 4),
range(7, 8),
range(10, 11),
range(13, 14),
}
));
AssertThat(get_changed_ranges(
{
range(0, UINT32_MAX),
},
{
range(0, 4),
range(5, 64),
}
), Equals<vector<TSRange>>(
{
range(4, 5),
range(64, UINT32_MAX),
}
));
});
});
});