From a40045a419e5b0a7818c4dbc0a2ff49c8dbca822 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 11 Mar 2021 14:46:13 -0800 Subject: [PATCH] When editing, properly invalidate trees that depend on get_column --- cli/src/tests/helpers/fixtures.rs | 6 ++ cli/src/tests/parser_test.rs | 79 ++++++++++++++++++- lib/src/lexer.c | 22 +----- lib/src/lexer.h | 11 +-- lib/src/parser.c | 3 + lib/src/subtree.c | 41 ++++++++-- lib/src/subtree.h | 7 +- .../uses_current_column/scanner.c | 2 +- 8 files changed, 136 insertions(+), 35 deletions(-) diff --git a/cli/src/tests/helpers/fixtures.rs b/cli/src/tests/helpers/fixtures.rs index fc459777..d098bd28 100644 --- a/cli/src/tests/helpers/fixtures.rs +++ b/cli/src/tests/helpers/fixtures.rs @@ -74,3 +74,9 @@ pub fn get_test_language(name: &str, parser_code: &str, path: Option<&Path>) -> .load_language_from_sources(name, &HEADER_DIR, &parser_c_path, &scanner_path) .unwrap() } + +pub fn get_test_grammar(name: &str) -> (String, Option<PathBuf>) { + let dir = fixtures_dir().join("test_grammars").join(name); + let grammar = fs::read_to_string(&dir.join("grammar.json")).unwrap(); + (grammar, Some(dir)) +} diff --git a/cli/src/tests/parser_test.rs b/cli/src/tests/parser_test.rs index b02f04b2..d623126f 100644 --- a/cli/src/tests/parser_test.rs +++ b/cli/src/tests/parser_test.rs @@ -1,5 +1,5 @@ use super::helpers::edits::ReadRecorder; -use super::helpers::fixtures::{get_language, get_test_language}; +use super::helpers::fixtures::{get_language, get_test_grammar, get_test_language}; use crate::generate::generate_parser_for_grammar; use crate::parse::{perform_edit, Edit}; use std::sync::atomic::{AtomicUsize, Ordering}; @@ -406,6 +406,83 @@ fn test_parsing_empty_file_with_reused_tree() { parser.parse("\n ", tree.as_ref()); } +#[test] +fn test_parsing_after_editing_tree_that_depends_on_column_values() { + let (grammar, path) = get_test_grammar("uses_current_column"); + let (grammar_name, parser_code) = generate_parser_for_grammar(&grammar).unwrap(); 
+ let mut parser = Parser::new(); + parser + .set_language(get_test_language( + &grammar_name, + &parser_code, + path.as_ref().map(AsRef::as_ref), + )) + .unwrap(); + + let mut code = b" +a = b +c = do d + e + f + g +h + i + " + .to_vec(); + let mut tree = parser.parse(&code, None).unwrap(); + assert_eq!( + tree.root_node().to_sexp(), + concat!( + "(block ", + "(binary_expression (identifier) (identifier)) ", + "(binary_expression (identifier) (do_expression (block (identifier) (binary_expression (identifier) (identifier)) (identifier)))) ", + "(binary_expression (identifier) (identifier)))", + ) + ); + + perform_edit( + &mut tree, + &mut code, + &Edit { + position: 8, + deleted_length: 0, + inserted_text: b"1234".to_vec(), + }, + ); + + assert_eq!( + code, + b" +a = b +c1234 = do d + e + f + g +h + i + " + ); + + let mut recorder = ReadRecorder::new(&code); + let tree = parser + .parse_with(&mut |i, _| recorder.read(i), Some(&tree)) + .unwrap(); + + assert_eq!( + tree.root_node().to_sexp(), + concat!( + "(block ", + "(binary_expression (identifier) (identifier)) ", + "(binary_expression (identifier) (do_expression (block (identifier)))) ", + "(binary_expression (identifier) (identifier)) ", + "(identifier) ", + "(binary_expression (identifier) (identifier)))", + ) + ); + + assert_eq!( + recorder.strings_read(), + vec!["\nc1234 = do d\n e + f\n g\n"] + ); +} + // Thread safety #[test] diff --git a/lib/src/lexer.c b/lib/src/lexer.c index f349d76f..5d1965ad 100644 --- a/lib/src/lexer.c +++ b/lib/src/lexer.c @@ -233,25 +233,8 @@ static void ts_lexer__mark_end(TSLexer *_self) { static uint32_t ts_lexer__get_column(TSLexer *_self) { Lexer *self = (Lexer *)_self; - uint32_t goal_byte = self->current_position.bytes; - - ts_lexer_goto(self, (Length) { - .bytes = self->current_position.bytes - self->current_position.extent.column, - .extent = { - .row = self->current_position.extent.row, - .column = 0, - } - }); - if (!self->chunk_size) ts_lexer__get_chunk(self); - if 
(!self->lookahead_size) ts_lexer__get_lookahead(self); - - uint32_t result = 0; - while (self->current_position.bytes < goal_byte) { - ts_lexer__advance(&self->data, false); - result++; - } - - return result; + self->did_get_column = true; + return self->current_position.extent.column; } // Is the lexer at a boundary between two disjoint included ranges of @@ -318,6 +301,7 @@ void ts_lexer_start(Lexer *self) { self->token_start_position = self->current_position; self->token_end_position = LENGTH_UNDEFINED; self->data.result_symbol = 0; + self->did_get_column = false; if (!ts_lexer__eof(&self->data)) { if (!self->chunk_size) ts_lexer__get_chunk(self); if (!self->lookahead_size) ts_lexer__get_lookahead(self); diff --git a/lib/src/lexer.h b/lib/src/lexer.h index 5e392945..c1a5bfdb 100644 --- a/lib/src/lexer.h +++ b/lib/src/lexer.h @@ -17,16 +17,17 @@ typedef struct { Length token_end_position; TSRange *included_ranges; - size_t included_range_count; - size_t current_included_range_index; - const char *chunk; + TSInput input; + TSLogger logger; + + uint32_t included_range_count; + uint32_t current_included_range_index; uint32_t chunk_start; uint32_t chunk_size; uint32_t lookahead_size; + bool did_get_column; - TSInput input; - TSLogger logger; char debug_buffer[TREE_SITTER_SERIALIZATION_BUFFER_SIZE]; } Lexer; diff --git a/lib/src/parser.c b/lib/src/parser.c index 35069f63..0f0b4ac4 100644 --- a/lib/src/parser.c +++ b/lib/src/parser.c @@ -403,6 +403,7 @@ static Subtree ts_parser__lex( bool found_external_token = false; bool error_mode = parse_state == ERROR_STATE; bool skipped_error = false; + bool called_get_column = false; int32_t first_error_character = 0; Length error_start_position = length_zero(); Length error_end_position = length_zero(); @@ -445,6 +446,7 @@ static Subtree ts_parser__lex( (!error_mode && ts_stack_has_advanced_since_error(self->stack, version)) )) { found_external_token = true; + called_get_column = self->lexer.did_get_column; break; } @@ -546,6 
+548,7 @@ static Subtree ts_parser__lex( lookahead_bytes, parse_state, found_external_token, + called_get_column, is_keyword, self->language ); diff --git a/lib/src/subtree.c b/lib/src/subtree.c index e90dc9d7..e5f253ea 100644 --- a/lib/src/subtree.c +++ b/lib/src/subtree.c @@ -166,7 +166,8 @@ static inline bool ts_subtree_can_inline(Length padding, Length size, uint32_t l Subtree ts_subtree_new_leaf( SubtreePool *pool, TSSymbol symbol, Length padding, Length size, - uint32_t lookahead_bytes, TSStateId parse_state, bool has_external_tokens, + uint32_t lookahead_bytes, TSStateId parse_state, + bool has_external_tokens, bool depends_on_column, bool is_keyword, const TSLanguage *language ) { TSSymbolMetadata metadata = ts_language_symbol_metadata(language, symbol); @@ -213,6 +214,7 @@ Subtree ts_subtree_new_leaf( .fragile_right = false, .has_changes = false, .has_external_tokens = has_external_tokens, + .depends_on_column = depends_on_column, .is_missing = false, .is_keyword = is_keyword, {{.first_leaf = {.symbol = 0, .parse_state = 0}}} @@ -245,7 +247,7 @@ Subtree ts_subtree_new_error( ) { Subtree result = ts_subtree_new_leaf( pool, ts_builtin_sym_error, padding, size, bytes_scanned, - parse_state, false, false, language + parse_state, false, false, false, language ); SubtreeHeapData *data = (SubtreeHeapData *)result.ptr; data->fragile_left = true; @@ -378,6 +380,7 @@ void ts_subtree_summarize_children( self.ptr->repeat_depth = 0; self.ptr->node_count = 1; self.ptr->has_external_tokens = false; + self.ptr->depends_on_column = false; self.ptr->dynamic_precedence = 0; uint32_t structural_index = 0; @@ -388,6 +391,13 @@ void ts_subtree_summarize_children( for (uint32_t i = 0; i < self.ptr->child_count; i++) { Subtree child = children[i]; + if ( + self.ptr->size.extent.row == 0 && + ts_subtree_depends_on_column(child) + ) { + self.ptr->depends_on_column = true; + } + if (i == 0) { self.ptr->padding = ts_subtree_padding(child); self.ptr->size = ts_subtree_size(child); @@ 
-545,7 +555,7 @@ Subtree ts_subtree_new_missing_leaf( ) { Subtree result = ts_subtree_new_leaf( pool, symbol, padding, length_zero(), 0, - 0, false, false, language + 0, false, false, false, language ); if (result.data.is_inline) { result.data.is_missing = true; @@ -670,6 +680,7 @@ Subtree ts_subtree_edit(Subtree self, const TSInputEdit *edit, SubtreePool *pool Edit edit = entry.edit; bool is_noop = edit.old_end.bytes == edit.start.bytes && edit.new_end.bytes == edit.start.bytes; bool is_pure_insertion = edit.old_end.bytes == edit.start.bytes; + bool invalidate_first_row = ts_subtree_depends_on_column(*entry.tree); Length size = ts_subtree_size(*entry.tree); Length padding = ts_subtree_padding(*entry.tree); @@ -733,6 +744,7 @@ Subtree ts_subtree_edit(Subtree self, const TSInputEdit *edit, SubtreePool *pool data->fragile_right = false; data->has_changes = false; data->has_external_tokens = false; + data->depends_on_column = false; data->is_missing = result.data.is_missing; data->is_keyword = result.data.is_keyword; result.ptr = data; @@ -755,9 +767,18 @@ Subtree ts_subtree_edit(Subtree self, const TSInputEdit *edit, SubtreePool *pool // If this child ends before the edit, it is not affected. if (child_right.bytes + ts_subtree_lookahead_bytes(*child) < edit.start.bytes) continue; - // If this child starts after the edit, then we're done processing children. - if (child_left.bytes > edit.old_end.bytes || - (child_left.bytes == edit.old_end.bytes && child_size.bytes > 0 && i > 0)) break; + // Keep editing child nodes until a node is reached that starts after the edit. + // Also, if this node's validity depends on its column position, then continue + // invalidating child nodes until reaching a line break. 
+ if (( + (child_left.bytes > edit.old_end.bytes) || + (child_left.bytes == edit.old_end.bytes && child_size.bytes > 0 && i > 0) + ) && ( + !invalidate_first_row || + child_left.extent.row > entry.tree->ptr->padding.extent.row + )) { + break; + } // Transform edit into the child's coordinate space. Edit child_edit = { @@ -775,8 +796,10 @@ Subtree ts_subtree_edit(Subtree self, const TSInputEdit *edit, SubtreePool *pool // Interpret all inserted text as applying to the *first* child that touches the edit. // Subsequent children are only never have any text inserted into them; they are only // shrunk to compensate for the edit. - if (child_right.bytes > edit.start.bytes || - (child_right.bytes == edit.start.bytes && is_pure_insertion)) { + if ( + child_right.bytes > edit.start.bytes || + (child_right.bytes == edit.start.bytes && is_pure_insertion) + ) { edit.new_end = edit.start; } @@ -981,12 +1004,14 @@ void ts_subtree__print_dot_graph(const Subtree *self, uint32_t start_offset, "state: %d\n" "error-cost: %u\n" "has-changes: %u\n" + "depends-on-column: %u\n" "repeat-depth: %u\n" "lookahead-bytes: %u", start_offset, end_offset, ts_subtree_parse_state(*self), ts_subtree_error_cost(*self), ts_subtree_has_changes(*self), + ts_subtree_depends_on_column(*self), ts_subtree_repeat_depth(*self), ts_subtree_lookahead_bytes(*self) ); diff --git a/lib/src/subtree.h b/lib/src/subtree.h index b020deb6..d227db10 100644 --- a/lib/src/subtree.h +++ b/lib/src/subtree.h @@ -78,6 +78,7 @@ typedef struct { bool fragile_right : 1; bool has_changes : 1; bool has_external_tokens : 1; + bool depends_on_column: 1; bool is_missing : 1; bool is_keyword : 1; @@ -138,7 +139,7 @@ void ts_subtree_pool_delete(SubtreePool *); Subtree ts_subtree_new_leaf( SubtreePool *, TSSymbol, Length, Length, uint32_t, - TSStateId, bool, bool, const TSLanguage * + TSStateId, bool, bool, bool, const TSLanguage * ); Subtree ts_subtree_new_error( SubtreePool *, int32_t, Length, Length, uint32_t, TSStateId, const 
TSLanguage * @@ -284,6 +285,10 @@ static inline bool ts_subtree_has_external_tokens(Subtree self) { return self.data.is_inline ? false : self.ptr->has_external_tokens; } +static inline bool ts_subtree_depends_on_column(Subtree self) { + return self.data.is_inline ? false : self.ptr->depends_on_column; +} + static inline bool ts_subtree_is_fragile(Subtree self) { return self.data.is_inline ? false : (self.ptr->fragile_left || self.ptr->fragile_right); } diff --git a/test/fixtures/test_grammars/uses_current_column/scanner.c b/test/fixtures/test_grammars/uses_current_column/scanner.c index efd27f9f..62b16392 100644 --- a/test/fixtures/test_grammars/uses_current_column/scanner.c +++ b/test/fixtures/test_grammars/uses_current_column/scanner.c @@ -92,7 +92,7 @@ bool tree_sitter_uses_current_column_external_scanner_scan( // If at the end of a statement, then get the current indent // level and pop some number of entries off of the indent stack. if (valid_symbols[NEWLINE] || valid_symbols[DEDENT]) { - while (lexer->lookahead == ' ') { + while (iswspace(lexer->lookahead) && lexer->lookahead != '\n') { lexer->advance(lexer, false); }