From 714a45c71b38cb425e886dcaa125042d731178c4 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 14 Nov 2018 11:30:34 -0800 Subject: [PATCH] Fix error in lookahead_bytes calculation This caused word tokens to have lookahead byte values that were way too large, making ts_subtree_edit very expensive. --- src/runtime/lexer.c | 20 +++++++++++++ src/runtime/lexer.h | 1 + src/runtime/parser.c | 71 ++++++++++++++++++-------------------------- 3 files changed, 50 insertions(+), 42 deletions(-) diff --git a/src/runtime/lexer.c b/src/runtime/lexer.c index 42c7e668..d2b9ad70 100644 --- a/src/runtime/lexer.c +++ b/src/runtime/lexer.c @@ -250,6 +250,26 @@ void ts_lexer_start(Lexer *self) { if (!self->lookahead_size) ts_lexer__get_lookahead(self); } +void ts_lexer_finish(Lexer *self, uint32_t *lookahead_end_byte) { + if (length_is_undefined(self->token_end_position)) { + ts_lexer__mark_end(&self->data); + } + + uint32_t current_lookahead_end_byte = self->current_position.bytes + 1; + + // In order to determine that a byte sequence is invalid UTF8 or UTF16, + // the character decoding algorithm may have looked at the following byte. + // Therefore, the next byte *after* the current (invalid) character + // affects the interpretation of the current character. + if (self->data.lookahead == -1) { + current_lookahead_end_byte++; + } + + if (current_lookahead_end_byte > *lookahead_end_byte) { + *lookahead_end_byte = current_lookahead_end_byte; + } +} + void ts_lexer_advance_to_end(Lexer *self) { while (self->data.lookahead != 0) { ts_lexer__advance((TSLexer *)self, false); diff --git a/src/runtime/lexer.h b/src/runtime/lexer.h index c926d9e8..491c2da1 100644 --- a/src/runtime/lexer.h +++ b/src/runtime/lexer.h @@ -35,6 +35,7 @@ void ts_lexer_delete(Lexer *); void ts_lexer_set_input(Lexer *, TSInput); void ts_lexer_reset(Lexer *, Length); void ts_lexer_start(Lexer *); +void ts_lexer_finish(Lexer *, uint32_t *); void ts_lexer_advance_to_end(Lexer *); void ts_lexer_mark_end(Lexer *); void ts_lexer_set_included_ranges(Lexer *self, const TSRange *ranges, uint32_t count); diff --git a/src/runtime/parser.c b/src/runtime/parser.c index 3bcbfa42..556a11ec 100644 --- a/src/runtime/parser.c +++ b/src/runtime/parser.c @@ -317,7 +317,7 @@ static Subtree ts_parser__lex(TSParser *self, StackVersion version, TSStateId pa int32_t first_error_character = 0; Length error_start_position = length_zero(); Length error_end_position = length_zero(); - uint32_t last_byte_scanned = start_position.bytes; + uint32_t lookahead_end_byte = 0; ts_lexer_reset(&self->lexer, start_position); for (;;) { @@ -332,39 +332,33 @@ static Subtree ts_parser__lex(TSParser *self, StackVersion version, TSStateId pa ); ts_lexer_start(&self->lexer); ts_parser__restore_external_scanner(self, external_token); - if (self->language->external_scanner.scan( + bool found_token = self->language->external_scanner.scan( self->external_scanner_payload, &self->lexer.data, valid_external_tokens + ); + ts_lexer_finish(&self->lexer, &lookahead_end_byte); + + // Zero-length external tokens are generally allowed, but they're not + // allowed right after a syntax error. This is for two reasons: + // 1. After a syntax error, the lexer is looking for any possible token, + // as opposed to the specific set of tokens that are valid in some + // parse state. In this situation, it's very easy for an external + // scanner to produce unwanted zero-length tokens. + // 2. The parser sometimes inserts *missing* tokens to recover from + // errors. These tokens are also zero-length. If we allow more + // zero-length tokens to be created after missing tokens, it + // can lead to infinite loops. Forbidding zero-length tokens + // right at the point of error recovery is a conservative strategy + // for preventing this kind of infinite loop. + if (found_token && ( + self->lexer.token_end_position.bytes > current_position.bytes || + (!error_mode && ts_stack_has_advanced_since_error(self->stack, version)) )) { - if (length_is_undefined(self->lexer.token_end_position)) { - self->lexer.data.mark_end(&self->lexer.data); - } - - // Zero-length external tokens are generally allowed, but they're not - // allowed right after a syntax error. This is for two reasons: - // 1. After a syntax error, the lexer is looking for any possible token, - // as opposed to the specific set of tokens that are valid in some - // parse state. In this situation, it's very easy for an external - // scanner to produce unwanted zero-length tokens. - // 2. The parser sometimes inserts *missing* tokens to recover from - // errors. These tokens are also zero-length. If we allow more - // zero-length tokens to be created after missing tokens, it - // can lead to infinite loops. Forbidding zero-length tokens - // right at the point of error recovery is a conservative strategy - // for preventing this kind of infinite loop. - if ( - self->lexer.token_end_position.bytes > current_position.bytes || - (!error_mode && ts_stack_has_advanced_since_error(self->stack, version)) - ) { - found_external_token = true; - break; - } + found_external_token = true; + break; } - if (self->lexer.current_position.bytes > last_byte_scanned) { - last_byte_scanned = self->lexer.current_position.bytes; - } ts_lexer_reset(&self->lexer, current_position); } @@ -375,9 +369,9 @@ static Subtree ts_parser__lex(TSParser *self, StackVersion version, TSStateId pa current_position.extent.column ); ts_lexer_start(&self->lexer); - if (self->language->lex_fn(&self->lexer.data, lex_mode.lex_state)) { - break; - } + bool found_token = self->language->lex_fn(&self->lexer.data, lex_mode.lex_state); + ts_lexer_finish(&self->lexer, &lookahead_end_byte); + if (found_token) break; if (!error_mode) { error_mode = true; @@ -386,9 +380,6 @@ static Subtree ts_parser__lex(TSParser *self, StackVersion version, TSStateId pa self->language, lex_mode.external_lex_state ); - if (self->lexer.current_position.bytes > last_byte_scanned) { - last_byte_scanned = self->lexer.current_position.bytes; - } ts_lexer_reset(&self->lexer, start_position); continue; } @@ -412,22 +403,17 @@ static Subtree ts_parser__lex(TSParser *self, StackVersion version, TSStateId pa error_end_position = self->lexer.current_position; } - if (self->lexer.current_position.bytes > last_byte_scanned) { - last_byte_scanned = self->lexer.current_position.bytes; - } - - last_byte_scanned++; - Subtree result; if (skipped_error) { Length padding = length_sub(error_start_position, start_position); Length size = length_sub(error_end_position, error_start_position); + uint32_t lookahead_bytes = lookahead_end_byte - error_end_position.bytes; result = ts_subtree_new_error( &self->tree_pool, first_error_character, padding, size, - last_byte_scanned - error_end_position.bytes, + lookahead_bytes, parse_state, self->language ); @@ -440,6 +426,7 @@ static Subtree ts_parser__lex(TSParser *self, StackVersion version, TSStateId pa TSSymbol symbol = self->lexer.data.result_symbol; Length padding = length_sub(self->lexer.token_start_position, start_position); Length size = length_sub(self->lexer.token_end_position, self->lexer.token_start_position); + uint32_t lookahead_bytes = lookahead_end_byte - self->lexer.token_end_position.bytes; if (found_external_token) { symbol = self->language->external_scanner.symbol_map[symbol]; @@ -462,7 +449,7 @@ static Subtree ts_parser__lex(TSParser *self, StackVersion version, TSStateId pa symbol, padding, size, - last_byte_scanned - self->lexer.token_end_position.bytes, + lookahead_bytes, parse_state, found_external_token, is_keyword,