Fix error in lookahead_bytes calculation

The miscalculation caused word tokens to have lookahead byte values that
were far too large, which made ts_subtree_edit very expensive.
This commit is contained in:
Max Brunsfeld 2018-11-14 11:30:34 -08:00
parent cdbe5a5355
commit 714a45c71b
3 changed files with 50 additions and 42 deletions

View file

@ -317,7 +317,7 @@ static Subtree ts_parser__lex(TSParser *self, StackVersion version, TSStateId pa
int32_t first_error_character = 0;
Length error_start_position = length_zero();
Length error_end_position = length_zero();
uint32_t last_byte_scanned = start_position.bytes;
uint32_t lookahead_end_byte = 0;
ts_lexer_reset(&self->lexer, start_position);
for (;;) {
@ -332,39 +332,33 @@ static Subtree ts_parser__lex(TSParser *self, StackVersion version, TSStateId pa
);
ts_lexer_start(&self->lexer);
ts_parser__restore_external_scanner(self, external_token);
if (self->language->external_scanner.scan(
bool found_token = self->language->external_scanner.scan(
self->external_scanner_payload,
&self->lexer.data,
valid_external_tokens
);
ts_lexer_finish(&self->lexer, &lookahead_end_byte);
// Zero-length external tokens are generally allowed, but they're not
// allowed right after a syntax error. This is for two reasons:
// 1. After a syntax error, the lexer is looking for any possible token,
// as opposed to the specific set of tokens that are valid in some
// parse state. In this situation, it's very easy for an external
// scanner to produce unwanted zero-length tokens.
// 2. The parser sometimes inserts *missing* tokens to recover from
// errors. These tokens are also zero-length. If we allow more
// zero-length tokens to be created after missing tokens, it
// can lead to infinite loops. Forbidding zero-length tokens
// right at the point of error recovery is a conservative strategy
// for preventing this kind of infinite loop.
if (found_token && (
self->lexer.token_end_position.bytes > current_position.bytes ||
(!error_mode && ts_stack_has_advanced_since_error(self->stack, version))
)) {
if (length_is_undefined(self->lexer.token_end_position)) {
self->lexer.data.mark_end(&self->lexer.data);
}
// Zero-length external tokens are generally allowed, but they're not
// allowed right after a syntax error. This is for two reasons:
// 1. After a syntax error, the lexer is looking for any possible token,
// as opposed to the specific set of tokens that are valid in some
// parse state. In this situation, it's very easy for an external
// scanner to produce unwanted zero-length tokens.
// 2. The parser sometimes inserts *missing* tokens to recover from
// errors. These tokens are also zero-length. If we allow more
// zero-length tokens to be created after missing tokens, it
// can lead to infinite loops. Forbidding zero-length tokens
// right at the point of error recovery is a conservative strategy
// for preventing this kind of infinite loop.
if (
self->lexer.token_end_position.bytes > current_position.bytes ||
(!error_mode && ts_stack_has_advanced_since_error(self->stack, version))
) {
found_external_token = true;
break;
}
found_external_token = true;
break;
}
if (self->lexer.current_position.bytes > last_byte_scanned) {
last_byte_scanned = self->lexer.current_position.bytes;
}
ts_lexer_reset(&self->lexer, current_position);
}
@ -375,9 +369,9 @@ static Subtree ts_parser__lex(TSParser *self, StackVersion version, TSStateId pa
current_position.extent.column
);
ts_lexer_start(&self->lexer);
if (self->language->lex_fn(&self->lexer.data, lex_mode.lex_state)) {
break;
}
bool found_token = self->language->lex_fn(&self->lexer.data, lex_mode.lex_state);
ts_lexer_finish(&self->lexer, &lookahead_end_byte);
if (found_token) break;
if (!error_mode) {
error_mode = true;
@ -386,9 +380,6 @@ static Subtree ts_parser__lex(TSParser *self, StackVersion version, TSStateId pa
self->language,
lex_mode.external_lex_state
);
if (self->lexer.current_position.bytes > last_byte_scanned) {
last_byte_scanned = self->lexer.current_position.bytes;
}
ts_lexer_reset(&self->lexer, start_position);
continue;
}
@ -412,22 +403,17 @@ static Subtree ts_parser__lex(TSParser *self, StackVersion version, TSStateId pa
error_end_position = self->lexer.current_position;
}
if (self->lexer.current_position.bytes > last_byte_scanned) {
last_byte_scanned = self->lexer.current_position.bytes;
}
last_byte_scanned++;
Subtree result;
if (skipped_error) {
Length padding = length_sub(error_start_position, start_position);
Length size = length_sub(error_end_position, error_start_position);
uint32_t lookahead_bytes = lookahead_end_byte - error_end_position.bytes;
result = ts_subtree_new_error(
&self->tree_pool,
first_error_character,
padding,
size,
last_byte_scanned - error_end_position.bytes,
lookahead_bytes,
parse_state,
self->language
);
@ -440,6 +426,7 @@ static Subtree ts_parser__lex(TSParser *self, StackVersion version, TSStateId pa
TSSymbol symbol = self->lexer.data.result_symbol;
Length padding = length_sub(self->lexer.token_start_position, start_position);
Length size = length_sub(self->lexer.token_end_position, self->lexer.token_start_position);
uint32_t lookahead_bytes = lookahead_end_byte - self->lexer.token_end_position.bytes;
if (found_external_token) {
symbol = self->language->external_scanner.symbol_map[symbol];
@ -462,7 +449,7 @@ static Subtree ts_parser__lex(TSParser *self, StackVersion version, TSStateId pa
symbol,
padding,
size,
last_byte_scanned - self->lexer.token_end_position.bytes,
lookahead_bytes,
parse_state,
found_external_token,
is_keyword,