Ignore external tokens that are zero-length and extra (#4213)

Co-authored-by: Anthony <anthony@zed.dev>
This commit is contained in:
Max Brunsfeld 2025-02-17 15:07:44 -08:00 committed by GitHub
parent 14b8ead412
commit dedcc5255a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 71 additions and 16 deletions

View file

@ -556,27 +556,29 @@ static Subtree ts_parser__lex(
external_scanner_state_len
);
// When recovering from an error, ignore any zero-length external tokens
// unless they have changed the external scanner's state. This helps to
// avoid infinite loops which could otherwise occur, because the lexer is
// looking for any possible token, instead of looking for the specific set of
// tokens that are valid in some parse state.
// Avoid infinite loops caused by the external scanner returning empty tokens.
// Empty tokens are needed in some circumstances, e.g. indent/dedent tokens
// in Python. Ignore the following classes of empty tokens:
//
// Note that it's possible that the token end position may be *before* the
// original position of the lexer because of the way that tokens are positioned
// at included range boundaries: when a token is terminated at the start of
// an included range, it is marked as ending at the *end* of the preceding
// included range.
// * Tokens produced during error recovery. When recovering from an error,
// all tokens are allowed, so it's easy to accidentally return unwanted
// empty tokens.
// * Tokens that are marked as 'extra' in the grammar. These don't change
// the parse state, so they would definitely cause an infinite loop.
if (
self->lexer.token_end_position.bytes <= current_position.bytes &&
(error_mode || !ts_stack_has_advanced_since_error(self->stack, version)) &&
!external_scanner_state_changed
) {
LOG(
"ignore_empty_external_token symbol:%s",
SYM_NAME(self->language->external_scanner.symbol_map[self->lexer.data.result_symbol])
)
found_token = false;
TSSymbol symbol = self->language->external_scanner.symbol_map[self->lexer.data.result_symbol];
TSStateId next_parse_state = ts_language_next_state(self->language, parse_state, symbol);
bool token_is_extra = (next_parse_state == parse_state);
if (error_mode || !ts_stack_has_advanced_since_error(self->stack, version) || token_is_extra) {
LOG(
"ignore_empty_external_token symbol:%s",
SYM_NAME(self->language->external_scanner.symbol_map[self->lexer.data.result_symbol])
);
found_token = false;
}
}
}

View file

@ -0,0 +1,9 @@
==========================
A document
==========================
a b
---
(document)

View file

@ -0,0 +1,11 @@
module.exports = grammar({
name: 'epsilon_external_extra_tokens',
extras: $ => [/\s/, $.comment],
externals: $ => [$.comment],
rules: {
document: $ => seq('a', 'b'),
}
});

View file

@ -0,0 +1,33 @@
#include "tree_sitter/parser.h"
enum TokenType {
COMMENT
};
void *tree_sitter_epsilon_external_extra_tokens_external_scanner_create(void) {
return NULL;
}
bool tree_sitter_epsilon_external_extra_tokens_external_scanner_scan(
void *payload,
TSLexer *lexer,
const bool *valid_symbols
) {
lexer->result_symbol = COMMENT;
return true;
}
unsigned tree_sitter_epsilon_external_extra_tokens_external_scanner_serialize(
void *payload,
char *buffer
) {
return 0;
}
void tree_sitter_epsilon_external_extra_tokens_external_scanner_deserialize(
void *payload,
const char *buffer,
unsigned length
) {}
void tree_sitter_epsilon_external_extra_tokens_external_scanner_destroy(void *payload) {}