From dedcc5255a01c5c072cb29801de5a4bed819daa2 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 17 Feb 2025 15:07:44 -0800 Subject: [PATCH] Ignore external tokens that are zero-length and extra (#4213) Co-authored-by: Anthony --- lib/src/parser.c | 34 ++++++++++--------- .../epsilon_external_extra_tokens/corpus.txt | 9 +++++ .../epsilon_external_extra_tokens/grammar.js | 11 ++++++ .../epsilon_external_extra_tokens/scanner.c | 33 ++++++++++++++++++ 4 files changed, 71 insertions(+), 16 deletions(-) create mode 100644 test/fixtures/test_grammars/epsilon_external_extra_tokens/corpus.txt create mode 100644 test/fixtures/test_grammars/epsilon_external_extra_tokens/grammar.js create mode 100644 test/fixtures/test_grammars/epsilon_external_extra_tokens/scanner.c diff --git a/lib/src/parser.c b/lib/src/parser.c index 7aac259e..a3d68592 100644 --- a/lib/src/parser.c +++ b/lib/src/parser.c @@ -556,27 +556,29 @@ static Subtree ts_parser__lex( external_scanner_state_len ); - // When recovering from an error, ignore any zero-length external tokens - // unless they have changed the external scanner's state. This helps to - // avoid infinite loops which could otherwise occur, because the lexer is - // looking for any possible token, instead of looking for the specific set of - // tokens that are valid in some parse state. + // Avoid infinite loops caused by the external scanner returning empty tokens. + // Empty tokens are needed in some circumstances, e.g. indent/dedent tokens + // in Python. Ignore the following classes of empty tokens: // - // Note that it's possible that the token end position may be *before* the - // original position of the lexer because of the way that tokens are positioned - // at included range boundaries: when a token is terminated at the start of - // an included range, it is marked as ending at the *end* of the preceding - // included range. + // * Tokens produced during error recovery. When recovering from an error, + // all tokens are allowed, so it's easy to accidentally return unwanted + // empty tokens. + // * Tokens that are marked as 'extra' in the grammar. These don't change + // the parse state, so they would definitely cause an infinite loop. if ( self->lexer.token_end_position.bytes <= current_position.bytes && - (error_mode || !ts_stack_has_advanced_since_error(self->stack, version)) && !external_scanner_state_changed ) { - LOG( - "ignore_empty_external_token symbol:%s", - SYM_NAME(self->language->external_scanner.symbol_map[self->lexer.data.result_symbol]) - ) - found_token = false; + TSSymbol symbol = self->language->external_scanner.symbol_map[self->lexer.data.result_symbol]; + TSStateId next_parse_state = ts_language_next_state(self->language, parse_state, symbol); + bool token_is_extra = (next_parse_state == parse_state); + if (error_mode || !ts_stack_has_advanced_since_error(self->stack, version) || token_is_extra) { + LOG( + "ignore_empty_external_token symbol:%s", + SYM_NAME(self->language->external_scanner.symbol_map[self->lexer.data.result_symbol]) + ); + found_token = false; + } } } diff --git a/test/fixtures/test_grammars/epsilon_external_extra_tokens/corpus.txt b/test/fixtures/test_grammars/epsilon_external_extra_tokens/corpus.txt new file mode 100644 index 00000000..776db2ec --- /dev/null +++ b/test/fixtures/test_grammars/epsilon_external_extra_tokens/corpus.txt @@ -0,0 +1,9 @@ +========================== +A document +========================== + +a b + +--- + +(document) diff --git a/test/fixtures/test_grammars/epsilon_external_extra_tokens/grammar.js b/test/fixtures/test_grammars/epsilon_external_extra_tokens/grammar.js new file mode 100644 index 00000000..b808de62 --- /dev/null +++ b/test/fixtures/test_grammars/epsilon_external_extra_tokens/grammar.js @@ -0,0 +1,11 @@ +module.exports = grammar({ + name: 'epsilon_external_extra_tokens', + + extras: $ => [/\s/, $.comment], + + externals: $ => [$.comment], + + rules: { + document: $ => seq('a', 'b'), + } +}); diff --git a/test/fixtures/test_grammars/epsilon_external_extra_tokens/scanner.c b/test/fixtures/test_grammars/epsilon_external_extra_tokens/scanner.c new file mode 100644 index 00000000..c8949d1d --- /dev/null +++ b/test/fixtures/test_grammars/epsilon_external_extra_tokens/scanner.c @@ -0,0 +1,33 @@ +#include "tree_sitter/parser.h" + +enum TokenType { + COMMENT +}; + +void *tree_sitter_epsilon_external_extra_tokens_external_scanner_create(void) { + return NULL; +} + +bool tree_sitter_epsilon_external_extra_tokens_external_scanner_scan( + void *payload, + TSLexer *lexer, + const bool *valid_symbols +) { + lexer->result_symbol = COMMENT; + return true; +} + +unsigned tree_sitter_epsilon_external_extra_tokens_external_scanner_serialize( + void *payload, + char *buffer +) { + return 0; +} + +void tree_sitter_epsilon_external_extra_tokens_external_scanner_deserialize( + void *payload, + const char *buffer, + unsigned length +) {} + +void tree_sitter_epsilon_external_extra_tokens_external_scanner_destroy(void *payload) {}