diff --git a/src/runtime/parser.c b/src/runtime/parser.c index e843cbf0..ef6592ef 100644 --- a/src/runtime/parser.c +++ b/src/runtime/parser.c @@ -418,6 +418,10 @@ static Tree *parser__lex(Parser *self, StackVersion version, TSStateId parse_sta symbol = self->language->external_scanner.symbol_map[symbol]; } + if (self->lexer.token_end_position.bytes < self->lexer.token_start_position.bytes) { + self->lexer.token_start_position = self->lexer.token_end_position; + } + Length padding = length_sub(self->lexer.token_start_position, start_position); Length size = length_sub(self->lexer.token_end_position, self->lexer.token_start_position); result = ts_tree_make_leaf(symbol, padding, size, self->language); diff --git a/test/fixtures/test_grammars/inverted_external_token/corpus.txt b/test/fixtures/test_grammars/inverted_external_token/corpus.txt new file mode 100644 index 00000000..464ddfb4 --- /dev/null +++ b/test/fixtures/test_grammars/inverted_external_token/corpus.txt @@ -0,0 +1,32 @@ +======================== +Expressions on one line +========================= + +a +b + .c +d + .e + .f + +--- + +(program + (statement (identifier) (line_break)) + (statement (member_expression (identifier) (identifier)) (line_break)) + (statement (member_expression (member_expression (identifier) (identifier)) (identifier)) (line_break))) + +===================================== +Line breaks followed by whitespace +===================================== + +a + b + c + +--- + +(program + (statement (identifier) (line_break)) + (statement (identifier) (line_break)) + (statement (identifier) (line_break))) diff --git a/test/fixtures/test_grammars/inverted_external_token/grammar.json b/test/fixtures/test_grammars/inverted_external_token/grammar.json new file mode 100644 index 00000000..dc837948 --- /dev/null +++ b/test/fixtures/test_grammars/inverted_external_token/grammar.json @@ -0,0 +1,55 @@ +{ + "name": "inverted_external_token", + + "externals": [ + {"type": "SYMBOL", "name": 
"line_break"} + ], + + "extras": [ + {"type": "PATTERN", "value": "\\s"} + ], + + "rules": { + "program": { + "type": "REPEAT", + "content": { + "type": "SYMBOL", + "name": "statement" + } + }, + + "statement": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "_expression"}, + {"type": "SYMBOL", "name": "line_break"} + ] + }, + + "_expression": { + "type": "CHOICE", + "members": [ + {"type": "SYMBOL", "name": "identifier"}, + {"type": "SYMBOL", "name": "member_expression"} + ] + }, + + "member_expression": { + "type": "PREC_LEFT", + "value": 0, + "content": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "_expression"}, + {"type": "STRING", "value": "."}, + {"type": "SYMBOL", "name": "identifier"} + ] + } + }, + + "identifier": { + "type": "PATTERN", + "value": "\\a+" + } + } +} diff --git a/test/fixtures/test_grammars/inverted_external_token/readme.md b/test/fixtures/test_grammars/inverted_external_token/readme.md new file mode 100644 index 00000000..f555dbc2 --- /dev/null +++ b/test/fixtures/test_grammars/inverted_external_token/readme.md @@ -0,0 +1 @@ +This language has an external scanner that calls `lexer->advance(lexer, true)` (in order to skip whitespace) *after* having called `lexer->mark_end(lexer)`. This tests an edge case in the parser's handling of token start and end positions. 
diff --git a/test/fixtures/test_grammars/inverted_external_token/scanner.c b/test/fixtures/test_grammars/inverted_external_token/scanner.c
new file mode 100644
index 00000000..c2cd6ce9
--- /dev/null
+++ b/test/fixtures/test_grammars/inverted_external_token/scanner.c
@@ -0,0 +1,63 @@
+#include <tree_sitter/parser.h>
+
+// External token indices; LINE_BREAK is the only external token this
+// scanner produces.
+enum {
+  LINE_BREAK
+};
+
+// The scanner is stateless, so there is no payload to allocate.
+void *tree_sitter_inverted_external_token_external_scanner_create(void) { return NULL; }
+
+// Nothing to free: create() never allocates.
+void tree_sitter_inverted_external_token_external_scanner_destroy(void *payload) {}
+
+// Nothing to reset: the scanner holds no state.
+void tree_sitter_inverted_external_token_external_scanner_reset(void *payload) {}
+
+// Returns the number of bytes written to buffer. Nothing is written because
+// the scanner holds no state, so the count is 0.
+unsigned tree_sitter_inverted_external_token_external_scanner_serialize(
+  void *payload,
+  char *buffer
+) { return 0; }
+
+// No state to restore.
+void tree_sitter_inverted_external_token_external_scanner_deserialize(
+  void *payload,
+  const char *buffer,
+  unsigned length
+) {}
+
+// Scans for a LINE_BREAK token: optional spaces, one newline, then any run of
+// spaces and newlines, provided the next character is not '.'. A '.' there
+// continues the previous expression as a member access, so no token is
+// produced and false is returned.
+bool tree_sitter_inverted_external_token_external_scanner_scan(
+  void *payload, TSLexer *lexer, const bool *whitelist) {
+  // Skip spaces in front of the potential line break.
+  while (lexer->lookahead == ' ') {
+    lexer->advance(lexer, true);
+  }
+
+  if (lexer->lookahead == '\n') {
+    lexer->advance(lexer, false);
+
+    // Mark the end of the line break token.
+    lexer->mark_end(lexer);
+
+    // Skip whitespace *after* having marked the end. This leaves the marked
+    // token end before the position where skipping stopped, which is the
+    // parser edge case this fixture exists to exercise.
+    while (lexer->lookahead == ' ' || lexer->lookahead == '\n') {
+      lexer->advance(lexer, true);
+    }
+
+    if (lexer->lookahead != '.') {
+      lexer->result_symbol = LINE_BREAK;
+      return true;
+    }
+  }
+
+  return false;
+}