Avoid creating external tokens that start after they end

This commit is contained in:
Max Brunsfeld 2017-12-07 11:48:31 -08:00
parent 90629bd45a
commit fbcefe25f7
5 changed files with 140 additions and 0 deletions

View file

@ -418,6 +418,10 @@ static Tree *parser__lex(Parser *self, StackVersion version, TSStateId parse_sta
symbol = self->language->external_scanner.symbol_map[symbol];
}
if (self->lexer.token_end_position.bytes < self->lexer.token_start_position.bytes) {
self->lexer.token_start_position = self->lexer.token_end_position;
}
Length padding = length_sub(self->lexer.token_start_position, start_position);
Length size = length_sub(self->lexer.token_end_position, self->lexer.token_start_position);
result = ts_tree_make_leaf(symbol, padding, size, self->language);

View file

@ -0,0 +1,32 @@
========================
Expressions on multiple lines
=========================
a
b
.c
d
.e
.f
---
(program
(statement (identifier) (line_break))
(statement (member_expression (identifier) (identifier)) (line_break))
(statement (member_expression (member_expression (identifier) (identifier)) (identifier)) (line_break)))
=====================================
Line breaks followed by whitespace
=====================================
a
b
c
---
(program
(statement (identifier) (line_break))
(statement (identifier) (line_break))
(statement (identifier) (line_break)))

View file

@ -0,0 +1,55 @@
{
"name": "inverted_external_token",
"externals": [
{"type": "SYMBOL", "name": "line_break"}
],
"extras": [
{"type": "PATTERN", "value": "\\s"}
],
"rules": {
"program": {
"type": "REPEAT",
"content": {
"type": "SYMBOL",
"name": "statement"
}
},
"statement": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "_expression"},
{"type": "SYMBOL", "name": "line_break"}
]
},
"_expression": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "identifier"},
{"type": "SYMBOL", "name": "member_expression"}
]
},
"member_expression": {
"type": "PREC_LEFT",
"value": 0,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "_expression"},
{"type": "STRING", "value": "."},
{"type": "SYMBOL", "name": "identifier"}
]
}
},
"identifier": {
"type": "PATTERN",
"value": "\\a+"
}
}
}

View file

@ -0,0 +1 @@
This language has an external scanner that calls `lexer->advance(lexer, true)` (in order to skip whitespace) *after* having called `lexer->mark_end(lexer)`. This tests an edge case in the parser's handling of token start and end positions.

View file

@ -0,0 +1,48 @@
#include <tree_sitter/parser.h>
/* External token identifiers that the scanner stores in lexer->result_symbol. */
enum {
  LINE_BREAK = 0,
};
/*
 * Create the scanner payload.
 *
 * This scanner keeps no state, so no payload is allocated and NULL is returned.
 * The parameter list is spelled `(void)` so the definition is a real prototype
 * rather than an old-style declarator that accepts any arguments.
 */
void *tree_sitter_inverted_external_token_external_scanner_create(void) {
  return NULL;
}
/*
 * Destroy the scanner payload.
 *
 * Intentionally a no-op: the payload is never read or freed here.
 */
void tree_sitter_inverted_external_token_external_scanner_destroy(void *payload) {
  (void)payload;
}
/*
 * Reset the scanner payload.
 *
 * Intentionally a no-op: the payload is never read or modified here.
 */
void tree_sitter_inverted_external_token_external_scanner_reset(void *payload) {
  (void)payload;
}
/*
 * Serialize the scanner state into `buffer`.
 *
 * The return value is the number of bytes written to `buffer`. This function
 * writes nothing, so it must report 0; returning `true` (1) would claim that
 * one byte of `buffer` holds valid state even though it was never initialized.
 *
 * payload: scanner payload (unused).
 * buffer:  destination for serialized state (left untouched).
 * Returns: 0, the number of bytes written.
 */
unsigned tree_sitter_inverted_external_token_external_scanner_serialize(
  void *payload,
  char *buffer
) {
  (void)payload;
  (void)buffer;
  return 0;
}
/*
 * Restore the scanner state from `buffer`.
 *
 * Intentionally a no-op: none of the arguments are read.
 *
 * payload: scanner payload (unused).
 * buffer:  serialized state (ignored).
 * length:  number of bytes in `buffer` (ignored).
 */
void tree_sitter_inverted_external_token_external_scanner_deserialize(
  void *payload,
  const char *buffer,
  unsigned length
) {
  (void)payload;
  (void)buffer;
  (void)length;
}
/*
 * Try to scan a LINE_BREAK token.
 *
 * Leading spaces are skipped. If the next character is a newline, it is
 * consumed as part of the token and the token end is marked immediately after
 * it. Any following spaces and newlines are then skipped *after* the end has
 * been marked -- this ordering is deliberate and must be kept. The line break
 * is reported only when the character after that whitespace is not '.'.
 *
 * payload:   scanner payload (unused).
 * lexer:     lexer whose lookahead is inspected and advanced.
 * whitelist: not consulted by this scanner.
 * Returns:   true with lexer->result_symbol set to LINE_BREAK on a match,
 *            false otherwise.
 */
bool tree_sitter_inverted_external_token_external_scanner_scan(
  void *payload, TSLexer *lexer, const bool *whitelist) {
  (void)payload;
  (void)whitelist;

  /* Skip spaces that precede a potential line break. */
  for (; lexer->lookahead == ' ';) {
    lexer->advance(lexer, true);
  }

  /* Without a newline there is no line break token to produce. */
  if (lexer->lookahead != '\n') {
    return false;
  }

  /* Consume the newline and mark the end of the line break token. */
  lexer->advance(lexer, false);
  lexer->mark_end(lexer);

  /* Skip whitespace *after* having marked the end. */
  for (;;) {
    bool is_blank = lexer->lookahead == ' ' || lexer->lookahead == '\n';
    if (!is_blank) {
      break;
    }
    lexer->advance(lexer, true);
  }

  /* A '.' after the whitespace means no line break is reported. */
  if (lexer->lookahead == '.') {
    return false;
  }

  lexer->result_symbol = LINE_BREAK;
  return true;
}