Fix behavior of Lexer.get_column when at EOF

This commit is contained in:
Max Brunsfeld 2021-03-11 11:25:10 -08:00
parent 57036b4f8a
commit e29d3714f7
4 changed files with 337 additions and 56 deletions

View file

@ -102,6 +102,56 @@ static void ts_lexer__get_lookahead(Lexer *self) {
}
}
// Move the lexer to the given position.
//
// If the position falls in a gap before an included range, the lexer is moved
// forward to the start of that range. If the position is at or past the end
// of every included range, the lexer is put into the EOF state instead.
// In the non-EOF case the cached lookahead is discarded (lookahead_size = 0).
static void ts_lexer_goto(Lexer *self, Length position) {
  self->current_position = position;

  // Move to the first valid position at or after the given position.
  bool found_included_range = false;
  for (unsigned i = 0; i < self->included_range_count; i++) {
    TSRange *included_range = &self->included_ranges[i];
    if (included_range->end_byte > position.bytes) {
      if (included_range->start_byte > position.bytes) {
        self->current_position = (Length) {
          .bytes = included_range->start_byte,
          .extent = included_range->start_point,
        };
      }
      self->current_included_range_index = i;
      found_included_range = true;
      break;
    }
  }

  if (found_included_range) {
    // If the current position is outside of the current chunk of text,
    // then clear out the current chunk of text. The check uses the adjusted
    // position rather than the requested one: when the position was moved
    // forward to the start of an included range, the requested byte may
    // still lie inside the chunk even though the real position does not.
    if (self->chunk && (
      self->current_position.bytes < self->chunk_start ||
      self->current_position.bytes >= self->chunk_start + self->chunk_size
    )) {
      ts_lexer__clear_chunk(self);
    }

    self->lookahead_size = 0;
    self->data.lookahead = '\0';
  }

  // If the given position is beyond all of the included ranges, move to the
  // EOF state - past the end of the included ranges.
  // NOTE(review): this indexes included_ranges[included_range_count - 1], so
  // it assumes there is at least one included range - confirm with callers.
  else {
    self->current_included_range_index = self->included_range_count;
    TSRange *last_included_range = &self->included_ranges[self->included_range_count - 1];
    self->current_position = (Length) {
      .bytes = last_included_range->end_byte,
      .extent = last_included_range->end_point,
    };
    ts_lexer__clear_chunk(self);
    self->lookahead_size = 1;
    self->data.lookahead = '\0';
  }
}
// Advance to the next character in the source code, retrieving a new
// chunk of source code if needed.
static void ts_lexer__advance(TSLexer *_self, bool skip) {
@ -185,12 +235,15 @@ static uint32_t ts_lexer__get_column(TSLexer *_self) {
Lexer *self = (Lexer *)_self;
uint32_t goal_byte = self->current_position.bytes;
self->current_position.bytes -= self->current_position.extent.column;
self->current_position.extent.column = 0;
if (self->current_position.bytes < self->chunk_start) {
ts_lexer__get_chunk(self);
}
ts_lexer_goto(self, (Length) {
.bytes = self->current_position.bytes - self->current_position.extent.column,
.extent = {
.row = self->current_position.extent.row,
.column = 0,
}
});
if (!self->chunk_size) ts_lexer__get_chunk(self);
if (!self->lookahead_size) ts_lexer__get_lookahead(self);
uint32_t result = 0;
while (self->current_position.bytes < goal_byte) {
@ -247,56 +300,6 @@ void ts_lexer_delete(Lexer *self) {
ts_free(self->included_ranges);
}
// Move the lexer to the given position.
//
// If the position falls in a gap before an included range, the lexer is moved
// forward to the start of that range. If the position is at or past the end
// of every included range, the lexer is put into the EOF state instead.
// In the non-EOF case the cached lookahead is discarded (lookahead_size = 0).
static void ts_lexer_goto(Lexer *self, Length position) {
  self->current_position = position;

  // Move to the first valid position at or after the given position.
  bool found_included_range = false;
  for (unsigned i = 0; i < self->included_range_count; i++) {
    TSRange *included_range = &self->included_ranges[i];
    if (included_range->end_byte > position.bytes) {
      if (included_range->start_byte > position.bytes) {
        self->current_position = (Length) {
          .bytes = included_range->start_byte,
          .extent = included_range->start_point,
        };
      }
      self->current_included_range_index = i;
      found_included_range = true;
      break;
    }
  }

  if (found_included_range) {
    // If the current position is outside of the current chunk of text,
    // then clear out the current chunk of text. The check uses the adjusted
    // position rather than the requested one: when the position was moved
    // forward to the start of an included range, the requested byte may
    // still lie inside the chunk even though the real position does not.
    if (self->chunk && (
      self->current_position.bytes < self->chunk_start ||
      self->current_position.bytes >= self->chunk_start + self->chunk_size
    )) {
      ts_lexer__clear_chunk(self);
    }

    self->lookahead_size = 0;
    self->data.lookahead = '\0';
  }

  // If the given position is beyond all of the included ranges, move to the
  // EOF state - past the end of the included ranges.
  // NOTE(review): this indexes included_ranges[included_range_count - 1], so
  // it assumes there is at least one included range - confirm with callers.
  else {
    self->current_included_range_index = self->included_range_count;
    TSRange *last_included_range = &self->included_ranges[self->included_range_count - 1];
    self->current_position = (Length) {
      .bytes = last_included_range->end_byte,
      .extent = last_included_range->end_point,
    };
    ts_lexer__clear_chunk(self);
    self->lookahead_size = 1;
    self->data.lookahead = '\0';
  }
}
void ts_lexer_set_input(Lexer *self, TSInput input) {
self->input = input;
ts_lexer__clear_chunk(self);

View file

@ -0,0 +1,76 @@
===============
Simple blocks
===============
do a
e
f
---
(block
(do_expression (block
(identifier)
(identifier)))
(identifier))
=====================
Nested blocks
=====================
a = do b
c + do e
f
g
h
i
---
(block
(binary_expression
(identifier)
(do_expression (block
(identifier)
(binary_expression
(identifier)
(do_expression (block
(identifier)
(identifier)
(identifier))))
(identifier))))
(identifier))
===============================
Blocks with leading newlines
===============================
do
a = b
do
c
d
e
f
---
(block
(do_expression (block
(binary_expression (identifier) (identifier))
(do_expression (block
(identifier)
(identifier)))
(identifier)
(identifier))))
=====================
Unterminated blocks
=====================
do
---
(ERROR)

View file

@ -0,0 +1,69 @@
{
"name": "uses_current_column",
"externals": [
{"type": "SYMBOL", "name": "_indent"},
{"type": "SYMBOL", "name": "_dedent"},
{"type": "SYMBOL", "name": "_newline"}
],
"extras": [
{"type": "PATTERN", "value": "\\s"}
],
"rules": {
"block": {
"type": "REPEAT1",
"content": {"type": "SYMBOL", "name": "_statement"}
},
"_statement": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "_expression"},
{"type": "SYMBOL", "name": "_newline"}
]
},
"_expression": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "do_expression"},
{"type": "SYMBOL", "name": "binary_expression"},
{"type": "SYMBOL", "name": "identifier"}
]
},
"do_expression": {
"type": "SEQ",
"members": [
{"type": "STRING", "value": "do"},
{"type": "SYMBOL", "name": "_indent"},
{"type": "SYMBOL", "name": "block"},
{"type": "SYMBOL", "name": "_dedent"}
]
},
"binary_expression": {
"type": "PREC_LEFT",
"value": 1,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "_expression"},
{
"type": "CHOICE",
"members": [
{"type": "STRING", "value": "="},
{"type": "STRING", "value": "+"},
{"type": "STRING", "value": "-"}
]
},
{"type": "SYMBOL", "name": "_expression"}
]
}
},
"identifier": {"type": "PATTERN", "value": "\\w+"}
}
}

View file

@ -0,0 +1,133 @@
#include <stdlib.h>
#include <wctype.h>
#include <tree_sitter/parser.h>
// Tokens produced by the external scanner. The values are used as indices
// into the `valid_symbols` array passed to the scan function, and their order
// follows the grammar's `externals` list (_indent, _dedent, _newline).
enum TokenType {
  INDENT = 0,   // start of an indented block
  DEDENT = 1,   // end of an indented block
  NEWLINE = 2,  // end of a statement
};
// State carried by the external scanner between calls to the scan function.
// It is written to and restored from a byte buffer by the serialize and
// deserialize functions below.
typedef struct {
// Number of dedents that were detected in an earlier scan but have not yet
// been returned as DEDENT tokens.
uint8_t queued_dedent_count;
// Number of entries of `indents` currently in use. The create function
// starts it at 1 with a single base entry.
uint8_t indent_count;
// Stack of indentation values, innermost last. indents[0] is initialised
// to 0 by the create function. Fixed capacity of 32 entries.
int8_t indents[32];
} Scanner;
// Allocate a scanner and initialise it with no queued dedents and an indent
// stack holding a single base entry of 0.
//
// Returns the new scanner as an opaque payload pointer, to be released with
// the matching destroy function. Aborts the process if the allocation fails:
// the other scanner functions in this file dereference the payload without a
// NULL check, so returning NULL would only move the crash.
void *tree_sitter_uses_current_column_external_scanner_create(void) {
  Scanner *self = malloc(sizeof *self);
  if (self == NULL) {
    abort();
  }
  self->queued_dedent_count = 0;
  self->indent_count = 1;
  self->indents[0] = 0;
  return (void *)self;
}
// Release a scanner payload.
//
// `payload` is passed straight to free(), so it must be a pointer obtained
// from malloc (as the create function does) or NULL.
void tree_sitter_uses_current_column_external_scanner_destroy(void *payload) {
  free(payload);
}
// Write the scanner state into `buffer`.
//
// Layout: byte 0 holds the queued dedent count, followed by one byte per
// entry of the indent stack, outermost first.
//
// Returns the number of bytes written (indent_count + 1).
// NOTE(review): the capacity of `buffer` is not visible here; the caller is
// assumed to provide at least indent_count + 1 bytes - confirm.
unsigned tree_sitter_uses_current_column_external_scanner_serialize(
  void *payload,
  char *buffer
) {
  const Scanner *scanner = (const Scanner *)payload;
  unsigned written = 0;

  buffer[written++] = scanner->queued_dedent_count;

  // `written` is always one ahead of the indent entry being copied.
  while (written <= scanner->indent_count) {
    buffer[written] = scanner->indents[written - 1];
    written++;
  }

  return written;
}
// Restore the scanner state from a buffer in the layout written by the
// serialize function: byte 0 is the queued dedent count, and each following
// byte is one entry of the indent stack.
//
// An empty buffer (length == 0) resets the scanner to its initial state.
// The number of restored indent entries is clamped to the capacity of
// `indents`, and a buffer carrying no indent entries still leaves the base
// entry of 0 on the stack, so the stack is never empty and never overflows.
void tree_sitter_uses_current_column_external_scanner_deserialize(
  void *payload,
  const char *buffer,
  unsigned length
) {
  Scanner *self = (Scanner *)payload;
  const unsigned capacity = sizeof self->indents / sizeof self->indents[0];

  self->queued_dedent_count = 0;
  self->indent_count = 0;

  if (length > 0) {
    self->queued_dedent_count = buffer[0];

    // Never copy more entries than the indent stack can hold.
    unsigned count = length - 1;
    if (count > capacity) {
      count = capacity;
    }
    for (unsigned i = 0; i < count; i++) {
      self->indents[i] = buffer[i + 1];
    }
    self->indent_count = (uint8_t)count;
  }

  // The scan function reads indents[indent_count - 1], so keep at least the
  // base level on the stack.
  if (self->indent_count == 0) {
    self->indents[0] = 0;
    self->indent_count = 1;
  }
}
// Try to recognise one of the external tokens (INDENT, DEDENT, NEWLINE).
//
// payload:       the Scanner state.
// lexer:         lexer interface used to inspect and advance over the input.
// valid_symbols: indexed by TokenType; true for each token that is currently
//                acceptable.
//
// Returns true with lexer->result_symbol set when a token was recognised,
// false otherwise.
bool tree_sitter_uses_current_column_external_scanner_scan(
  void *payload,
  TSLexer *lexer,
  const bool *valid_symbols
) {
  Scanner *self = (Scanner *)payload;
  const unsigned indent_capacity = sizeof self->indents / sizeof self->indents[0];

  // The token end is marked before anything is consumed.
  // NOTE(review): presumably this makes every token returned here
  // zero-width - confirm against the TSLexer API.
  lexer->mark_end(lexer);

  // If dedents were found in a previous run, and are valid now,
  // then return a dedent.
  if (self->queued_dedent_count > 0 && valid_symbols[DEDENT]) {
    lexer->result_symbol = DEDENT;
    self->queued_dedent_count--;
    return true;
  }

  // If an indent is valid, then add an entry to the indent stack
  // for the current column, and return an indent.
  if (valid_symbols[INDENT]) {
    while (iswspace(lexer->lookahead)) {
      lexer->advance(lexer, false);
    }

    uint32_t column = lexer->get_column(lexer);
    if (column > self->indents[self->indent_count - 1]) {
      // The indent stack has a fixed capacity; refuse to nest deeper
      // rather than write past the end of the array.
      if (self->indent_count >= indent_capacity) {
        return false;
      }
      // NOTE(review): the stored value is the column minus 2, presumably to
      // discount the width of the preceding `do` keyword - confirm.
      self->indents[self->indent_count++] = column - 2;
      lexer->result_symbol = INDENT;
      return true;
    } else {
      return false;
    }
  }

  // If at the end of a statement, then get the current indent
  // level and pop some number of entries off of the indent stack.
  if (valid_symbols[NEWLINE] || valid_symbols[DEDENT]) {
    while (lexer->lookahead == ' ') {
      lexer->advance(lexer, false);
    }

    if (lexer->lookahead == '\n') {
      lexer->advance(lexer, false);

      // Measure the indentation of the next non-blank line; blank lines
      // reset the count.
      uint32_t next_column = 0;
      for (;;) {
        if (lexer->lookahead == ' ') {
          next_column++;
          lexer->advance(lexer, false);
        } else if (lexer->lookahead == '\n') {
          next_column = 0;
          lexer->advance(lexer, false);
        } else {
          break;
        }
      }

      // Pop every level that is deeper than the next line, but always keep
      // the base entry so that indents[indent_count - 1] stays in bounds.
      unsigned dedent_count = 0;
      while (
        self->indent_count > 1 &&
        next_column < self->indents[self->indent_count - 1]
      ) {
        dedent_count++;
        self->indent_count--;
      }

      // NOTE(review): when a DEDENT is returned directly here, any further
      // popped levels (dedent_count - 1) are not queued - confirm intended.
      if (dedent_count > 0 && valid_symbols[DEDENT]) {
        lexer->result_symbol = DEDENT;
        return true;
      } else if (valid_symbols[NEWLINE]) {
        self->queued_dedent_count += dedent_count;
        lexer->result_symbol = NEWLINE;
        return true;
      }
    }
  }

  return false;
}