Fix behavior of Lexer.get_column when at EOF
This commit is contained in:
parent
57036b4f8a
commit
e29d3714f7
4 changed files with 337 additions and 56 deletions
115
lib/src/lexer.c
115
lib/src/lexer.c
|
|
@ -102,6 +102,56 @@ static void ts_lexer__get_lookahead(Lexer *self) {
|
|||
}
|
||||
}
|
||||
|
||||
// Reposition the lexer at `position`, clamping it to the configured
// included ranges; clears the cached chunk whenever the new position
// falls outside of it.
static void ts_lexer_goto(Lexer *self, Length position) {
|
||||
self->current_position = position;
|
||||
bool found_included_range = false;
|
||||
|
||||
// Move to the first valid position at or after the given position.
|
||||
for (unsigned i = 0; i < self->included_range_count; i++) {
|
||||
TSRange *included_range = &self->included_ranges[i];
|
||||
if (included_range->end_byte > position.bytes) {
|
||||
if (included_range->start_byte > position.bytes) {
|
||||
self->current_position = (Length) {
|
||||
.bytes = included_range->start_byte,
|
||||
.extent = included_range->start_point,
|
||||
};
|
||||
}
|
||||
|
||||
self->current_included_range_index = i;
|
||||
found_included_range = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (found_included_range) {
|
||||
// If the current position is outside of the current chunk of text,
|
||||
// then clear out the current chunk of text.
|
||||
if (self->chunk && (
|
||||
position.bytes < self->chunk_start ||
|
||||
position.bytes >= self->chunk_start + self->chunk_size
|
||||
)) {
|
||||
ts_lexer__clear_chunk(self);
|
||||
}
|
||||
|
||||
self->lookahead_size = 0;
|
||||
self->data.lookahead = '\0';
|
||||
}
|
||||
|
||||
// If the given position is beyond any of the included ranges, move to the EOF
|
||||
// state - past the end of the included ranges.
|
||||
else {
|
||||
self->current_included_range_index = self->included_range_count;
|
||||
TSRange *last_included_range = &self->included_ranges[self->included_range_count - 1];
|
||||
self->current_position = (Length) {
|
||||
.bytes = last_included_range->end_byte,
|
||||
.extent = last_included_range->end_point,
|
||||
};
|
||||
ts_lexer__clear_chunk(self);
|
||||
// Report EOF as a single NUL lookahead character.
|
||||
self->lookahead_size = 1;
|
||||
self->data.lookahead = '\0';
|
||||
}
|
||||
}
|
||||
|
||||
// Advance to the next character in the source code, retrieving a new
|
||||
// chunk of source code if needed.
|
||||
static void ts_lexer__advance(TSLexer *_self, bool skip) {
|
||||
|
|
@ -185,12 +235,15 @@ static uint32_t ts_lexer__get_column(TSLexer *_self) {
|
|||
Lexer *self = (Lexer *)_self;
|
||||
uint32_t goal_byte = self->current_position.bytes;
|
||||
|
||||
self->current_position.bytes -= self->current_position.extent.column;
|
||||
self->current_position.extent.column = 0;
|
||||
|
||||
if (self->current_position.bytes < self->chunk_start) {
|
||||
ts_lexer__get_chunk(self);
|
||||
}
|
||||
ts_lexer_goto(self, (Length) {
|
||||
.bytes = self->current_position.bytes - self->current_position.extent.column,
|
||||
.extent = {
|
||||
.row = self->current_position.extent.row,
|
||||
.column = 0,
|
||||
}
|
||||
});
|
||||
if (!self->chunk_size) ts_lexer__get_chunk(self);
|
||||
if (!self->lookahead_size) ts_lexer__get_lookahead(self);
|
||||
|
||||
uint32_t result = 0;
|
||||
while (self->current_position.bytes < goal_byte) {
|
||||
|
|
@ -247,56 +300,6 @@ void ts_lexer_delete(Lexer *self) {
|
|||
ts_free(self->included_ranges);
|
||||
}
|
||||
|
||||
// Reposition the lexer at `position`, clamping it to the configured
// included ranges; clears the cached chunk whenever the new position
// falls outside of it.
static void ts_lexer_goto(Lexer *self, Length position) {
|
||||
self->current_position = position;
|
||||
bool found_included_range = false;
|
||||
|
||||
// Move to the first valid position at or after the given position.
|
||||
for (unsigned i = 0; i < self->included_range_count; i++) {
|
||||
TSRange *included_range = &self->included_ranges[i];
|
||||
if (included_range->end_byte > position.bytes) {
|
||||
if (included_range->start_byte > position.bytes) {
|
||||
self->current_position = (Length) {
|
||||
.bytes = included_range->start_byte,
|
||||
.extent = included_range->start_point,
|
||||
};
|
||||
}
|
||||
|
||||
self->current_included_range_index = i;
|
||||
found_included_range = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (found_included_range) {
|
||||
// If the current position is outside of the current chunk of text,
|
||||
// then clear out the current chunk of text.
|
||||
if (self->chunk && (
|
||||
position.bytes < self->chunk_start ||
|
||||
position.bytes >= self->chunk_start + self->chunk_size
|
||||
)) {
|
||||
ts_lexer__clear_chunk(self);
|
||||
}
|
||||
|
||||
self->lookahead_size = 0;
|
||||
self->data.lookahead = '\0';
|
||||
}
|
||||
|
||||
// If the given position is beyond any of the included ranges, move to the EOF
|
||||
// state - past the end of the included ranges.
|
||||
else {
|
||||
self->current_included_range_index = self->included_range_count;
|
||||
TSRange *last_included_range = &self->included_ranges[self->included_range_count - 1];
|
||||
self->current_position = (Length) {
|
||||
.bytes = last_included_range->end_byte,
|
||||
.extent = last_included_range->end_point,
|
||||
};
|
||||
ts_lexer__clear_chunk(self);
|
||||
// Report EOF as a single NUL lookahead character.
|
||||
self->lookahead_size = 1;
|
||||
self->data.lookahead = '\0';
|
||||
}
|
||||
}
|
||||
|
||||
void ts_lexer_set_input(Lexer *self, TSInput input) {
|
||||
self->input = input;
|
||||
ts_lexer__clear_chunk(self);
|
||||
|
|
|
|||
76
test/fixtures/test_grammars/uses_current_column/corpus.txt
vendored
Normal file
76
test/fixtures/test_grammars/uses_current_column/corpus.txt
vendored
Normal file
|
|
@ -0,0 +1,76 @@
|
|||
===============
|
||||
Simple blocks
|
||||
===============
|
||||
|
||||
do a
|
||||
e
|
||||
f
|
||||
|
||||
---
|
||||
|
||||
(block
|
||||
(do_expression (block
|
||||
(identifier)
|
||||
(identifier)))
|
||||
(identifier))
|
||||
|
||||
=====================
|
||||
Nested blocks
|
||||
=====================
|
||||
|
||||
a = do b
|
||||
c + do e
|
||||
f
|
||||
g
|
||||
h
|
||||
i
|
||||
|
||||
---
|
||||
|
||||
(block
|
||||
(binary_expression
|
||||
(identifier)
|
||||
(do_expression (block
|
||||
(identifier)
|
||||
(binary_expression
|
||||
(identifier)
|
||||
(do_expression (block
|
||||
(identifier)
|
||||
(identifier)
|
||||
(identifier))))
|
||||
(identifier))))
|
||||
(identifier))
|
||||
|
||||
===============================
|
||||
Blocks with leading newlines
|
||||
===============================
|
||||
|
||||
do
|
||||
|
||||
|
||||
a = b
|
||||
do
|
||||
c
|
||||
d
|
||||
e
|
||||
f
|
||||
|
||||
---
|
||||
|
||||
(block
|
||||
(do_expression (block
|
||||
(binary_expression (identifier) (identifier))
|
||||
(do_expression (block
|
||||
(identifier)
|
||||
(identifier)))
|
||||
(identifier)
|
||||
(identifier))))
|
||||
|
||||
=====================
|
||||
Unterminated blocks
|
||||
=====================
|
||||
|
||||
do
|
||||
---
|
||||
|
||||
(ERROR)
|
||||
69
test/fixtures/test_grammars/uses_current_column/grammar.json
vendored
Normal file
69
test/fixtures/test_grammars/uses_current_column/grammar.json
vendored
Normal file
|
|
@ -0,0 +1,69 @@
|
|||
{
|
||||
"name": "uses_current_column",
|
||||
|
||||
"externals": [
|
||||
{"type": "SYMBOL", "name": "_indent"},
|
||||
{"type": "SYMBOL", "name": "_dedent"},
|
||||
{"type": "SYMBOL", "name": "_newline"}
|
||||
],
|
||||
|
||||
"extras": [
|
||||
{"type": "PATTERN", "value": "\\s"}
|
||||
],
|
||||
|
||||
"rules": {
|
||||
"block": {
|
||||
"type": "REPEAT1",
|
||||
"content": {"type": "SYMBOL", "name": "_statement"}
|
||||
},
|
||||
|
||||
"_statement": {
|
||||
"type": "SEQ",
|
||||
"members": [
|
||||
{"type": "SYMBOL", "name": "_expression"},
|
||||
{"type": "SYMBOL", "name": "_newline"}
|
||||
]
|
||||
},
|
||||
|
||||
"_expression": {
|
||||
"type": "CHOICE",
|
||||
"members": [
|
||||
{"type": "SYMBOL", "name": "do_expression"},
|
||||
{"type": "SYMBOL", "name": "binary_expression"},
|
||||
{"type": "SYMBOL", "name": "identifier"}
|
||||
]
|
||||
},
|
||||
|
||||
"do_expression": {
|
||||
"type": "SEQ",
|
||||
"members": [
|
||||
{"type": "STRING", "value": "do"},
|
||||
{"type": "SYMBOL", "name": "_indent"},
|
||||
{"type": "SYMBOL", "name": "block"},
|
||||
{"type": "SYMBOL", "name": "_dedent"}
|
||||
]
|
||||
},
|
||||
|
||||
"binary_expression": {
|
||||
"type": "PREC_LEFT",
|
||||
"value": 1,
|
||||
"content": {
|
||||
"type": "SEQ",
|
||||
"members": [
|
||||
{"type": "SYMBOL", "name": "_expression"},
|
||||
{
|
||||
"type": "CHOICE",
|
||||
"members": [
|
||||
{"type": "STRING", "value": "="},
|
||||
{"type": "STRING", "value": "+"},
|
||||
{"type": "STRING", "value": "-"}
|
||||
]
|
||||
},
|
||||
{"type": "SYMBOL", "name": "_expression"}
|
||||
]
|
||||
}
|
||||
},
|
||||
|
||||
"identifier": {"type": "PATTERN", "value": "\\w+"}
|
||||
}
|
||||
}
|
||||
133
test/fixtures/test_grammars/uses_current_column/scanner.c
vendored
Normal file
133
test/fixtures/test_grammars/uses_current_column/scanner.c
vendored
Normal file
|
|
@ -0,0 +1,133 @@
|
|||
#include <stdlib.h>
|
||||
#include <wctype.h>
|
||||
#include <tree_sitter/parser.h>
|
||||
|
||||
// Token types produced by this external scanner. The order must match the
// "externals" array in grammar.json (_indent, _dedent, _newline).
enum TokenType {
  INDENT,
  DEDENT,
  NEWLINE,
};
|
||||
|
||||
// Persistent scanner state, serialized between parser runs:
// a stack of indentation columns plus a count of dedent tokens that have
// been detected but not yet emitted.
typedef struct {
  uint8_t queued_dedent_count;  // dedents detected but not yet returned
  uint8_t indent_count;         // number of live entries in `indents`
  int8_t indents[32];           // indentation columns, innermost level last
} Scanner;
|
||||
|
||||
void *tree_sitter_uses_current_column_external_scanner_create() {
|
||||
Scanner *self = malloc(sizeof(Scanner));
|
||||
self->queued_dedent_count = 0;
|
||||
self->indent_count = 1;
|
||||
self->indents[0] = 0;
|
||||
return (void *)self;
|
||||
}
|
||||
|
||||
// Release the scanner state allocated by the matching _create function.
void tree_sitter_uses_current_column_external_scanner_destroy(void *payload) {
  free(payload);
}
|
||||
|
||||
unsigned tree_sitter_uses_current_column_external_scanner_serialize(
|
||||
void *payload,
|
||||
char *buffer
|
||||
) {
|
||||
Scanner *self = (Scanner *)payload;
|
||||
buffer[0] = self->queued_dedent_count;
|
||||
for (unsigned i = 0; i < self->indent_count; i++) {
|
||||
buffer[i + 1] = self->indents[i];
|
||||
}
|
||||
return self->indent_count + 1;
|
||||
}
|
||||
|
||||
void tree_sitter_uses_current_column_external_scanner_deserialize(
|
||||
void *payload,
|
||||
const char *buffer,
|
||||
unsigned length
|
||||
) {
|
||||
Scanner *self = (Scanner *)payload;
|
||||
if (length > 0) {
|
||||
self->queued_dedent_count = buffer[0];
|
||||
self->indent_count = length - 1;
|
||||
for (unsigned i = 0; i < self->indent_count; i++) {
|
||||
self->indents[i] = buffer[i + 1];
|
||||
}
|
||||
} else {
|
||||
self->queued_dedent_count = 0;
|
||||
self->indent_count = 1;
|
||||
self->indents[0] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
bool tree_sitter_uses_current_column_external_scanner_scan(
|
||||
void *payload,
|
||||
TSLexer *lexer,
|
||||
const bool *valid_symbols
|
||||
) {
|
||||
Scanner *self = (Scanner *)payload;
|
||||
lexer->mark_end(lexer);
|
||||
|
||||
// If dedents were found in a previous run, and are valid now,
|
||||
// then return a dedent.
|
||||
if (self->queued_dedent_count > 0 && valid_symbols[DEDENT]) {
|
||||
lexer->result_symbol = DEDENT;
|
||||
self->queued_dedent_count--;
|
||||
return true;
|
||||
}
|
||||
|
||||
// If an indent is valid, then add an entry to the indent stack
|
||||
// for the current column, and return an indent.
|
||||
if (valid_symbols[INDENT]) {
|
||||
while (iswspace(lexer->lookahead)) {
|
||||
lexer->advance(lexer, false);
|
||||
}
|
||||
uint32_t column = lexer->get_column(lexer);
|
||||
if (column > self->indents[self->indent_count - 1]) {
|
||||
self->indents[self->indent_count++] = column - 2;
|
||||
lexer->result_symbol = INDENT;
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// If at the end of a statement, then get the current indent
|
||||
// level and pop some number of entries off of the indent stack.
|
||||
if (valid_symbols[NEWLINE] || valid_symbols[DEDENT]) {
|
||||
while (lexer->lookahead == ' ') {
|
||||
lexer->advance(lexer, false);
|
||||
}
|
||||
|
||||
if (lexer->lookahead == '\n') {
|
||||
lexer->advance(lexer, false);
|
||||
|
||||
uint32_t next_column = 0;
|
||||
for (;;) {
|
||||
if (lexer->lookahead == ' ') {
|
||||
next_column++;
|
||||
lexer->advance(lexer, false);
|
||||
} else if (lexer->lookahead == '\n') {
|
||||
next_column = 0;
|
||||
lexer->advance(lexer, false);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
unsigned dedent_count = 0;
|
||||
while (next_column < self->indents[self->indent_count - 1]) {
|
||||
dedent_count++;
|
||||
self->indent_count--;
|
||||
}
|
||||
|
||||
if (dedent_count > 0 && valid_symbols[DEDENT]) {
|
||||
lexer->result_symbol = DEDENT;
|
||||
return true;
|
||||
} else if (valid_symbols[NEWLINE]) {
|
||||
self->queued_dedent_count += dedent_count;
|
||||
lexer->result_symbol = NEWLINE;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue