From e29d3714f7ee821bb717ad4222bf5280ec7a67a9 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 11 Mar 2021 11:25:10 -0800 Subject: [PATCH] Fix behavior of Lexer.get_column when at EOF --- lib/src/lexer.c | 115 +++++++-------- .../uses_current_column/corpus.txt | 76 ++++++++++ .../uses_current_column/grammar.json | 69 +++++++++ .../uses_current_column/scanner.c | 133 ++++++++++++++++++ 4 files changed, 337 insertions(+), 56 deletions(-) create mode 100644 test/fixtures/test_grammars/uses_current_column/corpus.txt create mode 100644 test/fixtures/test_grammars/uses_current_column/grammar.json create mode 100644 test/fixtures/test_grammars/uses_current_column/scanner.c diff --git a/lib/src/lexer.c b/lib/src/lexer.c index 08e90a8c..f349d76f 100644 --- a/lib/src/lexer.c +++ b/lib/src/lexer.c @@ -102,6 +102,56 @@ static void ts_lexer__get_lookahead(Lexer *self) { } } +static void ts_lexer_goto(Lexer *self, Length position) { + self->current_position = position; + bool found_included_range = false; + + // Move to the first valid position at or after the given position. + for (unsigned i = 0; i < self->included_range_count; i++) { + TSRange *included_range = &self->included_ranges[i]; + if (included_range->end_byte > position.bytes) { + if (included_range->start_byte > position.bytes) { + self->current_position = (Length) { + .bytes = included_range->start_byte, + .extent = included_range->start_point, + }; + } + + self->current_included_range_index = i; + found_included_range = true; + break; + } + } + + if (found_included_range) { + // If the current position is outside of the current chunk of text, + // then clear out the current chunk of text. 
+ if (self->chunk && ( + position.bytes < self->chunk_start || + position.bytes >= self->chunk_start + self->chunk_size + )) { + ts_lexer__clear_chunk(self); + } + + self->lookahead_size = 0; + self->data.lookahead = '\0'; + } + + // If the given position is beyond any of included ranges, move to the EOF + // state - past the end of the included ranges. + else { + self->current_included_range_index = self->included_range_count; + TSRange *last_included_range = &self->included_ranges[self->included_range_count - 1]; + self->current_position = (Length) { + .bytes = last_included_range->end_byte, + .extent = last_included_range->end_point, + }; + ts_lexer__clear_chunk(self); + self->lookahead_size = 1; + self->data.lookahead = '\0'; + } +} + // Advance to the next character in the source code, retrieving a new // chunk of source code if needed. static void ts_lexer__advance(TSLexer *_self, bool skip) { @@ -185,12 +235,15 @@ static uint32_t ts_lexer__get_column(TSLexer *_self) { Lexer *self = (Lexer *)_self; uint32_t goal_byte = self->current_position.bytes; - self->current_position.bytes -= self->current_position.extent.column; - self->current_position.extent.column = 0; - - if (self->current_position.bytes < self->chunk_start) { - ts_lexer__get_chunk(self); - } + ts_lexer_goto(self, (Length) { + .bytes = self->current_position.bytes - self->current_position.extent.column, + .extent = { + .row = self->current_position.extent.row, + .column = 0, + } + }); + if (!self->chunk_size) ts_lexer__get_chunk(self); + if (!self->lookahead_size) ts_lexer__get_lookahead(self); uint32_t result = 0; while (self->current_position.bytes < goal_byte) { @@ -247,56 +300,6 @@ void ts_lexer_delete(Lexer *self) { ts_free(self->included_ranges); } -static void ts_lexer_goto(Lexer *self, Length position) { - self->current_position = position; - bool found_included_range = false; - - // Move to the first valid position at or after the given position. 
- for (unsigned i = 0; i < self->included_range_count; i++) { - TSRange *included_range = &self->included_ranges[i]; - if (included_range->end_byte > position.bytes) { - if (included_range->start_byte > position.bytes) { - self->current_position = (Length) { - .bytes = included_range->start_byte, - .extent = included_range->start_point, - }; - } - - self->current_included_range_index = i; - found_included_range = true; - break; - } - } - - if (found_included_range) { - // If the current position is outside of the current chunk of text, - // then clear out the current chunk of text. - if (self->chunk && ( - position.bytes < self->chunk_start || - position.bytes >= self->chunk_start + self->chunk_size - )) { - ts_lexer__clear_chunk(self); - } - - self->lookahead_size = 0; - self->data.lookahead = '\0'; - } - - // If the given position is beyond any of included ranges, move to the EOF - // state - past the end of the included ranges. - else { - self->current_included_range_index = self->included_range_count; - TSRange *last_included_range = &self->included_ranges[self->included_range_count - 1]; - self->current_position = (Length) { - .bytes = last_included_range->end_byte, - .extent = last_included_range->end_point, - }; - ts_lexer__clear_chunk(self); - self->lookahead_size = 1; - self->data.lookahead = '\0'; - } -} - void ts_lexer_set_input(Lexer *self, TSInput input) { self->input = input; ts_lexer__clear_chunk(self); diff --git a/test/fixtures/test_grammars/uses_current_column/corpus.txt b/test/fixtures/test_grammars/uses_current_column/corpus.txt new file mode 100644 index 00000000..9638e25e --- /dev/null +++ b/test/fixtures/test_grammars/uses_current_column/corpus.txt @@ -0,0 +1,76 @@ +=============== +Simple blocks +=============== + +do a + e +f + +--- + +(block + (do_expression (block + (identifier) + (identifier))) + (identifier)) + +===================== +Nested blocks +===================== + +a = do b + c + do e + f + g + h +i + +--- + +(block + 
(binary_expression + (identifier) + (do_expression (block + (identifier) + (binary_expression + (identifier) + (do_expression (block + (identifier) + (identifier) + (identifier)))) + (identifier)))) + (identifier)) + +=============================== +Blocks with leading newlines +=============================== + +do + + + a = b + do + c + d + e + f + +--- + +(block + (do_expression (block + (binary_expression (identifier) (identifier)) + (do_expression (block + (identifier) + (identifier))) + (identifier) + (identifier)))) + +===================== +Unterminated blocks +===================== + +do +--- + +(ERROR) diff --git a/test/fixtures/test_grammars/uses_current_column/grammar.json b/test/fixtures/test_grammars/uses_current_column/grammar.json new file mode 100644 index 00000000..90c740b6 --- /dev/null +++ b/test/fixtures/test_grammars/uses_current_column/grammar.json @@ -0,0 +1,69 @@ +{ + "name": "uses_current_column", + + "externals": [ + {"type": "SYMBOL", "name": "_indent"}, + {"type": "SYMBOL", "name": "_dedent"}, + {"type": "SYMBOL", "name": "_newline"} + ], + + "extras": [ + {"type": "PATTERN", "value": "\\s"} + ], + + "rules": { + "block": { + "type": "REPEAT1", + "content": {"type": "SYMBOL", "name": "_statement"} + }, + + "_statement": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "_expression"}, + {"type": "SYMBOL", "name": "_newline"} + ] + }, + + "_expression": { + "type": "CHOICE", + "members": [ + {"type": "SYMBOL", "name": "do_expression"}, + {"type": "SYMBOL", "name": "binary_expression"}, + {"type": "SYMBOL", "name": "identifier"} + ] + }, + + "do_expression": { + "type": "SEQ", + "members": [ + {"type": "STRING", "value": "do"}, + {"type": "SYMBOL", "name": "_indent"}, + {"type": "SYMBOL", "name": "block"}, + {"type": "SYMBOL", "name": "_dedent"} + ] + }, + + "binary_expression": { + "type": "PREC_LEFT", + "value": 1, + "content": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "_expression"}, + { + "type": 
"CHOICE", + "members": [ + {"type": "STRING", "value": "="}, + {"type": "STRING", "value": "+"}, + {"type": "STRING", "value": "-"} + ] + }, + {"type": "SYMBOL", "name": "_expression"} + ] + } + }, + + "identifier": {"type": "PATTERN", "value": "\\w+"} + } +} diff --git a/test/fixtures/test_grammars/uses_current_column/scanner.c b/test/fixtures/test_grammars/uses_current_column/scanner.c new file mode 100644 index 00000000..efd27f9f --- /dev/null +++ b/test/fixtures/test_grammars/uses_current_column/scanner.c @@ -0,0 +1,133 @@ +#include <stdlib.h> +#include <wctype.h> +#include <tree_sitter/parser.h> + +enum TokenType { + INDENT, + DEDENT, + NEWLINE, +}; + +typedef struct { + uint8_t queued_dedent_count; + uint8_t indent_count; + int8_t indents[32]; +} Scanner; + +void *tree_sitter_uses_current_column_external_scanner_create() { + Scanner *self = malloc(sizeof(Scanner)); + self->queued_dedent_count = 0; + self->indent_count = 1; + self->indents[0] = 0; + return (void *)self; +} + +void tree_sitter_uses_current_column_external_scanner_destroy(void *payload) { + free(payload); +} + +unsigned tree_sitter_uses_current_column_external_scanner_serialize( + void *payload, + char *buffer +) { + Scanner *self = (Scanner *)payload; + buffer[0] = self->queued_dedent_count; + for (unsigned i = 0; i < self->indent_count; i++) { + buffer[i + 1] = self->indents[i]; + } + return self->indent_count + 1; +} + +void tree_sitter_uses_current_column_external_scanner_deserialize( + void *payload, + const char *buffer, + unsigned length +) { + Scanner *self = (Scanner *)payload; + if (length > 0) { + self->queued_dedent_count = buffer[0]; + self->indent_count = length - 1; + for (unsigned i = 0; i < self->indent_count; i++) { + self->indents[i] = buffer[i + 1]; + } + } else { + self->queued_dedent_count = 0; + self->indent_count = 1; + self->indents[0] = 0; + } +} + +bool tree_sitter_uses_current_column_external_scanner_scan( + void *payload, + TSLexer *lexer, + const bool *valid_symbols +) { + Scanner *self = (Scanner *)payload; +
lexer->mark_end(lexer); + + // If dedents were found in a previous run, and are valid now, + // then return a dedent. + if (self->queued_dedent_count > 0 && valid_symbols[DEDENT]) { + lexer->result_symbol = DEDENT; + self->queued_dedent_count--; + return true; + } + + // If an indent is valid, then add an entry to the indent stack + // for the current column, and return an indent. + if (valid_symbols[INDENT]) { + while (iswspace(lexer->lookahead)) { + lexer->advance(lexer, false); + } + uint32_t column = lexer->get_column(lexer); + if (column > self->indents[self->indent_count - 1]) { + self->indents[self->indent_count++] = column - 2; + lexer->result_symbol = INDENT; + return true; + } else { + return false; + } + } + + // If at the end of a statement, then get the current indent + // level and pop some number of entries off of the indent stack. + if (valid_symbols[NEWLINE] || valid_symbols[DEDENT]) { + while (lexer->lookahead == ' ') { + lexer->advance(lexer, false); + } + + if (lexer->lookahead == '\n') { + lexer->advance(lexer, false); + + uint32_t next_column = 0; + for (;;) { + if (lexer->lookahead == ' ') { + next_column++; + lexer->advance(lexer, false); + } else if (lexer->lookahead == '\n') { + next_column = 0; + lexer->advance(lexer, false); + } else { + break; + } + } + + unsigned dedent_count = 0; + while (next_column < self->indents[self->indent_count - 1]) { + dedent_count++; + self->indent_count--; + } + + if (dedent_count > 0 && valid_symbols[DEDENT]) { + lexer->result_symbol = DEDENT; + return true; + } else if (valid_symbols[NEWLINE]) { + self->queued_dedent_count += dedent_count; + lexer->result_symbol = NEWLINE; + return true; + } + } + } + + return false; +}