Fix behavior of Lexer.get_column when at EOF
This commit is contained in:
parent
57036b4f8a
commit
e29d3714f7
4 changed files with 337 additions and 56 deletions
76
test/fixtures/test_grammars/uses_current_column/corpus.txt
vendored
Normal file
76
test/fixtures/test_grammars/uses_current_column/corpus.txt
vendored
Normal file
|
|
@ -0,0 +1,76 @@
|
|||
===============
|
||||
Simple blocks
|
||||
===============
|
||||
|
||||
do a
|
||||
e
|
||||
f
|
||||
|
||||
---
|
||||
|
||||
(block
|
||||
(do_expression (block
|
||||
(identifier)
|
||||
(identifier)))
|
||||
(identifier))
|
||||
|
||||
=====================
|
||||
Nested blocks
|
||||
=====================
|
||||
|
||||
a = do b
|
||||
c + do e
|
||||
f
|
||||
g
|
||||
h
|
||||
i
|
||||
|
||||
---
|
||||
|
||||
(block
|
||||
(binary_expression
|
||||
(identifier)
|
||||
(do_expression (block
|
||||
(identifier)
|
||||
(binary_expression
|
||||
(identifier)
|
||||
(do_expression (block
|
||||
(identifier)
|
||||
(identifier)
|
||||
(identifier))))
|
||||
(identifier))))
|
||||
(identifier))
|
||||
|
||||
===============================
|
||||
Blocks with leading newlines
|
||||
===============================
|
||||
|
||||
do
|
||||
|
||||
|
||||
a = b
|
||||
do
|
||||
c
|
||||
d
|
||||
e
|
||||
f
|
||||
|
||||
---
|
||||
|
||||
(block
|
||||
(do_expression (block
|
||||
(binary_expression (identifier) (identifier))
|
||||
(do_expression (block
|
||||
(identifier)
|
||||
(identifier)))
|
||||
(identifier)
|
||||
(identifier))))
|
||||
|
||||
=====================
|
||||
Unterminated blocks
|
||||
=====================
|
||||
|
||||
do
|
||||
---
|
||||
|
||||
(ERROR)
|
||||
69
test/fixtures/test_grammars/uses_current_column/grammar.json
vendored
Normal file
69
test/fixtures/test_grammars/uses_current_column/grammar.json
vendored
Normal file
|
|
@ -0,0 +1,69 @@
|
|||
{
|
||||
"name": "uses_current_column",
|
||||
|
||||
"externals": [
|
||||
{"type": "SYMBOL", "name": "_indent"},
|
||||
{"type": "SYMBOL", "name": "_dedent"},
|
||||
{"type": "SYMBOL", "name": "_newline"}
|
||||
],
|
||||
|
||||
"extras": [
|
||||
{"type": "PATTERN", "value": "\\s"}
|
||||
],
|
||||
|
||||
"rules": {
|
||||
"block": {
|
||||
"type": "REPEAT1",
|
||||
"content": {"type": "SYMBOL", "name": "_statement"}
|
||||
},
|
||||
|
||||
"_statement": {
|
||||
"type": "SEQ",
|
||||
"members": [
|
||||
{"type": "SYMBOL", "name": "_expression"},
|
||||
{"type": "SYMBOL", "name": "_newline"}
|
||||
]
|
||||
},
|
||||
|
||||
"_expression": {
|
||||
"type": "CHOICE",
|
||||
"members": [
|
||||
{"type": "SYMBOL", "name": "do_expression"},
|
||||
{"type": "SYMBOL", "name": "binary_expression"},
|
||||
{"type": "SYMBOL", "name": "identifier"}
|
||||
]
|
||||
},
|
||||
|
||||
"do_expression": {
|
||||
"type": "SEQ",
|
||||
"members": [
|
||||
{"type": "STRING", "value": "do"},
|
||||
{"type": "SYMBOL", "name": "_indent"},
|
||||
{"type": "SYMBOL", "name": "block"},
|
||||
{"type": "SYMBOL", "name": "_dedent"}
|
||||
]
|
||||
},
|
||||
|
||||
"binary_expression": {
|
||||
"type": "PREC_LEFT",
|
||||
"value": 1,
|
||||
"content": {
|
||||
"type": "SEQ",
|
||||
"members": [
|
||||
{"type": "SYMBOL", "name": "_expression"},
|
||||
{
|
||||
"type": "CHOICE",
|
||||
"members": [
|
||||
{"type": "STRING", "value": "="},
|
||||
{"type": "STRING", "value": "+"},
|
||||
{"type": "STRING", "value": "-"}
|
||||
]
|
||||
},
|
||||
{"type": "SYMBOL", "name": "_expression"}
|
||||
]
|
||||
}
|
||||
},
|
||||
|
||||
"identifier": {"type": "PATTERN", "value": "\\w+"}
|
||||
}
|
||||
}
|
||||
133
test/fixtures/test_grammars/uses_current_column/scanner.c
vendored
Normal file
133
test/fixtures/test_grammars/uses_current_column/scanner.c
vendored
Normal file
|
|
@ -0,0 +1,133 @@
|
|||
#include <stdlib.h>
|
||||
#include <wctype.h>
|
||||
#include <tree_sitter/parser.h>
|
||||
|
||||
// External tokens produced by this scanner. The enumerators are listed in
// the same order as the "externals" entries of the grammar (_indent,
// _dedent, _newline) and are used as indices into `valid_symbols`.
enum TokenType {
  INDENT = 0,
  DEDENT = 1,
  NEWLINE = 2,
};
|
||||
|
||||
// Persistent state of the external scanner.
typedef struct Scanner {
  // Number of DEDENT tokens already detected but not yet emitted.
  uint8_t queued_dedent_count;
  // Number of live entries in `indents`.
  uint8_t indent_count;
  // Stack of recorded indentation columns; entry 0 is the base level.
  int8_t indents[32];
} Scanner;
|
||||
|
||||
void *tree_sitter_uses_current_column_external_scanner_create() {
|
||||
Scanner *self = malloc(sizeof(Scanner));
|
||||
self->queued_dedent_count = 0;
|
||||
self->indent_count = 1;
|
||||
self->indents[0] = 0;
|
||||
return (void *)self;
|
||||
}
|
||||
|
||||
/*
 * Release a scanner previously returned by the create function.
 * The payload is handed straight to free().
 */
void tree_sitter_uses_current_column_external_scanner_destroy(void *payload) {
  void *scanner = payload;
  free(scanner);
}
|
||||
|
||||
unsigned tree_sitter_uses_current_column_external_scanner_serialize(
|
||||
void *payload,
|
||||
char *buffer
|
||||
) {
|
||||
Scanner *self = (Scanner *)payload;
|
||||
buffer[0] = self->queued_dedent_count;
|
||||
for (unsigned i = 0; i < self->indent_count; i++) {
|
||||
buffer[i + 1] = self->indents[i];
|
||||
}
|
||||
return self->indent_count + 1;
|
||||
}
|
||||
|
||||
void tree_sitter_uses_current_column_external_scanner_deserialize(
|
||||
void *payload,
|
||||
const char *buffer,
|
||||
unsigned length
|
||||
) {
|
||||
Scanner *self = (Scanner *)payload;
|
||||
if (length > 0) {
|
||||
self->queued_dedent_count = buffer[0];
|
||||
self->indent_count = length - 1;
|
||||
for (unsigned i = 0; i < self->indent_count; i++) {
|
||||
self->indents[i] = buffer[i + 1];
|
||||
}
|
||||
} else {
|
||||
self->queued_dedent_count = 0;
|
||||
self->indent_count = 1;
|
||||
self->indents[0] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
bool tree_sitter_uses_current_column_external_scanner_scan(
|
||||
void *payload,
|
||||
TSLexer *lexer,
|
||||
const bool *valid_symbols
|
||||
) {
|
||||
Scanner *self = (Scanner *)payload;
|
||||
lexer->mark_end(lexer);
|
||||
|
||||
// If dedents were found in a previous run, and are valid now,
|
||||
// then return a dedent.
|
||||
if (self->queued_dedent_count > 0 && valid_symbols[DEDENT]) {
|
||||
lexer->result_symbol = DEDENT;
|
||||
self->queued_dedent_count--;
|
||||
return true;
|
||||
}
|
||||
|
||||
// If an indent is valid, then add an entry to the indent stack
|
||||
// for the current column, and return an indent.
|
||||
if (valid_symbols[INDENT]) {
|
||||
while (iswspace(lexer->lookahead)) {
|
||||
lexer->advance(lexer, false);
|
||||
}
|
||||
uint32_t column = lexer->get_column(lexer);
|
||||
if (column > self->indents[self->indent_count - 1]) {
|
||||
self->indents[self->indent_count++] = column - 2;
|
||||
lexer->result_symbol = INDENT;
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// If at the end of a statement, then get the current indent
|
||||
// level and pop some number of entries off of the indent stack.
|
||||
if (valid_symbols[NEWLINE] || valid_symbols[DEDENT]) {
|
||||
while (lexer->lookahead == ' ') {
|
||||
lexer->advance(lexer, false);
|
||||
}
|
||||
|
||||
if (lexer->lookahead == '\n') {
|
||||
lexer->advance(lexer, false);
|
||||
|
||||
uint32_t next_column = 0;
|
||||
for (;;) {
|
||||
if (lexer->lookahead == ' ') {
|
||||
next_column++;
|
||||
lexer->advance(lexer, false);
|
||||
} else if (lexer->lookahead == '\n') {
|
||||
next_column = 0;
|
||||
lexer->advance(lexer, false);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
unsigned dedent_count = 0;
|
||||
while (next_column < self->indents[self->indent_count - 1]) {
|
||||
dedent_count++;
|
||||
self->indent_count--;
|
||||
}
|
||||
|
||||
if (dedent_count > 0 && valid_symbols[DEDENT]) {
|
||||
lexer->result_symbol = DEDENT;
|
||||
return true;
|
||||
} else if (valid_symbols[NEWLINE]) {
|
||||
self->queued_dedent_count += dedent_count;
|
||||
lexer->result_symbol = NEWLINE;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue