Merge pull request #978 from tree-sitter/fix-get-column-at-eof

Fix the behavior of Lexer.get_column
This commit is contained in:
Max Brunsfeld 2021-03-11 16:42:39 -08:00 committed by GitHub
commit d366356299
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 463 additions and 81 deletions

View file

@ -74,3 +74,9 @@ pub fn get_test_language(name: &str, parser_code: &str, path: Option<&Path>) ->
.load_language_from_sources(name, &HEADER_DIR, &parser_c_path, &scanner_path)
.unwrap()
}
/// Loads the JSON source of a named test grammar fixture.
///
/// Reads `grammar.json` from the `test_grammars/<name>` directory beneath the
/// fixtures directory and returns its contents together with that directory.
///
/// # Panics
///
/// Panics, naming the offending path, if the grammar file cannot be read.
pub fn get_test_grammar(name: &str) -> (String, Option<PathBuf>) {
    let dir = fixtures_dir().join("test_grammars").join(name);
    let grammar_path = dir.join("grammar.json");
    // A missing fixture is a test-setup bug, so panicking is appropriate here;
    // include the path so the failure is actionable.
    let grammar = fs::read_to_string(&grammar_path).unwrap_or_else(|err| {
        panic!(
            "failed to read test grammar {}: {}",
            grammar_path.display(),
            err
        )
    });
    (grammar, Some(dir))
}

View file

@ -1,5 +1,5 @@
use super::helpers::edits::ReadRecorder;
use super::helpers::fixtures::{get_language, get_test_language};
use super::helpers::fixtures::{get_language, get_test_grammar, get_test_language};
use crate::generate::generate_parser_for_grammar;
use crate::parse::{perform_edit, Edit};
use std::sync::atomic::{AtomicUsize, Ordering};
@ -406,6 +406,83 @@ fn test_parsing_empty_file_with_reused_tree() {
parser.parse("\n ", tree.as_ref());
}
// Parses a snippet with the `uses_current_column` test grammar, inserts text
// that shifts the rest of one line to the right, and then reparses using the
// edited old tree. The assertions pin both the resulting syntax tree and the
// exact span of text the parser re-reads during the incremental parse.
#[test]
fn test_parsing_after_editing_tree_that_depends_on_column_values() {
// Generate a parser from the fixture grammar and load it into a fresh parser.
let (grammar, path) = get_test_grammar("uses_current_column");
let (grammar_name, parser_code) = generate_parser_for_grammar(&grammar).unwrap();
let mut parser = Parser::new();
parser
.set_language(get_test_language(
&grammar_name,
&parser_code,
path.as_ref().map(AsRef::as_ref),
))
.unwrap();
// NOTE(review): leading whitespace inside the byte-string literals below looks
// stripped in this view (the expected read string at the end of the test still
// shows indented ` e + f` / ` g` lines) — confirm against the original file.
let mut code = b"
a = b
c = do d
e + f
g
h + i
"
.to_vec();
// Initial parse: the `do` block is expected to contain three statements.
let mut tree = parser.parse(&code, None).unwrap();
assert_eq!(
tree.root_node().to_sexp(),
concat!(
"(block ",
"(binary_expression (identifier) (identifier)) ",
"(binary_expression (identifier) (do_expression (block (identifier) (binary_expression (identifier) (identifier)) (identifier)))) ",
"(binary_expression (identifier) (identifier)))",
)
);
// Pure insertion of `1234` at byte offset 8, i.e. directly after the `c`.
perform_edit(
&mut tree,
&mut code,
&Edit {
position: 8,
deleted_length: 0,
inserted_text: b"1234".to_vec(),
},
);
// Sanity-check that the edit produced the intended source text.
assert_eq!(
code,
b"
a = b
c1234 = do d
e + f
g
h + i
"
);
// Reparse with the edited tree, recording every chunk of text that is read.
let mut recorder = ReadRecorder::new(&code);
let tree = parser
.parse_with(&mut |i, _| recorder.read(i), Some(&tree))
.unwrap();
// After the edit the `do` block is expected to hold a single identifier, with
// the following lines parsed as statements of the outer block.
assert_eq!(
tree.root_node().to_sexp(),
concat!(
"(block ",
"(binary_expression (identifier) (identifier)) ",
"(binary_expression (identifier) (do_expression (block (identifier)))) ",
"(binary_expression (identifier) (identifier)) ",
"(identifier) ",
"(binary_expression (identifier) (identifier)))",
)
);
// The recorded reads must be exactly this one span of the edited text.
assert_eq!(
recorder.strings_read(),
vec!["\nc1234 = do d\n e + f\n g\n"]
);
}
// Thread safety
#[test]

View file

@ -102,6 +102,56 @@ static void ts_lexer__get_lookahead(Lexer *self) {
}
}
// Reposition the lexer at `position`.
//
// If `position` falls in a gap before an included range, the lexer is moved
// forward to the start of that range. If it lies at or past the end of every
// included range, the lexer is put into its EOF state at the end of the last
// included range.
static void ts_lexer_goto(Lexer *self, Length position) {
  self->current_position = position;

  // Skip every included range that ends at or before the requested position.
  unsigned range_index = 0;
  while (
    range_index < self->included_range_count &&
    self->included_ranges[range_index].end_byte <= position.bytes
  ) {
    range_index++;
  }

  // No range extends past the position: enter the EOF state, located just
  // past the end of the included ranges.
  if (range_index == self->included_range_count) {
    self->current_included_range_index = self->included_range_count;
    TSRange *final_range = &self->included_ranges[self->included_range_count - 1];
    self->current_position = (Length) {
      .bytes = final_range->end_byte,
      .extent = final_range->end_point,
    };
    ts_lexer__clear_chunk(self);
    self->lookahead_size = 1;
    self->data.lookahead = '\0';
    return;
  }

  // The position precedes this range entirely: snap forward to its start.
  TSRange *target_range = &self->included_ranges[range_index];
  if (target_range->start_byte > position.bytes) {
    self->current_position = (Length) {
      .bytes = target_range->start_byte,
      .extent = target_range->start_point,
    };
  }
  self->current_included_range_index = range_index;

  // Drop the cached chunk when the requested position is not inside it.
  if (self->chunk) {
    bool before_chunk = position.bytes < self->chunk_start;
    bool past_chunk = position.bytes >= self->chunk_start + self->chunk_size;
    if (before_chunk || past_chunk) {
      ts_lexer__clear_chunk(self);
    }
  }

  // Force the lookahead character to be recomputed.
  self->lookahead_size = 0;
  self->data.lookahead = '\0';
}
// Advance to the next character in the source code, retrieving a new
// chunk of source code if needed.
static void ts_lexer__advance(TSLexer *_self, bool skip) {
@ -183,22 +233,8 @@ static void ts_lexer__mark_end(TSLexer *_self) {
// Report the column of the lexer's current position.
//
// NOTE(review): this body appears to contain BOTH the pre-change and the
// post-change statements of the surrounding diff hunk (its header shrinks the
// region from 22 lines to 8). Read literally, everything after
// `return result;` is unreachable, so `did_get_column` is never set — confirm
// which statements belong to the final version before relying on this text.
static uint32_t ts_lexer__get_column(TSLexer *_self) {
Lexer *self = (Lexer *)_self;
// Remember where the lexer currently is so the scan below knows where to stop.
uint32_t goal_byte = self->current_position.bytes;
// Rewind the position by `extent.column` bytes and reset the column to zero.
self->current_position.bytes -= self->current_position.extent.column;
self->current_position.extent.column = 0;
// The rewound position may precede the loaded chunk; fetch a chunk if so.
if (self->current_position.bytes < self->chunk_start) {
ts_lexer__get_chunk(self);
}
// Count how many advances it takes to get back to the original byte offset.
uint32_t result = 0;
while (self->current_position.bytes < goal_byte) {
ts_lexer__advance(&self->data, false);
result++;
}
return result;
// Unreachable as written (see the note above): record that the column was
// queried and return the column stored in the current position.
self->did_get_column = true;
return self->current_position.extent.column;
}
// Is the lexer at a boundary between two disjoint included ranges of
@ -247,56 +283,6 @@ void ts_lexer_delete(Lexer *self) {
ts_free(self->included_ranges);
}
// Reposition the lexer at `position`, moving forward to the start of the next
// included range when `position` falls before it, or into the EOF state when
// `position` is at or past the end of every included range.
// NOTE(review): this looks like the removed-side copy of a function that the
// same diff re-adds earlier in the file — confirm only one definition remains.
static void ts_lexer_goto(Lexer *self, Length position) {
self->current_position = position;
bool found_included_range = false;
// Move to the first valid position at or after the given position.
for (unsigned i = 0; i < self->included_range_count; i++) {
TSRange *included_range = &self->included_ranges[i];
// The first range that ends after the position is the one to use.
if (included_range->end_byte > position.bytes) {
// The position lies before this range: snap forward to the range's start.
if (included_range->start_byte > position.bytes) {
self->current_position = (Length) {
.bytes = included_range->start_byte,
.extent = included_range->start_point,
};
}
self->current_included_range_index = i;
found_included_range = true;
break;
}
}
if (found_included_range) {
// If the requested position is outside of the current chunk of text,
// then clear out the current chunk of text.
if (self->chunk && (
position.bytes < self->chunk_start ||
position.bytes >= self->chunk_start + self->chunk_size
)) {
ts_lexer__clear_chunk(self);
}
// Reset the lookahead so it is recomputed for the new position.
self->lookahead_size = 0;
self->data.lookahead = '\0';
}
// If the given position is beyond all of the included ranges, move to the EOF
// state - past the end of the included ranges.
else {
self->current_included_range_index = self->included_range_count;
// Indexes the last range; assumes at least one included range exists.
TSRange *last_included_range = &self->included_ranges[self->included_range_count - 1];
self->current_position = (Length) {
.bytes = last_included_range->end_byte,
.extent = last_included_range->end_point,
};
ts_lexer__clear_chunk(self);
self->lookahead_size = 1;
self->data.lookahead = '\0';
}
}
void ts_lexer_set_input(Lexer *self, TSInput input) {
self->input = input;
ts_lexer__clear_chunk(self);
@ -315,6 +301,7 @@ void ts_lexer_start(Lexer *self) {
self->token_start_position = self->current_position;
self->token_end_position = LENGTH_UNDEFINED;
self->data.result_symbol = 0;
self->did_get_column = false;
if (!ts_lexer__eof(&self->data)) {
if (!self->chunk_size) ts_lexer__get_chunk(self);
if (!self->lookahead_size) ts_lexer__get_lookahead(self);

View file

@ -17,16 +17,17 @@ typedef struct {
Length token_end_position;
TSRange *included_ranges;
size_t included_range_count;
size_t current_included_range_index;
const char *chunk;
TSInput input;
TSLogger logger;
uint32_t included_range_count;
uint32_t current_included_range_index;
uint32_t chunk_start;
uint32_t chunk_size;
uint32_t lookahead_size;
bool did_get_column;
TSInput input;
TSLogger logger;
char debug_buffer[TREE_SITTER_SERIALIZATION_BUFFER_SIZE];
} Lexer;

View file

@ -403,6 +403,7 @@ static Subtree ts_parser__lex(
bool found_external_token = false;
bool error_mode = parse_state == ERROR_STATE;
bool skipped_error = false;
bool called_get_column = false;
int32_t first_error_character = 0;
Length error_start_position = length_zero();
Length error_end_position = length_zero();
@ -445,6 +446,7 @@ static Subtree ts_parser__lex(
(!error_mode && ts_stack_has_advanced_since_error(self->stack, version))
)) {
found_external_token = true;
called_get_column = self->lexer.did_get_column;
break;
}
@ -546,6 +548,7 @@ static Subtree ts_parser__lex(
lookahead_bytes,
parse_state,
found_external_token,
called_get_column,
is_keyword,
self->language
);

View file

@ -166,7 +166,8 @@ static inline bool ts_subtree_can_inline(Length padding, Length size, uint32_t l
Subtree ts_subtree_new_leaf(
SubtreePool *pool, TSSymbol symbol, Length padding, Length size,
uint32_t lookahead_bytes, TSStateId parse_state, bool has_external_tokens,
uint32_t lookahead_bytes, TSStateId parse_state,
bool has_external_tokens, bool depends_on_column,
bool is_keyword, const TSLanguage *language
) {
TSSymbolMetadata metadata = ts_language_symbol_metadata(language, symbol);
@ -213,6 +214,7 @@ Subtree ts_subtree_new_leaf(
.fragile_right = false,
.has_changes = false,
.has_external_tokens = has_external_tokens,
.depends_on_column = depends_on_column,
.is_missing = false,
.is_keyword = is_keyword,
{{.first_leaf = {.symbol = 0, .parse_state = 0}}}
@ -245,7 +247,7 @@ Subtree ts_subtree_new_error(
) {
Subtree result = ts_subtree_new_leaf(
pool, ts_builtin_sym_error, padding, size, bytes_scanned,
parse_state, false, false, language
parse_state, false, false, false, language
);
SubtreeHeapData *data = (SubtreeHeapData *)result.ptr;
data->fragile_left = true;
@ -378,6 +380,7 @@ void ts_subtree_summarize_children(
self.ptr->repeat_depth = 0;
self.ptr->node_count = 1;
self.ptr->has_external_tokens = false;
self.ptr->depends_on_column = false;
self.ptr->dynamic_precedence = 0;
uint32_t structural_index = 0;
@ -388,6 +391,13 @@ void ts_subtree_summarize_children(
for (uint32_t i = 0; i < self.ptr->child_count; i++) {
Subtree child = children[i];
if (
self.ptr->size.extent.row == 0 &&
ts_subtree_depends_on_column(child)
) {
self.ptr->depends_on_column = true;
}
if (i == 0) {
self.ptr->padding = ts_subtree_padding(child);
self.ptr->size = ts_subtree_size(child);
@ -545,7 +555,7 @@ Subtree ts_subtree_new_missing_leaf(
) {
Subtree result = ts_subtree_new_leaf(
pool, symbol, padding, length_zero(), 0,
0, false, false, language
0, false, false, false, language
);
if (result.data.is_inline) {
result.data.is_missing = true;
@ -670,6 +680,7 @@ Subtree ts_subtree_edit(Subtree self, const TSInputEdit *edit, SubtreePool *pool
Edit edit = entry.edit;
bool is_noop = edit.old_end.bytes == edit.start.bytes && edit.new_end.bytes == edit.start.bytes;
bool is_pure_insertion = edit.old_end.bytes == edit.start.bytes;
bool invalidate_first_row = ts_subtree_depends_on_column(*entry.tree);
Length size = ts_subtree_size(*entry.tree);
Length padding = ts_subtree_padding(*entry.tree);
@ -733,6 +744,7 @@ Subtree ts_subtree_edit(Subtree self, const TSInputEdit *edit, SubtreePool *pool
data->fragile_right = false;
data->has_changes = false;
data->has_external_tokens = false;
data->depends_on_column = false;
data->is_missing = result.data.is_missing;
data->is_keyword = result.data.is_keyword;
result.ptr = data;
@ -755,9 +767,18 @@ Subtree ts_subtree_edit(Subtree self, const TSInputEdit *edit, SubtreePool *pool
// If this child ends before the edit, it is not affected.
if (child_right.bytes + ts_subtree_lookahead_bytes(*child) < edit.start.bytes) continue;
// If this child starts after the edit, then we're done processing children.
if (child_left.bytes > edit.old_end.bytes ||
(child_left.bytes == edit.old_end.bytes && child_size.bytes > 0 && i > 0)) break;
// Keep editing child nodes until a node is reached that starts after the edit.
// Also, if this node's validity depends on its column position, then continue
// invalidating child nodes until reaching a line break.
if ((
(child_left.bytes > edit.old_end.bytes) ||
(child_left.bytes == edit.old_end.bytes && child_size.bytes > 0 && i > 0)
) && (
!invalidate_first_row ||
child_left.extent.row > entry.tree->ptr->padding.extent.row
)) {
break;
}
// Transform edit into the child's coordinate space.
Edit child_edit = {
@ -775,8 +796,10 @@ Subtree ts_subtree_edit(Subtree self, const TSInputEdit *edit, SubtreePool *pool
// Interpret all inserted text as applying to the *first* child that touches the edit.
// Subsequent children never have any text inserted into them; they are only
// shrunk to compensate for the edit.
if (child_right.bytes > edit.start.bytes ||
(child_right.bytes == edit.start.bytes && is_pure_insertion)) {
if (
child_right.bytes > edit.start.bytes ||
(child_right.bytes == edit.start.bytes && is_pure_insertion)
) {
edit.new_end = edit.start;
}
@ -981,12 +1004,14 @@ void ts_subtree__print_dot_graph(const Subtree *self, uint32_t start_offset,
"state: %d\n"
"error-cost: %u\n"
"has-changes: %u\n"
"depends-on-column: %u\n"
"repeat-depth: %u\n"
"lookahead-bytes: %u",
start_offset, end_offset,
ts_subtree_parse_state(*self),
ts_subtree_error_cost(*self),
ts_subtree_has_changes(*self),
ts_subtree_depends_on_column(*self),
ts_subtree_repeat_depth(*self),
ts_subtree_lookahead_bytes(*self)
);

View file

@ -78,6 +78,7 @@ typedef struct {
bool fragile_right : 1;
bool has_changes : 1;
bool has_external_tokens : 1;
bool depends_on_column: 1;
bool is_missing : 1;
bool is_keyword : 1;
@ -138,7 +139,7 @@ void ts_subtree_pool_delete(SubtreePool *);
Subtree ts_subtree_new_leaf(
SubtreePool *, TSSymbol, Length, Length, uint32_t,
TSStateId, bool, bool, const TSLanguage *
TSStateId, bool, bool, bool, const TSLanguage *
);
Subtree ts_subtree_new_error(
SubtreePool *, int32_t, Length, Length, uint32_t, TSStateId, const TSLanguage *
@ -284,6 +285,10 @@ static inline bool ts_subtree_has_external_tokens(Subtree self) {
return self.data.is_inline ? false : self.ptr->has_external_tokens;
}
// Whether this subtree is flagged as depending on its column position.
// Inline subtrees carry no such flag and always report false.
static inline bool ts_subtree_depends_on_column(Subtree self) {
  if (self.data.is_inline) {
    return false;
  }
  return self.ptr->depends_on_column;
}
// Whether this subtree is fragile on either its left or its right side.
// Inline subtrees are never reported as fragile.
static inline bool ts_subtree_is_fragile(Subtree self) {
  if (self.data.is_inline) {
    return false;
  }
  return self.ptr->fragile_left || self.ptr->fragile_right;
}

View file

@ -0,0 +1,76 @@
===============
Simple blocks
===============
do a
e
f
---
(block
(do_expression (block
(identifier)
(identifier)))
(identifier))
=====================
Nested blocks
=====================
a = do b
c + do e
f
g
h
i
---
(block
(binary_expression
(identifier)
(do_expression (block
(identifier)
(binary_expression
(identifier)
(do_expression (block
(identifier)
(identifier)
(identifier))))
(identifier))))
(identifier))
===============================
Blocks with leading newlines
===============================
do
a = b
do
c
d
e
f
---
(block
(do_expression (block
(binary_expression (identifier) (identifier))
(do_expression (block
(identifier)
(identifier)))
(identifier)
(identifier))))
=====================
Unterminated blocks
=====================
do
---
(ERROR)

View file

@ -0,0 +1,69 @@
{
"name": "uses_current_column",
"externals": [
{"type": "SYMBOL", "name": "_indent"},
{"type": "SYMBOL", "name": "_dedent"},
{"type": "SYMBOL", "name": "_newline"}
],
"extras": [
{"type": "PATTERN", "value": "\\s"}
],
"rules": {
"block": {
"type": "REPEAT1",
"content": {"type": "SYMBOL", "name": "_statement"}
},
"_statement": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "_expression"},
{"type": "SYMBOL", "name": "_newline"}
]
},
"_expression": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "do_expression"},
{"type": "SYMBOL", "name": "binary_expression"},
{"type": "SYMBOL", "name": "identifier"}
]
},
"do_expression": {
"type": "SEQ",
"members": [
{"type": "STRING", "value": "do"},
{"type": "SYMBOL", "name": "_indent"},
{"type": "SYMBOL", "name": "block"},
{"type": "SYMBOL", "name": "_dedent"}
]
},
"binary_expression": {
"type": "PREC_LEFT",
"value": 1,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "_expression"},
{
"type": "CHOICE",
"members": [
{"type": "STRING", "value": "="},
{"type": "STRING", "value": "+"},
{"type": "STRING", "value": "-"}
]
},
{"type": "SYMBOL", "name": "_expression"}
]
}
},
"identifier": {"type": "PATTERN", "value": "\\w+"}
}
}

View file

@ -0,0 +1,133 @@
#include <stdlib.h>
#include <wctype.h>
#include <tree_sitter/parser.h>
// External token kinds produced by this scanner. The values are used as
// indices into the `valid_symbols` array passed to the scan function.
enum TokenType {
  INDENT = 0,
  DEDENT = 1,
  NEWLINE = 2,
};
// State kept by the external scanner between calls to the scan function.
typedef struct {
// Number of DEDENT tokens found by an earlier scan that have not been emitted yet.
uint8_t queued_dedent_count;
// Number of entries currently in use in `indents`.
uint8_t indent_count;
// Stack of indentation levels; `indents[indent_count - 1]` is the innermost one.
int8_t indents[32];
} Scanner;
// Allocate a scanner whose indent stack holds only a base level of 0 and
// which has no dedents queued. The caller owns the returned pointer.
void *tree_sitter_uses_current_column_external_scanner_create() {
  Scanner *scanner = malloc(sizeof *scanner);
  scanner->indents[0] = 0;
  scanner->indent_count = 1;
  scanner->queued_dedent_count = 0;
  return scanner;
}
// Release the memory of a scanner previously handed out as `payload`.
void tree_sitter_uses_current_column_external_scanner_destroy(void *payload) {
  // free(NULL) is a no-op, so the explicit check does not change behavior.
  if (payload != NULL) {
    free(payload);
  }
}
// Write the scanner state into `buffer` and return the number of bytes used.
//
// Layout: byte 0 is the queued dedent count, followed by one byte for each
// entry of the indent stack, bottom first.
unsigned tree_sitter_uses_current_column_external_scanner_serialize(
  void *payload,
  char *buffer
) {
  const Scanner *scanner = (const Scanner *)payload;
  unsigned written = 0;

  buffer[written++] = scanner->queued_dedent_count;
  for (unsigned level = 0; level < scanner->indent_count; level++) {
    buffer[written++] = scanner->indents[level];
  }

  return written;
}
// Restore the scanner state from `buffer`, which holds `length` bytes in the
// layout written by the serialize function: byte 0 is the queued dedent
// count, and each following byte is one indent-stack entry, bottom first.
//
// An empty buffer resets the scanner to its initial state. The number of
// restored entries is clamped to the capacity of the indent stack, and the
// stack is never left empty, so the scan function can always read its top.
void tree_sitter_uses_current_column_external_scanner_deserialize(
  void *payload,
  const char *buffer,
  unsigned length
) {
  Scanner *self = (Scanner *)payload;
  const unsigned indent_capacity = sizeof(self->indents) / sizeof(self->indents[0]);

  // Start from the initial state: nothing queued, one base indent level of 0.
  self->queued_dedent_count = 0;
  self->indent_count = 1;
  self->indents[0] = 0;
  if (length == 0) return;

  self->queued_dedent_count = buffer[0];

  // Clamp before narrowing to uint8_t so an oversized buffer can neither wrap
  // the count nor write past the end of `indents`.
  unsigned saved_count = length - 1;
  if (saved_count > indent_capacity) saved_count = indent_capacity;

  // A buffer holding only the dedent count keeps the base level set above.
  if (saved_count == 0) return;

  self->indent_count = (uint8_t)saved_count;
  for (unsigned i = 0; i < saved_count; i++) {
    self->indents[i] = buffer[i + 1];
  }
}
// Try to produce one external token (INDENT, DEDENT or NEWLINE).
//
// `valid_symbols` is indexed by `enum TokenType` and says which tokens are
// acceptable here. Returns true with `lexer->result_symbol` set when a token
// was recognized, false otherwise.
//
// Compared with a naive implementation, pushes onto the indent stack are
// bounded by its capacity and pops never remove the bottom entry, so the
// stack cannot be overrun or emptied.
bool tree_sitter_uses_current_column_external_scanner_scan(
  void *payload,
  TSLexer *lexer,
  const bool *valid_symbols
) {
  Scanner *self = (Scanner *)payload;
  const unsigned indent_capacity = sizeof(self->indents) / sizeof(self->indents[0]);

  // The token end is marked before anything is consumed and never moved
  // again; presumably every advance below therefore only acts as lookahead.
  lexer->mark_end(lexer);

  // If dedents were found in a previous run, and are valid now,
  // then return a dedent.
  if (self->queued_dedent_count > 0 && valid_symbols[DEDENT]) {
    lexer->result_symbol = DEDENT;
    self->queued_dedent_count--;
    return true;
  }

  // If an indent is valid, then add an entry to the indent stack
  // for the current column, and return an indent.
  if (valid_symbols[INDENT]) {
    while (iswspace(lexer->lookahead)) {
      lexer->advance(lexer, false);
    }

    uint32_t column = lexer->get_column(lexer);

    // Refuse to open another level once the stack is full rather than
    // writing past the end of `indents`.
    if (
      self->indent_count < indent_capacity &&
      column > self->indents[self->indent_count - 1]
    ) {
      // The recorded level is two columns to the left of the token's column.
      self->indents[self->indent_count++] = column - 2;
      lexer->result_symbol = INDENT;
      return true;
    } else {
      return false;
    }
  }

  // If at the end of a statement, then get the current indent
  // level and pop some number of entries off of the indent stack.
  if (valid_symbols[NEWLINE] || valid_symbols[DEDENT]) {
    // Skip trailing whitespace on the current line.
    while (iswspace(lexer->lookahead) && lexer->lookahead != '\n') {
      lexer->advance(lexer, false);
    }

    if (lexer->lookahead == '\n') {
      lexer->advance(lexer, false);

      // Count the leading spaces of the next non-empty line; the count is
      // restarted at every newline.
      uint32_t next_column = 0;
      for (;;) {
        if (lexer->lookahead == ' ') {
          next_column++;
          lexer->advance(lexer, false);
        } else if (lexer->lookahead == '\n') {
          next_column = 0;
          lexer->advance(lexer, false);
        } else {
          break;
        }
      }

      // Pop every level deeper than the next line, but always keep the
      // bottom entry so the stack is never empty.
      unsigned dedent_count = 0;
      while (
        self->indent_count > 1 &&
        next_column < self->indents[self->indent_count - 1]
      ) {
        dedent_count++;
        self->indent_count--;
      }

      if (dedent_count > 0 && valid_symbols[DEDENT]) {
        // NOTE(review): when more than one level was popped, only a single
        // DEDENT is emitted here and the remainder is not queued — confirm
        // whether this path can be reached with dedent_count > 1.
        lexer->result_symbol = DEDENT;
        return true;
      } else if (valid_symbols[NEWLINE]) {
        // Emit the newline now; the dedents are handed out by later calls.
        self->queued_dedent_count += dedent_count;
        lexer->result_symbol = NEWLINE;
        return true;
      }
    }
  }

  return false;
}