Allow lexer to accept tokens that ended at previous positions
* Track lookahead in each tree
* Add 'mark_end' API that external scanners can use
This commit is contained in:
parent
12d2a9d93f
commit
d222dbb9fd
12 changed files with 96 additions and 71 deletions
|
|
@ -26,6 +26,7 @@ typedef struct {
|
|||
|
||||
/*
 * Lexer interface handed to generated lex functions. The operations are
 * function pointers stored in the struct; the generated lexer macros call
 * them as `lexer->advance(lexer, ...)` and `lexer->mark_end(lexer)`, passing
 * the lexer itself as the first (void *) argument.
 */
typedef struct {
|
||||
/* Advance callback. The ADVANCE macro passes `false` as the second
 * argument and the SKIP macro passes `true`. */
void (*advance)(void *, bool);
|
||||
/* Token-end callback. Invoked by the ACCEPT_TOKEN macro right after it
 * stores the accepted symbol in `result_symbol`. */
void (*mark_end)(void *);
|
||||
/* Current lookahead value; START_LEXER copies it into a local `lookahead`
 * at the `next_state` label. */
int32_t lookahead;
|
||||
/* Symbol of the accepted token; written by ACCEPT_TOKEN. */
TSSymbol result_symbol;
|
||||
} TSLexer;
|
||||
|
|
@ -91,32 +92,32 @@ typedef struct TSLanguage {
|
|||
* Lexer Macros
|
||||
*/
|
||||
|
||||
#define START_LEXER() \
|
||||
int32_t lookahead; \
|
||||
next_state: \
|
||||
#define START_LEXER() \
|
||||
bool result = false; \
|
||||
int32_t lookahead; \
|
||||
next_state: \
|
||||
lookahead = lexer->lookahead;
|
||||
|
||||
#define ADVANCE(state_value) \
|
||||
{ \
|
||||
#define ADVANCE(state_value) \
|
||||
{ \
|
||||
lexer->advance(lexer, false); \
|
||||
state = state_value; \
|
||||
goto next_state; \
|
||||
state = state_value; \
|
||||
goto next_state; \
|
||||
}
|
||||
|
||||
#define SKIP(state_value) \
|
||||
{ \
|
||||
#define SKIP(state_value) \
|
||||
{ \
|
||||
lexer->advance(lexer, true); \
|
||||
state = state_value; \
|
||||
goto next_state; \
|
||||
state = state_value; \
|
||||
goto next_state; \
|
||||
}
|
||||
|
||||
#define ACCEPT_TOKEN(symbol_value) \
|
||||
{ \
|
||||
lexer->result_symbol = symbol_value; \
|
||||
return true; \
|
||||
}
|
||||
#define ACCEPT_TOKEN(symbol_value) \
|
||||
result = true; \
|
||||
lexer->result_symbol = symbol_value; \
|
||||
lexer->mark_end(lexer);
|
||||
|
||||
#define LEX_ERROR() return false
|
||||
#define END_STATE() return result;
|
||||
|
||||
/*
|
||||
* Parse Table Macros
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@ extern "C" {
|
|||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#define TREE_SITTER_LANGUAGE_VERSION 1
|
||||
#define TREE_SITTER_LANGUAGE_VERSION 2
|
||||
|
||||
typedef unsigned short TSSymbol;
|
||||
typedef struct TSLanguage TSLanguage;
|
||||
|
|
|
|||
|
|
@ -217,9 +217,10 @@ class CCodeGenerator {
|
|||
line("START_LEXER();");
|
||||
_switch("state", [&]() {
|
||||
size_t i = 0;
|
||||
for (const LexState &state : lex_table.states)
|
||||
for (const LexState &state : lex_table.states) {
|
||||
_case(to_string(i++), [&]() { add_lex_state(state); });
|
||||
_default([&]() { line("LEX_ERROR();"); });
|
||||
}
|
||||
_default([&]() { line("return false;"); });
|
||||
});
|
||||
});
|
||||
line("}");
|
||||
|
|
@ -396,18 +397,18 @@ class CCodeGenerator {
|
|||
}
|
||||
|
||||
void add_lex_state(const LexState &lex_state) {
|
||||
if (lex_state.is_token_start)
|
||||
line("START_TOKEN();");
|
||||
if (lex_state.accept_action.is_present()) {
|
||||
add_accept_token_action(lex_state.accept_action);
|
||||
}
|
||||
|
||||
for (const auto &pair : lex_state.advance_actions)
|
||||
if (!pair.first.is_empty())
|
||||
for (const auto &pair : lex_state.advance_actions) {
|
||||
if (!pair.first.is_empty()) {
|
||||
_if([&]() { add_character_set_condition(pair.first); },
|
||||
[&]() { add_advance_action(pair.second); });
|
||||
}
|
||||
}
|
||||
|
||||
if (lex_state.accept_action.is_present())
|
||||
add_accept_token_action(lex_state.accept_action);
|
||||
else
|
||||
line("LEX_ERROR();");
|
||||
line("END_STATE();");
|
||||
}
|
||||
|
||||
void add_character_set_condition(const rules::CharacterSet &rule) {
|
||||
|
|
@ -428,8 +429,7 @@ class CCodeGenerator {
|
|||
for (const auto &range : ranges) {
|
||||
if (!first) {
|
||||
add(" ||");
|
||||
line();
|
||||
add_padding();
|
||||
line(" ");
|
||||
}
|
||||
|
||||
add("(");
|
||||
|
|
@ -442,20 +442,20 @@ class CCodeGenerator {
|
|||
}
|
||||
|
||||
void add_character_range_condition(const rules::CharacterRange &range) {
|
||||
string lookahead("lookahead");
|
||||
if (range.min == range.max) {
|
||||
add(lookahead + " == " + escape_char(range.min));
|
||||
add("lookahead == " + escape_char(range.min));
|
||||
} else {
|
||||
add(escape_char(range.min) + string(" <= ") + lookahead + " && " +
|
||||
lookahead + " <= " + escape_char(range.max));
|
||||
add(escape_char(range.min) + string(" <= lookahead && lookahead <= ") +
|
||||
escape_char(range.max));
|
||||
}
|
||||
}
|
||||
|
||||
void add_advance_action(const AdvanceAction &action) {
|
||||
if (action.in_main_token)
|
||||
if (action.in_main_token) {
|
||||
line("ADVANCE(" + to_string(action.state_index) + ");");
|
||||
else
|
||||
} else {
|
||||
line("SKIP(" + to_string(action.state_index) + ");");
|
||||
}
|
||||
}
|
||||
|
||||
void add_accept_token_action(const AcceptTokenAction &action) {
|
||||
|
|
@ -669,7 +669,7 @@ class CCodeGenerator {
|
|||
|
||||
void add_padding() {
|
||||
for (size_t i = 0; i < indent_level; i++)
|
||||
add(" ");
|
||||
add(" ");
|
||||
}
|
||||
|
||||
void indent(function<void()> body) {
|
||||
|
|
|
|||
|
|
@ -42,12 +42,9 @@ bool AcceptTokenAction::operator==(const AcceptTokenAction &other) const {
|
|||
(is_string == other.is_string);
|
||||
}
|
||||
|
||||
LexState::LexState() : is_token_start(false) {}
|
||||
|
||||
bool LexState::operator==(const LexState &other) const {
|
||||
return advance_actions == other.advance_actions &&
|
||||
accept_action == other.accept_action &&
|
||||
is_token_start == other.is_token_start;
|
||||
accept_action == other.accept_action;
|
||||
}
|
||||
|
||||
} // namespace tree_sitter
|
||||
|
|
|
|||
|
|
@ -35,12 +35,10 @@ struct AcceptTokenAction {
|
|||
};
|
||||
|
||||
struct LexState {
|
||||
LexState();
|
||||
bool operator==(const LexState &) const;
|
||||
|
||||
std::map<rules::CharacterSet, AdvanceAction> advance_actions;
|
||||
AcceptTokenAction accept_action;
|
||||
bool is_token_start;
|
||||
};
|
||||
|
||||
struct LexTable {
|
||||
|
|
|
|||
|
|
@ -16,6 +16,8 @@
|
|||
|
||||
static const char empty_chunk[2] = { 0, 0 };
|
||||
|
||||
static Length unknown_length = {UINT32_MAX, 0, {0, 0}};
|
||||
|
||||
static void ts_lexer__get_chunk(Lexer *self) {
|
||||
TSInput input = self->input;
|
||||
if (!self->chunk ||
|
||||
|
|
@ -70,6 +72,11 @@ static void ts_lexer__advance(void *payload, bool skip) {
|
|||
ts_lexer__get_lookahead(self);
|
||||
}
|
||||
|
||||
/*
 * Record the lexer's current position as the end of the token being lexed.
 *
 * `payload` is cast to a `Lexer *`. The function only copies
 * `current_position` into `token_end_position`; it does not advance or
 * otherwise modify the lexer.
 */
static void ts_lexer__mark_end(void *payload) {
|
||||
Lexer *self = (Lexer *)payload;
|
||||
self->token_end_position = self->current_position;
|
||||
}
|
||||
|
||||
/*
|
||||
* The lexer's advance method is stored as a struct field so that generated
|
||||
* parsers can call it without needing to be linked against this library.
|
||||
|
|
@ -79,6 +86,7 @@ void ts_lexer_init(Lexer *self) {
|
|||
*self = (Lexer){
|
||||
.data = {
|
||||
.advance = ts_lexer__advance,
|
||||
.mark_end = ts_lexer__mark_end,
|
||||
.lookahead = 0,
|
||||
.result_symbol = 0,
|
||||
},
|
||||
|
|
@ -95,6 +103,7 @@ void ts_lexer_init(Lexer *self) {
|
|||
|
||||
static inline void ts_lexer__reset(Lexer *self, Length position) {
|
||||
self->token_start_position = position;
|
||||
self->token_end_position = unknown_length;
|
||||
self->current_position = position;
|
||||
|
||||
if (self->chunk && (position.bytes < self->chunk_start ||
|
||||
|
|
@ -122,6 +131,7 @@ void ts_lexer_reset(Lexer *self, Length position) {
|
|||
|
||||
void ts_lexer_start(Lexer *self) {
|
||||
self->token_start_position = self->current_position;
|
||||
self->token_end_position = unknown_length;
|
||||
self->data.result_symbol = 0;
|
||||
|
||||
if (!self->chunk)
|
||||
|
|
|
|||
|
|
@ -15,11 +15,11 @@ typedef struct {
|
|||
TSLexer data;
|
||||
Length current_position;
|
||||
Length token_start_position;
|
||||
Length token_end_position;
|
||||
|
||||
const char *chunk;
|
||||
uint32_t chunk_start;
|
||||
uint32_t chunk_size;
|
||||
|
||||
uint32_t lookahead_size;
|
||||
|
||||
TSInput input;
|
||||
|
|
|
|||
|
|
@ -279,7 +279,6 @@ static Tree *parser__lex(Parser *self, StackVersion version) {
|
|||
if (skipped_error) {
|
||||
Length padding = length_sub(error_start_position, start_position);
|
||||
Length size = length_sub(error_end_position, error_start_position);
|
||||
ts_lexer_reset(&self->lexer, error_end_position);
|
||||
result = ts_tree_make_error(size, padding, first_error_character);
|
||||
} else {
|
||||
TSSymbol symbol = self->lexer.data.result_symbol;
|
||||
|
|
@ -287,8 +286,11 @@ static Tree *parser__lex(Parser *self, StackVersion version) {
|
|||
symbol = self->language->external_scanner.symbol_map[symbol];
|
||||
}
|
||||
|
||||
if (length_has_unknown_chars(self->lexer.token_end_position)) {
|
||||
self->lexer.token_end_position = self->lexer.current_position;
|
||||
}
|
||||
Length padding = length_sub(self->lexer.token_start_position, start_position);
|
||||
Length size = length_sub(self->lexer.current_position, self->lexer.token_start_position);
|
||||
Length size = length_sub(self->lexer.token_end_position, self->lexer.token_start_position);
|
||||
TSSymbolMetadata metadata = ts_language_symbol_metadata(self->language, symbol);
|
||||
result = ts_tree_make_leaf(symbol, padding, size, metadata);
|
||||
|
||||
|
|
@ -301,6 +303,7 @@ static Tree *parser__lex(Parser *self, StackVersion version) {
|
|||
}
|
||||
}
|
||||
|
||||
result->bytes_scanned = self->lexer.current_position.bytes - start_position.bytes + 1;
|
||||
result->parse_state = parse_state;
|
||||
result->first_leaf.lex_mode = lex_mode;
|
||||
|
||||
|
|
|
|||
|
|
@ -155,7 +155,10 @@ void ts_tree_set_children(Tree *self, uint32_t child_count, Tree **children) {
|
|||
if (i == 0) {
|
||||
self->padding = child->padding;
|
||||
self->size = child->size;
|
||||
self->bytes_scanned = child->bytes_scanned;
|
||||
} else {
|
||||
uint32_t bytes_scanned = ts_tree_total_bytes(self) + child->bytes_scanned;
|
||||
if (bytes_scanned > self->bytes_scanned) self->bytes_scanned = bytes_scanned;
|
||||
self->size = length_add(self->size, ts_tree_total_size(child));
|
||||
}
|
||||
|
||||
|
|
@ -344,6 +347,21 @@ static inline long min(long a, long b) {
|
|||
return a <= b ? a : b;
|
||||
}
|
||||
|
||||
/*
 * Mark `self` (and, recursively, its children) as changed when an edit at
 * `edit_byte_offset` falls inside the bytes recorded in `bytes_scanned`.
 *
 * `edit_byte_offset` is measured from the start of `self`: each child
 * receives the offset minus the total bytes of the siblings before it.
 *
 * Returns false, leaving the tree untouched, when the offset is at or
 * beyond `self->bytes_scanned`; otherwise sets `has_changes` and returns
 * true.
 */
bool ts_tree_invalidate_lookahead(Tree *self, uint32_t edit_byte_offset) {
|
||||
/* The edit lies outside the scanned range of this tree: nothing to do. */
if (edit_byte_offset >= self->bytes_scanned) return false;
|
||||
self->has_changes = true;
|
||||
if (self->child_count > 0) {
|
||||
/* Running byte offset of the current child relative to the start of self. */
uint32_t child_start_byte = 0;
|
||||
for (uint32_t i = 0; i < self->child_count; i++) {
|
||||
Tree *child = self->children[i];
|
||||
/* Stop at the first child that starts after the edit; this also keeps
 * the unsigned subtraction below from wrapping around. */
if (child_start_byte > edit_byte_offset) break;
|
||||
/* Recurse with the offset rebased to the child's start; the child's
 * return value is intentionally ignored. */
ts_tree_invalidate_lookahead(child, edit_byte_offset - child_start_byte);
|
||||
child_start_byte += ts_tree_total_bytes(child);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
void ts_tree_edit(Tree *self, const TSInputEdit *edit) {
|
||||
uint32_t old_end_byte = edit->start_byte + edit->bytes_removed;
|
||||
|
|
@ -390,29 +408,27 @@ void ts_tree_edit(Tree *self, const TSInputEdit *edit) {
|
|||
for (uint32_t i = 0; i < self->child_count; i++) {
|
||||
Tree *child = self->children[i];
|
||||
child_left = child_right;
|
||||
child_right = length_add(child_left, ts_tree_total_size(child));
|
||||
|
||||
if (!found_first_child) {
|
||||
child_right = length_add(child_left, ts_tree_total_size(child));
|
||||
if (child_right.bytes >= edit->start_byte) {
|
||||
found_first_child = true;
|
||||
TSInputEdit child_edit = {
|
||||
.start_byte = edit->start_byte - child_left.bytes,
|
||||
.bytes_added = edit->bytes_added,
|
||||
.bytes_removed = edit->bytes_removed,
|
||||
.start_point = point_sub(edit->start_point, child_left.extent),
|
||||
.extent_added = edit->extent_added,
|
||||
.extent_removed = edit->extent_removed,
|
||||
};
|
||||
if (!found_first_child && child_right.bytes >= edit->start_byte) {
|
||||
found_first_child = true;
|
||||
TSInputEdit child_edit = {
|
||||
.start_byte = edit->start_byte - child_left.bytes,
|
||||
.bytes_added = edit->bytes_added,
|
||||
.bytes_removed = edit->bytes_removed,
|
||||
.start_point = point_sub(edit->start_point, child_left.extent),
|
||||
.extent_added = edit->extent_added,
|
||||
.extent_removed = edit->extent_removed,
|
||||
};
|
||||
|
||||
if (old_end_byte > child_right.bytes) {
|
||||
child_edit.bytes_removed = child_right.bytes - edit->start_byte;
|
||||
child_edit.extent_removed = point_sub(child_right.extent, edit->start_point);
|
||||
remaining_bytes_to_delete = old_end_byte - child_right.bytes;
|
||||
remaining_extent_to_delete = point_sub(old_end_point, child_right.extent);
|
||||
}
|
||||
|
||||
ts_tree_edit(child, &child_edit);
|
||||
if (old_end_byte > child_right.bytes) {
|
||||
child_edit.bytes_removed = child_right.bytes - edit->start_byte;
|
||||
child_edit.extent_removed = point_sub(child_right.extent, edit->start_point);
|
||||
remaining_bytes_to_delete = old_end_byte - child_right.bytes;
|
||||
remaining_extent_to_delete = point_sub(old_end_point, child_right.extent);
|
||||
}
|
||||
|
||||
ts_tree_edit(child, &child_edit);
|
||||
} else if (remaining_bytes_to_delete > 0) {
|
||||
TSInputEdit child_edit = {
|
||||
.start_byte = 0,
|
||||
|
|
@ -425,6 +441,8 @@ void ts_tree_edit(Tree *self, const TSInputEdit *edit) {
|
|||
remaining_bytes_to_delete -= child_edit.bytes_removed;
|
||||
remaining_extent_to_delete = point_sub(remaining_extent_to_delete, child_edit.extent_removed);
|
||||
ts_tree_edit(child, &child_edit);
|
||||
} else {
|
||||
ts_tree_invalidate_lookahead(child, edit->start_byte - child_left.bytes);
|
||||
}
|
||||
|
||||
child_right = length_add(child_left, ts_tree_total_size(child));
|
||||
|
|
|
|||
|
|
@ -34,6 +34,7 @@ typedef struct Tree {
|
|||
|
||||
Length padding;
|
||||
Length size;
|
||||
uint32_t bytes_scanned;
|
||||
|
||||
TSSymbol symbol;
|
||||
TSStateId parse_state;
|
||||
|
|
|
|||
|
|
@ -69,18 +69,15 @@ describe("LexConflictManager::resolve(new_action, old_action)", []() {
|
|||
describe("advance/accept-token conflicts", [&]() {
|
||||
describe("when the token to accept has higher precedence", [&]() {
|
||||
it("prefers the accept-token action", [&]() {
|
||||
AssertThat(conflict_manager.possible_extensions, IsEmpty());
|
||||
update = conflict_manager.resolve(item_set, AdvanceAction(1, { 1, 2 }, true), AcceptTokenAction(sym3, 3, true));
|
||||
AssertThat(update, IsFalse());
|
||||
AssertThat(conflict_manager.possible_extensions, IsEmpty());
|
||||
});
|
||||
});
|
||||
|
||||
describe("when the token to accept does not have a higher precedence", [&]() {
|
||||
it("favors the advance action and adds the in-progress tokens as possible extensions of the discarded token", [&]() {
|
||||
it("favors the advance action", [&]() {
|
||||
update = conflict_manager.resolve(item_set, AdvanceAction(1, { 1, 2 }, true), AcceptTokenAction(sym3, 2, true));
|
||||
AssertThat(update, IsTrue());
|
||||
AssertThat(conflict_manager.possible_extensions[sym3.index], Contains(sym4.index));
|
||||
});
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -164,7 +164,7 @@ describe("Parser", [&]() {
|
|||
describe("when there is an unterminated error", [&]() {
|
||||
it("maintains a consistent tree", [&]() {
|
||||
ts_document_set_language(document, load_real_language("javascript"));
|
||||
set_text("a; /* b");
|
||||
set_text("a; ' this string never ends");
|
||||
assert_root_node(
|
||||
"(ERROR (program (expression_statement (identifier))) (UNEXPECTED EOF))");
|
||||
});
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue