Merge pull request #35 from tree-sitter/handle-invalid-chars-at-eof
Handle invalid chars at EOF
This commit is contained in:
commit
f8b85965a3
9 changed files with 94 additions and 112 deletions
|
|
@ -28,19 +28,11 @@ typedef struct {
|
|||
bool structural : 1;
|
||||
} TSSymbolMetadata;
|
||||
|
||||
typedef enum {
|
||||
TSTransitionTypeMain,
|
||||
TSTransitionTypeSeparator,
|
||||
TSTransitionTypeError,
|
||||
} TSTransitionType;
|
||||
|
||||
typedef struct TSLexer {
|
||||
void (*advance)(struct TSLexer *, TSStateId, TSTransitionType);
|
||||
void (*advance)(struct TSLexer *, TSStateId, bool);
|
||||
|
||||
TSLength current_position;
|
||||
TSLength token_end_position;
|
||||
TSLength token_start_position;
|
||||
TSLength error_end_position;
|
||||
|
||||
const char *chunk;
|
||||
size_t chunk_start;
|
||||
|
|
@ -48,10 +40,7 @@ typedef struct TSLexer {
|
|||
|
||||
size_t lookahead_size;
|
||||
int32_t lookahead;
|
||||
TSStateId starting_state;
|
||||
TSSymbol result_symbol;
|
||||
bool result_follows_error;
|
||||
int32_t first_unexpected_character;
|
||||
|
||||
TSInput input;
|
||||
TSDebugger debugger;
|
||||
|
|
@ -94,7 +83,7 @@ struct TSLanguage {
|
|||
const unsigned short *parse_table;
|
||||
const TSParseActionEntry *parse_actions;
|
||||
const TSStateId *lex_states;
|
||||
bool (*lex_fn)(TSLexer *, TSStateId, bool);
|
||||
bool (*lex_fn)(TSLexer *, TSStateId);
|
||||
};
|
||||
|
||||
/*
|
||||
|
|
@ -106,22 +95,18 @@ struct TSLanguage {
|
|||
next_state: \
|
||||
lookahead = lexer->lookahead;
|
||||
|
||||
#define GO_TO_STATE(state_value) \
|
||||
{ \
|
||||
#define ADVANCE(state_value) \
|
||||
{ \
|
||||
lexer->advance(lexer, state_value, false); \
|
||||
state = state_value; \
|
||||
goto next_state; \
|
||||
}
|
||||
|
||||
#define ADVANCE(state_value) \
|
||||
{ \
|
||||
lexer->advance(lexer, state_value, TSTransitionTypeMain); \
|
||||
GO_TO_STATE(state_value); \
|
||||
}
|
||||
|
||||
#define SKIP(state_value) \
|
||||
{ \
|
||||
lexer->advance(lexer, state_value, TSTransitionTypeSeparator); \
|
||||
GO_TO_STATE(state_value); \
|
||||
lexer->advance(lexer, state_value, true); \
|
||||
state = state_value; \
|
||||
goto next_state; \
|
||||
}
|
||||
|
||||
#define ACCEPT_TOKEN(symbol_value) \
|
||||
|
|
@ -130,14 +115,7 @@ struct TSLanguage {
|
|||
return true; \
|
||||
}
|
||||
|
||||
#define LEX_ERROR() \
|
||||
if (error_mode) { \
|
||||
if (state == TS_STATE_ERROR) \
|
||||
lexer->advance(lexer, state, TSTransitionTypeError); \
|
||||
GO_TO_STATE(TS_STATE_ERROR); \
|
||||
} else { \
|
||||
return false; \
|
||||
}
|
||||
#define LEX_ERROR() return false
|
||||
|
||||
/*
|
||||
* Parse Table Macros
|
||||
|
|
|
|||
|
|
@ -162,6 +162,15 @@ describe("Parser", [&]() {
|
|||
AssertThat(get_node_text(last), Equals("true"));
|
||||
});
|
||||
});
|
||||
|
||||
// Regression case for invalid characters that run up to the end of input:
// the text "a; /* b" opens a block comment that is never closed. The test
// only requires that a tree is still produced and that the trailing
// error is reported as "UNEXPECTED EOF" rather than as a character.
describe("when there is an unterminated error", [&]() {
|
||||
it("maintains a consistent tree", [&]() {
|
||||
ts_document_set_language(doc, get_test_language("javascript"));
|
||||
set_text("a; /* b");
|
||||
assert_root_node(
|
||||
"(ERROR (program (expression_statement (identifier))) (UNEXPECTED EOF))");
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe("handling extra tokens", [&]() {
|
||||
|
|
|
|||
|
|
@ -184,7 +184,7 @@ class CCodeGenerator {
|
|||
|
||||
void add_lex_function() {
|
||||
line(
|
||||
"static bool ts_lex(TSLexer *lexer, TSStateId state, bool error_mode) {");
|
||||
"static bool ts_lex(TSLexer *lexer, TSStateId state) {");
|
||||
indent([&]() {
|
||||
line("START_LEXER();");
|
||||
_switch("state", [&]() {
|
||||
|
|
|
|||
|
|
@ -14,6 +14,10 @@ static inline void ts_length_set_unknown(TSLength *self) {
|
|||
self->columns = 0;
|
||||
}
|
||||
|
||||
/*
 * Return whichever of the two lengths has the smaller `chars` count.
 * Only the `chars` field is compared; when both are equal, `len2` is
 * returned.
 */
static inline TSLength ts_length_min(TSLength len1, TSLength len2) {
  if (len1.chars < len2.chars) {
    return len1;
  }
  return len2;
}
|
||||
|
||||
static inline TSLength ts_length_add(TSLength len1, TSLength len2) {
|
||||
TSLength result;
|
||||
result.chars = len1.chars + len2.chars;
|
||||
|
|
|
|||
|
|
@ -47,8 +47,7 @@ static void ts_lexer__get_lookahead(TSLexer *self) {
|
|||
LOG_LOOKAHEAD();
|
||||
}
|
||||
|
||||
static void ts_lexer__advance(TSLexer *self, TSStateId state,
|
||||
TSTransitionType transition_type) {
|
||||
static void ts_lexer__advance(TSLexer *self, TSStateId state, bool skip) {
|
||||
|
||||
if (self->chunk == empty_chunk)
|
||||
return;
|
||||
|
|
@ -56,7 +55,6 @@ static void ts_lexer__advance(TSLexer *self, TSStateId state,
|
|||
if (self->lookahead_size) {
|
||||
self->current_position.bytes += self->lookahead_size;
|
||||
self->current_position.chars++;
|
||||
|
||||
if (self->lookahead == '\n') {
|
||||
self->current_position.rows++;
|
||||
self->current_position.columns = 0;
|
||||
|
|
@ -65,25 +63,11 @@ static void ts_lexer__advance(TSLexer *self, TSStateId state,
|
|||
}
|
||||
}
|
||||
|
||||
switch (transition_type) {
|
||||
case TSTransitionTypeSeparator:
|
||||
if (self->result_follows_error) {
|
||||
LOG("skip_error state:%d", state);
|
||||
} else {
|
||||
LOG("skip_separator state:%d", state);
|
||||
self->token_start_position = self->current_position;
|
||||
}
|
||||
break;
|
||||
case TSTransitionTypeError:
|
||||
LOG("skip_error state:%d", state);
|
||||
self->result_follows_error = true;
|
||||
self->error_end_position = self->current_position;
|
||||
if (!self->first_unexpected_character)
|
||||
self->first_unexpected_character = self->lookahead;
|
||||
break;
|
||||
default:
|
||||
LOG("advance state:%d", state);
|
||||
break;
|
||||
if (skip) {
|
||||
LOG("skip_separator state:%d", state);
|
||||
self->token_start_position = self->current_position;
|
||||
} else {
|
||||
LOG("advance state:%d", state);
|
||||
}
|
||||
|
||||
if (self->current_position.bytes >= self->chunk_start + self->chunk_size)
|
||||
|
|
@ -109,7 +93,6 @@ void ts_lexer_init(TSLexer *self) {
|
|||
|
||||
static inline void ts_lexer__reset(TSLexer *self, TSLength position) {
|
||||
self->token_start_position = position;
|
||||
self->token_end_position = position;
|
||||
self->current_position = position;
|
||||
|
||||
self->chunk = 0;
|
||||
|
|
@ -132,34 +115,12 @@ void ts_lexer_reset(TSLexer *self, TSLength position) {
|
|||
|
||||
void ts_lexer_start(TSLexer *self, TSStateId lex_state) {
|
||||
LOG("start_lex state:%d, pos:%lu", lex_state, self->current_position.chars);
|
||||
LOG_LOOKAHEAD();
|
||||
|
||||
self->starting_state = lex_state;
|
||||
self->token_start_position = self->current_position;
|
||||
self->result_follows_error = false;
|
||||
self->result_symbol = 0;
|
||||
self->first_unexpected_character = 0;
|
||||
|
||||
if (!self->chunk)
|
||||
ts_lexer__get_chunk(self);
|
||||
if (!self->lookahead_size)
|
||||
ts_lexer__get_lookahead(self);
|
||||
}
|
||||
|
||||
void ts_lexer_finish(TSLexer *self, TSLexerResult *result) {
|
||||
result->padding =
|
||||
ts_length_sub(self->token_start_position, self->token_end_position);
|
||||
|
||||
if (self->result_follows_error) {
|
||||
result->symbol = ts_builtin_sym_error;
|
||||
result->size =
|
||||
ts_length_sub(self->error_end_position, self->token_start_position);
|
||||
result->first_unexpected_character = self->first_unexpected_character;
|
||||
ts_lexer_reset(self, self->error_end_position);
|
||||
} else {
|
||||
result->symbol = self->result_symbol;
|
||||
result->size =
|
||||
ts_length_sub(self->current_position, self->token_start_position);
|
||||
self->token_end_position = self->current_position;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -7,18 +7,10 @@ extern "C" {
|
|||
|
||||
#include "tree_sitter/parser.h"
|
||||
|
||||
typedef struct {
|
||||
TSSymbol symbol;
|
||||
TSLength padding;
|
||||
TSLength size;
|
||||
int32_t first_unexpected_character;
|
||||
} TSLexerResult;
|
||||
|
||||
void ts_lexer_init(TSLexer *);
|
||||
void ts_lexer_set_input(TSLexer *, TSInput);
|
||||
void ts_lexer_reset(TSLexer *, TSLength);
|
||||
void ts_lexer_start(TSLexer *, TSStateId);
|
||||
void ts_lexer_finish(TSLexer *, TSLexerResult *);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
|||
|
|
@ -242,37 +242,67 @@ static bool parser__condense_stack(Parser *self) {
|
|||
return result;
|
||||
}
|
||||
|
||||
static TSTree *parser__lex(Parser *self, TSStateId parse_state, bool error_mode) {
|
||||
TSStateId state = self->language->lex_states[parse_state];
|
||||
LOG("lex state:%d", state);
|
||||
static TSTree *parser__lex(Parser *self, TSStateId parse_state) {
|
||||
TSStateId start_state = self->language->lex_states[parse_state];
|
||||
TSStateId current_state = start_state;
|
||||
TSLength start_position = self->lexer.current_position;
|
||||
TSLength position = start_position;
|
||||
LOG("lex state:%d", start_state);
|
||||
|
||||
TSLength position = self->lexer.current_position;
|
||||
bool skipped_error = false;
|
||||
int32_t first_error_character = 0;
|
||||
TSLength error_start_position, error_end_position;
|
||||
|
||||
ts_lexer_start(&self->lexer, state);
|
||||
if (!self->language->lex_fn(&self->lexer, state, error_mode)) {
|
||||
ts_lexer_reset(&self->lexer, position);
|
||||
ts_lexer_start(&self->lexer, state);
|
||||
assert(self->language->lex_fn(&self->lexer, TS_STATE_ERROR, true));
|
||||
ts_lexer_start(&self->lexer, start_state);
|
||||
|
||||
while (!self->language->lex_fn(&self->lexer, current_state)) {
|
||||
if (current_state != TS_STATE_ERROR) {
|
||||
LOG("retry_in_error_mode");
|
||||
ts_lexer_reset(&self->lexer, position);
|
||||
ts_lexer_start(&self->lexer, start_state);
|
||||
current_state = TS_STATE_ERROR;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (self->lexer.lookahead == 0) {
|
||||
self->lexer.result_symbol = ts_builtin_sym_error;
|
||||
break;
|
||||
}
|
||||
|
||||
if (self->lexer.current_position.chars == position.chars) {
|
||||
if (!skipped_error) {
|
||||
error_start_position = self->lexer.current_position;
|
||||
first_error_character = self->lexer.lookahead;
|
||||
}
|
||||
skipped_error = true;
|
||||
self->lexer.advance(&self->lexer, TS_STATE_ERROR, false);
|
||||
error_end_position = self->lexer.current_position;
|
||||
}
|
||||
|
||||
position = self->lexer.current_position;
|
||||
}
|
||||
|
||||
TSLexerResult lex_result;
|
||||
ts_lexer_finish(&self->lexer, &lex_result);
|
||||
|
||||
TSTree *result;
|
||||
if (lex_result.symbol == ts_builtin_sym_error) {
|
||||
result = ts_tree_make_error(lex_result.size, lex_result.padding,
|
||||
lex_result.first_unexpected_character);
|
||||
|
||||
if (skipped_error) {
|
||||
error_start_position = ts_length_min(error_start_position, self->lexer.token_start_position);
|
||||
TSLength padding = ts_length_sub(error_start_position, start_position);
|
||||
TSLength size = ts_length_sub(error_end_position, error_start_position);
|
||||
ts_lexer_reset(&self->lexer, error_end_position);
|
||||
result = ts_tree_make_error(size, padding, first_error_character);
|
||||
} else {
|
||||
result = ts_tree_make_leaf(
|
||||
lex_result.symbol, lex_result.padding, lex_result.size,
|
||||
ts_language_symbol_metadata(self->language, lex_result.symbol));
|
||||
if (!result)
|
||||
return NULL;
|
||||
result->parse_state = parse_state;
|
||||
TSSymbol symbol = self->lexer.result_symbol;
|
||||
TSLength padding = ts_length_sub(self->lexer.token_start_position, start_position);
|
||||
TSLength size = ts_length_sub(self->lexer.current_position, self->lexer.token_start_position);
|
||||
result = ts_tree_make_leaf(symbol, padding, size,
|
||||
ts_language_symbol_metadata(self->language, symbol));
|
||||
}
|
||||
|
||||
result->first_leaf.lex_state = state;
|
||||
if (!result)
|
||||
return NULL;
|
||||
|
||||
result->parse_state = parse_state;
|
||||
result->first_leaf.lex_state = start_state;
|
||||
return result;
|
||||
}
|
||||
|
||||
|
|
@ -333,8 +363,7 @@ static TSTree *parser__get_lookahead(Parser *self, StackVersion version,
|
|||
|
||||
ts_lexer_reset(&self->lexer, position);
|
||||
TSStateId parse_state = ts_stack_top_state(self->stack, version);
|
||||
bool error_mode = parse_state == TS_STATE_ERROR;
|
||||
return parser__lex(self, parse_state, error_mode);
|
||||
return parser__lex(self, parse_state);
|
||||
|
||||
error:
|
||||
return NULL;
|
||||
|
|
|
|||
|
|
@ -369,6 +369,15 @@ void ts_tree_edit(TSTree *self, TSInputEdit edit) {
|
|||
}
|
||||
}
|
||||
|
||||
/*
 * Format a lookahead character for display inside an "(UNEXPECTED ...)" node.
 *
 *   c == 0        -> EOF
 *   0 < c < 128   -> the character in single quotes, e.g. 'a'
 *   anything else -> the numeric code point in decimal
 *
 * `s` / `n` are passed straight to snprintf, so `s` may be NULL when `n` is 0
 * to measure the required length. Returns the number of characters that the
 * full output needs (excluding the terminating NUL), or 0 if snprintf reports
 * an error.
 */
static size_t ts_tree__write_char_to_string(char *s, size_t n, int32_t c) {
  int written;

  if (c == 0) {
    written = snprintf(s, n, "EOF");
  } else if (c > 0 && c < 128) {
    written = snprintf(s, n, "'%c'", (int)c);
  } else {
    /* Negative or non-ASCII values cannot be shown with %c; print the number. */
    written = snprintf(s, n, "%d", (int)c);
  }

  /* snprintf returns a negative int on error; do not let it wrap to a huge
   * size_t that the caller would add to its cursor. */
  return written < 0 ? 0 : (size_t)written;
}
|
||||
|
||||
static size_t ts_tree__write_to_string(const TSTree *self,
|
||||
const TSLanguage *language, char *string,
|
||||
size_t limit, bool is_root,
|
||||
|
|
@ -386,8 +395,8 @@ static size_t ts_tree__write_to_string(const TSTree *self,
|
|||
if (visible) {
|
||||
if (self->symbol == ts_builtin_sym_error && self->child_count == 0 &&
|
||||
self->size.chars > 0) {
|
||||
cursor +=
|
||||
snprintf(*writer, limit, "(UNEXPECTED '%c'", self->lookahead_char);
|
||||
cursor += snprintf(*writer, limit, "(UNEXPECTED ");
|
||||
cursor += ts_tree__write_char_to_string(*writer, limit, self->lookahead_char);
|
||||
} else {
|
||||
cursor += snprintf(*writer, limit, "(%s",
|
||||
ts_language_symbol_name(language, self->symbol));
|
||||
|
|
|
|||
|
|
@ -25,7 +25,7 @@ typedef struct TSTree {
|
|||
size_t named_child_count;
|
||||
union {
|
||||
struct TSTree **children;
|
||||
char lookahead_char;
|
||||
int32_t lookahead_char;
|
||||
};
|
||||
|
||||
TSLength padding;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue