Merge pull request #1783 from tree-sitter/empty-external-tokens-after-errors
Allow empty external tokens during error recovery, if they change the scanner's state
This commit is contained in:
commit
b37f915520
8 changed files with 229 additions and 88 deletions
|
|
@ -58,7 +58,7 @@ impl ScopeSequence {
|
|||
let mut position = Point { row: 0, column: 0 };
|
||||
for (i, stack) in self.0.iter().enumerate() {
|
||||
let other_stack = &other.0[i];
|
||||
if *stack != *other_stack {
|
||||
if *stack != *other_stack && ![b'\r', b'\n'].contains(&text[i]) {
|
||||
let containing_range = known_changed_ranges
|
||||
.iter()
|
||||
.find(|range| range.start_point <= position && position < range.end_point);
|
||||
|
|
|
|||
|
|
@ -283,6 +283,31 @@ static inline void ts_language_aliases_for_symbol(
|
|||
}
|
||||
}
|
||||
|
||||
static inline void ts_language_write_symbol_as_dot_string(
|
||||
const TSLanguage *self,
|
||||
FILE *f,
|
||||
TSSymbol symbol
|
||||
) {
|
||||
const char *name = ts_language_symbol_name(self, symbol);
|
||||
for (const char *c = name; *c; c++) {
|
||||
switch (*c) {
|
||||
case '"':
|
||||
case '\\':
|
||||
fputc('\\', f);
|
||||
fputc(*c, f);
|
||||
break;
|
||||
case '\n':
|
||||
fputs("\\n", f);
|
||||
break;
|
||||
case '\t':
|
||||
fputs("\\n", f);
|
||||
break;
|
||||
default:
|
||||
fputc(*c, f);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
|||
|
|
@ -160,7 +160,7 @@ static void ts_parser__log(TSParser *self) {
|
|||
if (self->dot_graph_file) {
|
||||
fprintf(self->dot_graph_file, "graph {\nlabel=\"");
|
||||
for (char *c = &self->lexer.debug_buffer[0]; *c != 0; c++) {
|
||||
if (*c == '"') fputc('\\', self->dot_graph_file);
|
||||
if (*c == '"' || *c == '\\') fputc('\\', self->dot_graph_file);
|
||||
fputc(*c, self->dot_graph_file);
|
||||
}
|
||||
fprintf(self->dot_graph_file, "\"\n}\n\n");
|
||||
|
|
@ -393,8 +393,8 @@ static Subtree ts_parser__lex(
|
|||
return NULL_SUBTREE;
|
||||
}
|
||||
|
||||
Length start_position = ts_stack_position(self->stack, version);
|
||||
Subtree external_token = ts_stack_last_external_token(self->stack, version);
|
||||
const Length start_position = ts_stack_position(self->stack, version);
|
||||
const Subtree external_token = ts_stack_last_external_token(self->stack, version);
|
||||
const bool *valid_external_tokens = ts_language_enabled_external_tokens(
|
||||
self->language,
|
||||
lex_mode.external_lex_state
|
||||
|
|
@ -408,6 +408,8 @@ static Subtree ts_parser__lex(
|
|||
Length error_start_position = length_zero();
|
||||
Length error_end_position = length_zero();
|
||||
uint32_t lookahead_end_byte = 0;
|
||||
uint32_t external_scanner_state_len = 0;
|
||||
bool external_scanner_state_changed = false;
|
||||
ts_lexer_reset(&self->lexer, start_position);
|
||||
|
||||
for (;;) {
|
||||
|
|
@ -429,22 +431,36 @@ static Subtree ts_parser__lex(
|
|||
);
|
||||
ts_lexer_finish(&self->lexer, &lookahead_end_byte);
|
||||
|
||||
// Zero-length external tokens are generally allowed, but they're not
|
||||
// allowed right after a syntax error. This is for two reasons:
|
||||
// 1. After a syntax error, the lexer is looking for any possible token,
|
||||
// as opposed to the specific set of tokens that are valid in some
|
||||
// parse state. In this situation, it's very easy for an external
|
||||
// scanner to produce unwanted zero-length tokens.
|
||||
// 2. The parser sometimes inserts *missing* tokens to recover from
|
||||
// errors. These tokens are also zero-length. If we allow more
|
||||
// zero-length tokens to be created after missing tokens, it
|
||||
// can lead to infinite loops. Forbidding zero-length tokens
|
||||
// right at the point of error recovery is a conservative strategy
|
||||
// for preventing this kind of infinite loop.
|
||||
if (found_token && (
|
||||
self->lexer.token_end_position.bytes > current_position.bytes ||
|
||||
(!error_mode && ts_stack_has_advanced_since_error(self->stack, version))
|
||||
)) {
|
||||
if (found_token) {
|
||||
external_scanner_state_len = self->language->external_scanner.serialize(
|
||||
self->external_scanner_payload,
|
||||
self->lexer.debug_buffer
|
||||
);
|
||||
external_scanner_state_changed = !ts_external_scanner_state_eq(
|
||||
ts_subtree_external_scanner_state(external_token),
|
||||
self->lexer.debug_buffer,
|
||||
external_scanner_state_len
|
||||
);
|
||||
|
||||
// When recovering from an error, ignore any zero-length external tokens
|
||||
// unless they have changed the external scanner's state. This helps to
|
||||
// avoid infinite loops which could otherwise occur, because the lexer is
|
||||
// looking for any possible token, instead of looking for the specific set of
|
||||
// tokens that are valid in some parse state.
|
||||
if (
|
||||
self->lexer.token_end_position.bytes == current_position.bytes &&
|
||||
(error_mode || !ts_stack_has_advanced_since_error(self->stack, version)) &&
|
||||
!external_scanner_state_changed
|
||||
) {
|
||||
LOG(
|
||||
"ignore_empty_external_token symbol:%s",
|
||||
SYM_NAME(self->language->external_scanner.symbol_map[self->lexer.data.result_symbol])
|
||||
)
|
||||
found_token = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (found_token) {
|
||||
found_external_token = true;
|
||||
called_get_column = self->lexer.did_get_column;
|
||||
break;
|
||||
|
|
@ -508,11 +524,6 @@ static Subtree ts_parser__lex(
|
|||
parse_state,
|
||||
self->language
|
||||
);
|
||||
|
||||
LOG_LOOKAHEAD(
|
||||
SYM_NAME(ts_subtree_symbol(result)),
|
||||
ts_subtree_total_size(result).bytes
|
||||
);
|
||||
} else {
|
||||
if (self->lexer.token_end_position.bytes < self->lexer.token_start_position.bytes) {
|
||||
self->lexer.token_start_position = self->lexer.token_end_position;
|
||||
|
|
@ -554,23 +565,20 @@ static Subtree ts_parser__lex(
|
|||
);
|
||||
|
||||
if (found_external_token) {
|
||||
unsigned length = self->language->external_scanner.serialize(
|
||||
self->external_scanner_payload,
|
||||
self->lexer.debug_buffer
|
||||
);
|
||||
MutableSubtree mut_result = ts_subtree_to_mut_unsafe(result);
|
||||
ts_external_scanner_state_init(
|
||||
&((SubtreeHeapData *)result.ptr)->external_scanner_state,
|
||||
&mut_result.ptr->external_scanner_state,
|
||||
self->lexer.debug_buffer,
|
||||
length
|
||||
external_scanner_state_len
|
||||
);
|
||||
mut_result.ptr->has_external_scanner_state_change = external_scanner_state_changed;
|
||||
}
|
||||
|
||||
LOG_LOOKAHEAD(
|
||||
SYM_NAME(ts_subtree_symbol(result)),
|
||||
ts_subtree_total_size(result).bytes
|
||||
);
|
||||
}
|
||||
|
||||
LOG_LOOKAHEAD(
|
||||
SYM_NAME(ts_subtree_symbol(result)),
|
||||
ts_subtree_total_size(result).bytes
|
||||
);
|
||||
return result;
|
||||
}
|
||||
|
||||
|
|
@ -1205,6 +1213,15 @@ static void ts_parser__recover(
|
|||
return;
|
||||
}
|
||||
|
||||
if (
|
||||
did_recover &&
|
||||
ts_subtree_has_external_scanner_state_change(lookahead)
|
||||
) {
|
||||
ts_stack_halt(self->stack, version);
|
||||
ts_subtree_release(&self->tree_pool, lookahead);
|
||||
return;
|
||||
}
|
||||
|
||||
// If the parser is still in the error state at the end of the file, just wrap everything
|
||||
// in an ERROR node and terminate.
|
||||
if (ts_subtree_is_eof(lookahead)) {
|
||||
|
|
@ -1539,6 +1556,13 @@ static bool ts_parser__advance(
|
|||
continue;
|
||||
}
|
||||
|
||||
// A non-terminal extra rule was reduced and merged into an existing
|
||||
// stack version. This version can be discarded.
|
||||
if (!lookahead.ptr) {
|
||||
ts_stack_halt(self->stack, version);
|
||||
return true;
|
||||
}
|
||||
|
||||
// If there were no parse actions for the current lookahead token, then
|
||||
// it is not valid in this state. If the current lookahead token is a
|
||||
// keyword, then switch to treating it as the normal word token if that
|
||||
|
|
@ -1928,6 +1952,7 @@ TSTree *ts_parser_parse(
|
|||
}
|
||||
} while (version_count != 0);
|
||||
|
||||
assert(self->finished_tree.ptr);
|
||||
ts_subtree_balance(self->finished_tree, &self->tree_pool, self->language);
|
||||
LOG("done");
|
||||
LOG_TREE(self->finished_tree);
|
||||
|
|
|
|||
|
|
@ -846,11 +846,7 @@ bool ts_stack_print_dot_graph(Stack *self, const TSLanguage *language, FILE *f)
|
|||
fprintf(f, "label=\"");
|
||||
bool quoted = ts_subtree_visible(link.subtree) && !ts_subtree_named(link.subtree);
|
||||
if (quoted) fprintf(f, "'");
|
||||
const char *name = ts_language_symbol_name(language, ts_subtree_symbol(link.subtree));
|
||||
for (const char *c = name; *c; c++) {
|
||||
if (*c == '\"' || *c == '\\') fprintf(f, "\\");
|
||||
fprintf(f, "%c", *c);
|
||||
}
|
||||
ts_language_write_symbol_as_dot_string(language, f, ts_subtree_symbol(link.subtree));
|
||||
if (quoted) fprintf(f, "'");
|
||||
fprintf(f, "\"");
|
||||
fprintf(
|
||||
|
|
|
|||
|
|
@ -21,8 +21,6 @@ typedef struct {
|
|||
#define TS_MAX_INLINE_TREE_LENGTH UINT8_MAX
|
||||
#define TS_MAX_TREE_POOL_SIZE 32
|
||||
|
||||
static const ExternalScannerState empty_state = {{.short_data = {0}}, .length = 0};
|
||||
|
||||
// ExternalScannerState
|
||||
|
||||
void ts_external_scanner_state_init(ExternalScannerState *self, const char *data, unsigned length) {
|
||||
|
|
@ -58,11 +56,10 @@ const char *ts_external_scanner_state_data(const ExternalScannerState *self) {
|
|||
}
|
||||
}
|
||||
|
||||
bool ts_external_scanner_state_eq(const ExternalScannerState *a, const ExternalScannerState *b) {
|
||||
return a == b || (
|
||||
a->length == b->length &&
|
||||
!memcmp(ts_external_scanner_state_data(a), ts_external_scanner_state_data(b), a->length)
|
||||
);
|
||||
bool ts_external_scanner_state_eq(const ExternalScannerState *a, const char *buffer, unsigned length) {
|
||||
return
|
||||
a->length == length &&
|
||||
memcmp(ts_external_scanner_state_data(a), buffer, length) == 0;
|
||||
}
|
||||
|
||||
// SubtreeArray
|
||||
|
|
@ -214,6 +211,7 @@ Subtree ts_subtree_new_leaf(
|
|||
.fragile_right = false,
|
||||
.has_changes = false,
|
||||
.has_external_tokens = has_external_tokens,
|
||||
.has_external_scanner_state_change = false,
|
||||
.depends_on_column = depends_on_column,
|
||||
.is_missing = false,
|
||||
.is_keyword = is_keyword,
|
||||
|
|
@ -381,6 +379,7 @@ void ts_subtree_summarize_children(
|
|||
self.ptr->node_count = 1;
|
||||
self.ptr->has_external_tokens = false;
|
||||
self.ptr->depends_on_column = false;
|
||||
self.ptr->has_external_scanner_state_change = false;
|
||||
self.ptr->dynamic_precedence = 0;
|
||||
|
||||
uint32_t structural_index = 0;
|
||||
|
|
@ -398,6 +397,10 @@ void ts_subtree_summarize_children(
|
|||
self.ptr->depends_on_column = true;
|
||||
}
|
||||
|
||||
if (ts_subtree_has_external_scanner_state_change(child)) {
|
||||
self.ptr->has_external_scanner_state_change = true;
|
||||
}
|
||||
|
||||
if (i == 0) {
|
||||
self.ptr->padding = ts_subtree_padding(child);
|
||||
self.ptr->size = ts_subtree_size(child);
|
||||
|
|
@ -521,6 +524,7 @@ MutableSubtree ts_subtree_new_node(
|
|||
.visible = metadata.visible,
|
||||
.named = metadata.named,
|
||||
.has_changes = false,
|
||||
.has_external_scanner_state_change = false,
|
||||
.fragile_left = fragile,
|
||||
.fragile_right = fragile,
|
||||
.is_keyword = false,
|
||||
|
|
@ -830,18 +834,6 @@ static size_t ts_subtree__write_char_to_string(char *s, size_t n, int32_t c) {
|
|||
return snprintf(s, n, "%d", c);
|
||||
}
|
||||
|
||||
// Write `string` to the stream `f`, escaped so that it can be placed between
// the double quotes of a DOT string (e.g. a `label="..."`).
//
// - `"` and `\` are written with a preceding backslash, so that neither can
//   end the quoted string or begin an unintended escape sequence.
// - A newline is written as the two characters `\n`.
// - Every other byte is written unchanged.
//
// The surrounding quotes are NOT written; the caller is responsible for them.
static void ts_subtree__write_dot_string(FILE *f, const char *string) {
  for (const char *c = string; *c; c++) {
    switch (*c) {
      case '"':
      case '\\':
        fputc('\\', f);
        fputc(*c, f);
        break;
      case '\n':
        fputs("\\n", f);
        break;
      default:
        fputc(*c, f);
        break;
    }
  }
}
|
||||
|
||||
static const char *ROOT_FIELD = "__ROOT__";
|
||||
|
||||
static size_t ts_subtree__write_to_string(
|
||||
|
|
@ -971,7 +963,7 @@ void ts_subtree__print_dot_graph(const Subtree *self, uint32_t start_offset,
|
|||
TSSymbol symbol = alias_symbol ? alias_symbol : subtree_symbol;
|
||||
uint32_t end_offset = start_offset + ts_subtree_total_bytes(*self);
|
||||
fprintf(f, "tree_%p [label=\"", (void *)self);
|
||||
ts_subtree__write_dot_string(f, ts_language_symbol_name(language, symbol));
|
||||
ts_language_write_symbol_as_dot_string(language, f, symbol);
|
||||
fprintf(f, "\"");
|
||||
|
||||
if (ts_subtree_child_count(*self) == 0) fprintf(f, ", shape=plaintext");
|
||||
|
|
@ -1024,14 +1016,26 @@ void ts_subtree_print_dot_graph(Subtree self, const TSLanguage *language, FILE *
|
|||
fprintf(f, "}\n");
|
||||
}
|
||||
|
||||
bool ts_subtree_external_scanner_state_eq(Subtree self, Subtree other) {
|
||||
const ExternalScannerState *state1 = &empty_state;
|
||||
const ExternalScannerState *state2 = &empty_state;
|
||||
if (self.ptr && ts_subtree_has_external_tokens(self) && !self.ptr->child_count) {
|
||||
state1 = &self.ptr->external_scanner_state;
|
||||
const ExternalScannerState *ts_subtree_external_scanner_state(Subtree self) {
|
||||
static const ExternalScannerState empty_state = {{.short_data = {0}}, .length = 0};
|
||||
if (
|
||||
self.ptr &&
|
||||
!self.data.is_inline &&
|
||||
self.ptr->has_external_tokens &&
|
||||
self.ptr->child_count == 0
|
||||
) {
|
||||
return &self.ptr->external_scanner_state;
|
||||
} else {
|
||||
return &empty_state;
|
||||
}
|
||||
if (other.ptr && ts_subtree_has_external_tokens(other) && !other.ptr->child_count) {
|
||||
state2 = &other.ptr->external_scanner_state;
|
||||
}
|
||||
return ts_external_scanner_state_eq(state1, state2);
|
||||
}
|
||||
|
||||
// Report whether two subtrees have equal external scanner states.
// Each subtree's state is looked up with `ts_subtree_external_scanner_state`,
// then the state of `a` is compared against the data and length of `b`'s.
bool ts_subtree_external_scanner_state_eq(Subtree a, Subtree b) {
  const ExternalScannerState *lhs = ts_subtree_external_scanner_state(a);
  const ExternalScannerState *rhs = ts_subtree_external_scanner_state(b);
  const char *rhs_bytes = ts_external_scanner_state_data(rhs);
  return ts_external_scanner_state_eq(lhs, rhs_bytes, rhs->length);
}
|
||||
|
|
|
|||
|
|
@ -114,7 +114,7 @@ typedef struct {
|
|||
Length size;
|
||||
uint32_t lookahead_bytes;
|
||||
uint32_t error_cost;
|
||||
uint32_t child_count;
|
||||
uint16_t child_count;
|
||||
TSSymbol symbol;
|
||||
TSStateId parse_state;
|
||||
|
||||
|
|
@ -125,6 +125,7 @@ typedef struct {
|
|||
bool fragile_right : 1;
|
||||
bool has_changes : 1;
|
||||
bool has_external_tokens : 1;
|
||||
bool has_external_scanner_state_change : 1;
|
||||
bool depends_on_column: 1;
|
||||
bool is_missing : 1;
|
||||
bool is_keyword : 1;
|
||||
|
|
@ -135,8 +136,8 @@ typedef struct {
|
|||
uint32_t visible_child_count;
|
||||
uint32_t named_child_count;
|
||||
uint32_t node_count;
|
||||
uint32_t repeat_depth;
|
||||
int32_t dynamic_precedence;
|
||||
uint16_t repeat_depth;
|
||||
uint16_t production_id;
|
||||
struct {
|
||||
TSSymbol symbol;
|
||||
|
|
@ -174,6 +175,8 @@ typedef struct {
|
|||
|
||||
void ts_external_scanner_state_init(ExternalScannerState *, const char *, unsigned);
|
||||
const char *ts_external_scanner_state_data(const ExternalScannerState *);
|
||||
bool ts_external_scanner_state_eq(const ExternalScannerState *a, const char *, unsigned);
|
||||
void ts_external_scanner_state_delete(ExternalScannerState *self);
|
||||
|
||||
void ts_subtree_array_copy(SubtreeArray, SubtreeArray *);
|
||||
void ts_subtree_array_clear(SubtreePool *, SubtreeArray *);
|
||||
|
|
@ -206,6 +209,7 @@ Subtree ts_subtree_edit(Subtree, const TSInputEdit *edit, SubtreePool *);
|
|||
char *ts_subtree_string(Subtree, const TSLanguage *, bool include_all);
|
||||
void ts_subtree_print_dot_graph(Subtree, const TSLanguage *, FILE *);
|
||||
Subtree ts_subtree_last_external_token(Subtree);
|
||||
const ExternalScannerState *ts_subtree_external_scanner_state(Subtree self);
|
||||
bool ts_subtree_external_scanner_state_eq(Subtree, Subtree);
|
||||
|
||||
#define SUBTREE_GET(self, name) (self.data.is_inline ? self.data.name : self.ptr->name)
|
||||
|
|
@ -331,6 +335,10 @@ static inline bool ts_subtree_has_external_tokens(Subtree self) {
|
|||
return self.data.is_inline ? false : self.ptr->has_external_tokens;
|
||||
}
|
||||
|
||||
// Read the `has_external_scanner_state_change` flag of a subtree.
// Inline subtrees do not go through `ptr`, so they always report false.
static inline bool ts_subtree_has_external_scanner_state_change(Subtree self) {
  if (self.data.is_inline) {
    return false;
  }
  return self.ptr->has_external_scanner_state_change;
}
|
||||
|
||||
static inline bool ts_subtree_depends_on_column(Subtree self) {
|
||||
return self.data.is_inline ? false : self.ptr->depends_on_column;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -14,11 +14,9 @@ OPTIONS
|
|||
|
||||
-a Compile C code with the Clang static analyzer
|
||||
|
||||
-l Run only the corpus tests for the given language
|
||||
|
||||
-e Run only the corpus tests whose names contain the given string
|
||||
|
||||
-t Run only the given trial number of randomized test
|
||||
-i Run the given number of iterations of randomized tests (default 10)
|
||||
|
||||
-s Set the seed used to control random behavior
|
||||
|
||||
|
|
@ -36,7 +34,7 @@ export RUST_BACKTRACE=full
|
|||
mode=normal
|
||||
test_flags=""
|
||||
|
||||
while getopts "adDghl:e:s:t:" option; do
|
||||
while getopts "adDghl:e:s:i:" option; do
|
||||
case ${option} in
|
||||
h)
|
||||
usage
|
||||
|
|
@ -62,6 +60,9 @@ while getopts "adDghl:e:s:t:" option; do
|
|||
s)
|
||||
export TREE_SITTER_SEED=${OPTARG}
|
||||
;;
|
||||
i)
|
||||
export TREE_SITTER_ITERATIONS=${OPTARG}
|
||||
;;
|
||||
d)
|
||||
export TREE_SITTER_LOG=1
|
||||
;;
|
||||
|
|
|
|||
106
test/fixtures/error_corpus/python_errors.txt
vendored
106
test/fixtures/error_corpus/python_errors.txt
vendored
|
|
@ -1,29 +1,111 @@
|
|||
==========================================
|
||||
errors in if statements
|
||||
==========================================
|
||||
=============================================
|
||||
incomplete condition in if statement
|
||||
=============================================
|
||||
|
||||
if a is:
|
||||
print b
|
||||
print c
|
||||
print b
|
||||
print c
|
||||
print d
|
||||
|
||||
---
|
||||
|
||||
(module
|
||||
(if_statement (identifier) (ERROR) (block
|
||||
(print_statement (identifier))
|
||||
(print_statement (identifier)))))
|
||||
(if_statement
|
||||
condition: (identifier)
|
||||
(ERROR)
|
||||
consequence: (block
|
||||
(print_statement argument: (identifier))
|
||||
(print_statement argument: (identifier))))
|
||||
(print_statement argument: (identifier)))
|
||||
|
||||
==========================================
|
||||
errors in function definitions
|
||||
extra colon in function definition
|
||||
==========================================
|
||||
|
||||
def a()::
|
||||
b
|
||||
c
|
||||
d
|
||||
|
||||
---
|
||||
|
||||
(module
|
||||
(function_definition (identifier) (parameters) (ERROR) (block
|
||||
(expression_statement (identifier))
|
||||
(expression_statement (identifier)))))
|
||||
(function_definition
|
||||
name: (identifier)
|
||||
parameters: (parameters)
|
||||
(ERROR)
|
||||
body: (block
|
||||
(expression_statement (identifier))
|
||||
(expression_statement (identifier))))
|
||||
(expression_statement (identifier)))
|
||||
|
||||
========================================================
|
||||
stray if keyword in function definition
|
||||
========================================================
|
||||
|
||||
def a():
|
||||
if
|
||||
|
||||
---
|
||||
|
||||
(module
|
||||
(function_definition
|
||||
name: (identifier)
|
||||
parameters: (parameters)
|
||||
(ERROR)
|
||||
body: (block)))
|
||||
|
||||
========================================================
|
||||
incomplete if statement in function definition
|
||||
========================================================
|
||||
|
||||
def a():
|
||||
if a
|
||||
|
||||
---
|
||||
|
||||
(module
|
||||
(function_definition
|
||||
name: (identifier)
|
||||
parameters: (parameters)
|
||||
(ERROR (identifier))
|
||||
body: (block)))
|
||||
|
||||
========================================================
|
||||
incomplete expression before triple-quoted string
|
||||
========================================================
|
||||
|
||||
def a():
|
||||
b.
|
||||
"""
|
||||
c
|
||||
"""
|
||||
|
||||
---
|
||||
|
||||
(module
|
||||
(function_definition
|
||||
name: (identifier)
|
||||
parameters: (parameters)
|
||||
(ERROR (identifier))
|
||||
body: (block
|
||||
(expression_statement (string)))))
|
||||
|
||||
===========================================
|
||||
incomplete definition in class definition
|
||||
===========================================
|
||||
|
||||
class A:
|
||||
def
|
||||
|
||||
b
|
||||
|
||||
---
|
||||
|
||||
(module
|
||||
(class_definition
|
||||
name: (identifier)
|
||||
(ERROR)
|
||||
body: (block))
|
||||
(expression_statement
|
||||
(identifier)))
|
||||
Loading…
Add table
Add a link
Reference in a new issue