Merge pull request #1783 from tree-sitter/empty-external-tokens-after-errors

Allow empty external tokens during error recovery, if they change the scanner's state
This commit is contained in:
Max Brunsfeld 2022-06-25 17:31:35 -07:00 committed by GitHub
commit b37f915520
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 229 additions and 88 deletions

View file

@ -58,7 +58,7 @@ impl ScopeSequence {
let mut position = Point { row: 0, column: 0 };
for (i, stack) in self.0.iter().enumerate() {
let other_stack = &other.0[i];
if *stack != *other_stack {
if *stack != *other_stack && ![b'\r', b'\n'].contains(&text[i]) {
let containing_range = known_changed_ranges
.iter()
.find(|range| range.start_point <= position && position < range.end_point);

View file

@ -283,6 +283,31 @@ static inline void ts_language_aliases_for_symbol(
}
}
// Write the name of `symbol` to `f`, escaped so that it can be embedded
// inside a double-quoted string in a Graphviz DOT file.
//
// - `"` and `\` are prefixed with a backslash.
// - A newline is written as the two characters `\n`.
// - A tab is written as the two characters `\t`.
// - Every other byte is written unchanged.
//
// The surrounding quotes are NOT written; the caller is responsible for them.
static inline void ts_language_write_symbol_as_dot_string(
  const TSLanguage *self,
  FILE *f,
  TSSymbol symbol
) {
  const char *name = ts_language_symbol_name(self, symbol);
  for (const char *c = name; *c; c++) {
    switch (*c) {
      case '"':
      case '\\':
        // Escape characters that would otherwise terminate or corrupt the
        // quoted DOT string.
        fputc('\\', f);
        fputc(*c, f);
        break;
      case '\n':
        fputs("\\n", f);
        break;
      case '\t':
        // Emit a tab escape (previously this mistakenly emitted "\\n").
        fputs("\\t", f);
        break;
      default:
        fputc(*c, f);
        break;
    }
  }
}
#ifdef __cplusplus
}

View file

@ -160,7 +160,7 @@ static void ts_parser__log(TSParser *self) {
if (self->dot_graph_file) {
fprintf(self->dot_graph_file, "graph {\nlabel=\"");
for (char *c = &self->lexer.debug_buffer[0]; *c != 0; c++) {
if (*c == '"') fputc('\\', self->dot_graph_file);
if (*c == '"' || *c == '\\') fputc('\\', self->dot_graph_file);
fputc(*c, self->dot_graph_file);
}
fprintf(self->dot_graph_file, "\"\n}\n\n");
@ -393,8 +393,8 @@ static Subtree ts_parser__lex(
return NULL_SUBTREE;
}
Length start_position = ts_stack_position(self->stack, version);
Subtree external_token = ts_stack_last_external_token(self->stack, version);
const Length start_position = ts_stack_position(self->stack, version);
const Subtree external_token = ts_stack_last_external_token(self->stack, version);
const bool *valid_external_tokens = ts_language_enabled_external_tokens(
self->language,
lex_mode.external_lex_state
@ -408,6 +408,8 @@ static Subtree ts_parser__lex(
Length error_start_position = length_zero();
Length error_end_position = length_zero();
uint32_t lookahead_end_byte = 0;
uint32_t external_scanner_state_len = 0;
bool external_scanner_state_changed = false;
ts_lexer_reset(&self->lexer, start_position);
for (;;) {
@ -429,22 +431,36 @@ static Subtree ts_parser__lex(
);
ts_lexer_finish(&self->lexer, &lookahead_end_byte);
// Zero-length external tokens are generally allowed, but they're not
// allowed right after a syntax error. This is for two reasons:
// 1. After a syntax error, the lexer is looking for any possible token,
// as opposed to the specific set of tokens that are valid in some
// parse state. In this situation, it's very easy for an external
// scanner to produce unwanted zero-length tokens.
// 2. The parser sometimes inserts *missing* tokens to recover from
// errors. These tokens are also zero-length. If we allow more
// zero-length tokens to be created after missing tokens, it
// can lead to infinite loops. Forbidding zero-length tokens
// right at the point of error recovery is a conservative strategy
// for preventing this kind of infinite loop.
if (found_token && (
self->lexer.token_end_position.bytes > current_position.bytes ||
(!error_mode && ts_stack_has_advanced_since_error(self->stack, version))
)) {
if (found_token) {
external_scanner_state_len = self->language->external_scanner.serialize(
self->external_scanner_payload,
self->lexer.debug_buffer
);
external_scanner_state_changed = !ts_external_scanner_state_eq(
ts_subtree_external_scanner_state(external_token),
self->lexer.debug_buffer,
external_scanner_state_len
);
// When recovering from an error, ignore any zero-length external tokens
// unless they have changed the external scanner's state. This helps to
// avoid infinite loops which could otherwise occur, because the lexer is
// looking for any possible token, instead of looking for the specific set of
// tokens that are valid in some parse state.
if (
self->lexer.token_end_position.bytes == current_position.bytes &&
(error_mode || !ts_stack_has_advanced_since_error(self->stack, version)) &&
!external_scanner_state_changed
) {
LOG(
"ignore_empty_external_token symbol:%s",
SYM_NAME(self->language->external_scanner.symbol_map[self->lexer.data.result_symbol])
)
found_token = false;
}
}
if (found_token) {
found_external_token = true;
called_get_column = self->lexer.did_get_column;
break;
@ -508,11 +524,6 @@ static Subtree ts_parser__lex(
parse_state,
self->language
);
LOG_LOOKAHEAD(
SYM_NAME(ts_subtree_symbol(result)),
ts_subtree_total_size(result).bytes
);
} else {
if (self->lexer.token_end_position.bytes < self->lexer.token_start_position.bytes) {
self->lexer.token_start_position = self->lexer.token_end_position;
@ -554,23 +565,20 @@ static Subtree ts_parser__lex(
);
if (found_external_token) {
unsigned length = self->language->external_scanner.serialize(
self->external_scanner_payload,
self->lexer.debug_buffer
);
MutableSubtree mut_result = ts_subtree_to_mut_unsafe(result);
ts_external_scanner_state_init(
&((SubtreeHeapData *)result.ptr)->external_scanner_state,
&mut_result.ptr->external_scanner_state,
self->lexer.debug_buffer,
length
external_scanner_state_len
);
mut_result.ptr->has_external_scanner_state_change = external_scanner_state_changed;
}
LOG_LOOKAHEAD(
SYM_NAME(ts_subtree_symbol(result)),
ts_subtree_total_size(result).bytes
);
}
LOG_LOOKAHEAD(
SYM_NAME(ts_subtree_symbol(result)),
ts_subtree_total_size(result).bytes
);
return result;
}
@ -1205,6 +1213,15 @@ static void ts_parser__recover(
return;
}
if (
did_recover &&
ts_subtree_has_external_scanner_state_change(lookahead)
) {
ts_stack_halt(self->stack, version);
ts_subtree_release(&self->tree_pool, lookahead);
return;
}
// If the parser is still in the error state at the end of the file, just wrap everything
// in an ERROR node and terminate.
if (ts_subtree_is_eof(lookahead)) {
@ -1539,6 +1556,13 @@ static bool ts_parser__advance(
continue;
}
// A non-terminal extra rule was reduced and merged into an existing
// stack version. This version can be discarded.
if (!lookahead.ptr) {
ts_stack_halt(self->stack, version);
return true;
}
// If there were no parse actions for the current lookahead token, then
// it is not valid in this state. If the current lookahead token is a
// keyword, then switch to treating it as the normal word token if that
@ -1928,6 +1952,7 @@ TSTree *ts_parser_parse(
}
} while (version_count != 0);
assert(self->finished_tree.ptr);
ts_subtree_balance(self->finished_tree, &self->tree_pool, self->language);
LOG("done");
LOG_TREE(self->finished_tree);

View file

@ -846,11 +846,7 @@ bool ts_stack_print_dot_graph(Stack *self, const TSLanguage *language, FILE *f)
fprintf(f, "label=\"");
bool quoted = ts_subtree_visible(link.subtree) && !ts_subtree_named(link.subtree);
if (quoted) fprintf(f, "'");
const char *name = ts_language_symbol_name(language, ts_subtree_symbol(link.subtree));
for (const char *c = name; *c; c++) {
if (*c == '\"' || *c == '\\') fprintf(f, "\\");
fprintf(f, "%c", *c);
}
ts_language_write_symbol_as_dot_string(language, f, ts_subtree_symbol(link.subtree));
if (quoted) fprintf(f, "'");
fprintf(f, "\"");
fprintf(

View file

@ -21,8 +21,6 @@ typedef struct {
#define TS_MAX_INLINE_TREE_LENGTH UINT8_MAX
#define TS_MAX_TREE_POOL_SIZE 32
static const ExternalScannerState empty_state = {{.short_data = {0}}, .length = 0};
// ExternalScannerState
void ts_external_scanner_state_init(ExternalScannerState *self, const char *data, unsigned length) {
@ -58,11 +56,10 @@ const char *ts_external_scanner_state_data(const ExternalScannerState *self) {
}
}
bool ts_external_scanner_state_eq(const ExternalScannerState *a, const ExternalScannerState *b) {
return a == b || (
a->length == b->length &&
!memcmp(ts_external_scanner_state_data(a), ts_external_scanner_state_data(b), a->length)
);
// Report whether the serialized state stored in `a` is byte-for-byte equal to
// the first `length` bytes of `buffer`. States of different lengths are never
// equal.
bool ts_external_scanner_state_eq(const ExternalScannerState *a, const char *buffer, unsigned length) {
  if (a->length != length) return false;
  const char *stored = ts_external_scanner_state_data(a);
  return memcmp(stored, buffer, length) == 0;
}
// SubtreeArray
@ -214,6 +211,7 @@ Subtree ts_subtree_new_leaf(
.fragile_right = false,
.has_changes = false,
.has_external_tokens = has_external_tokens,
.has_external_scanner_state_change = false,
.depends_on_column = depends_on_column,
.is_missing = false,
.is_keyword = is_keyword,
@ -381,6 +379,7 @@ void ts_subtree_summarize_children(
self.ptr->node_count = 1;
self.ptr->has_external_tokens = false;
self.ptr->depends_on_column = false;
self.ptr->has_external_scanner_state_change = false;
self.ptr->dynamic_precedence = 0;
uint32_t structural_index = 0;
@ -398,6 +397,10 @@ void ts_subtree_summarize_children(
self.ptr->depends_on_column = true;
}
if (ts_subtree_has_external_scanner_state_change(child)) {
self.ptr->has_external_scanner_state_change = true;
}
if (i == 0) {
self.ptr->padding = ts_subtree_padding(child);
self.ptr->size = ts_subtree_size(child);
@ -521,6 +524,7 @@ MutableSubtree ts_subtree_new_node(
.visible = metadata.visible,
.named = metadata.named,
.has_changes = false,
.has_external_scanner_state_change = false,
.fragile_left = fragile,
.fragile_right = fragile,
.is_keyword = false,
@ -830,18 +834,6 @@ static size_t ts_subtree__write_char_to_string(char *s, size_t n, int32_t c) {
return snprintf(s, n, "%d", c);
}
static void ts_subtree__write_dot_string(FILE *f, const char *string) {
for (const char *c = string; *c; c++) {
if (*c == '"') {
fputs("\\\"", f);
} else if (*c == '\n') {
fputs("\\n", f);
} else {
fputc(*c, f);
}
}
}
static const char *ROOT_FIELD = "__ROOT__";
static size_t ts_subtree__write_to_string(
@ -971,7 +963,7 @@ void ts_subtree__print_dot_graph(const Subtree *self, uint32_t start_offset,
TSSymbol symbol = alias_symbol ? alias_symbol : subtree_symbol;
uint32_t end_offset = start_offset + ts_subtree_total_bytes(*self);
fprintf(f, "tree_%p [label=\"", (void *)self);
ts_subtree__write_dot_string(f, ts_language_symbol_name(language, symbol));
ts_language_write_symbol_as_dot_string(language, f, symbol);
fprintf(f, "\"");
if (ts_subtree_child_count(*self) == 0) fprintf(f, ", shape=plaintext");
@ -1024,14 +1016,26 @@ void ts_subtree_print_dot_graph(Subtree self, const TSLanguage *language, FILE *
fprintf(f, "}\n");
}
bool ts_subtree_external_scanner_state_eq(Subtree self, Subtree other) {
const ExternalScannerState *state1 = &empty_state;
const ExternalScannerState *state2 = &empty_state;
if (self.ptr && ts_subtree_has_external_tokens(self) && !self.ptr->child_count) {
state1 = &self.ptr->external_scanner_state;
const ExternalScannerState *ts_subtree_external_scanner_state(Subtree self) {
static const ExternalScannerState empty_state = {{.short_data = {0}}, .length = 0};
if (
self.ptr &&
!self.data.is_inline &&
self.ptr->has_external_tokens &&
self.ptr->child_count == 0
) {
return &self.ptr->external_scanner_state;
} else {
return &empty_state;
}
if (other.ptr && ts_subtree_has_external_tokens(other) && !other.ptr->child_count) {
state2 = &other.ptr->external_scanner_state;
}
return ts_external_scanner_state_eq(state1, state2);
}
// Compare the external scanner states associated with two subtrees, as
// returned by `ts_subtree_external_scanner_state` for each of them.
bool ts_subtree_external_scanner_state_eq(Subtree a, Subtree b) {
  const ExternalScannerState *lhs = ts_subtree_external_scanner_state(a);
  const ExternalScannerState *rhs = ts_subtree_external_scanner_state(b);
  const char *rhs_bytes = ts_external_scanner_state_data(rhs);
  return ts_external_scanner_state_eq(lhs, rhs_bytes, rhs->length);
}

View file

@ -114,7 +114,7 @@ typedef struct {
Length size;
uint32_t lookahead_bytes;
uint32_t error_cost;
uint32_t child_count;
uint16_t child_count;
TSSymbol symbol;
TSStateId parse_state;
@ -125,6 +125,7 @@ typedef struct {
bool fragile_right : 1;
bool has_changes : 1;
bool has_external_tokens : 1;
bool has_external_scanner_state_change : 1;
bool depends_on_column: 1;
bool is_missing : 1;
bool is_keyword : 1;
@ -135,8 +136,8 @@ typedef struct {
uint32_t visible_child_count;
uint32_t named_child_count;
uint32_t node_count;
uint32_t repeat_depth;
int32_t dynamic_precedence;
uint16_t repeat_depth;
uint16_t production_id;
struct {
TSSymbol symbol;
@ -174,6 +175,8 @@ typedef struct {
void ts_external_scanner_state_init(ExternalScannerState *, const char *, unsigned);
const char *ts_external_scanner_state_data(const ExternalScannerState *);
bool ts_external_scanner_state_eq(const ExternalScannerState *a, const char *, unsigned);
void ts_external_scanner_state_delete(ExternalScannerState *self);
void ts_subtree_array_copy(SubtreeArray, SubtreeArray *);
void ts_subtree_array_clear(SubtreePool *, SubtreeArray *);
@ -206,6 +209,7 @@ Subtree ts_subtree_edit(Subtree, const TSInputEdit *edit, SubtreePool *);
char *ts_subtree_string(Subtree, const TSLanguage *, bool include_all);
void ts_subtree_print_dot_graph(Subtree, const TSLanguage *, FILE *);
Subtree ts_subtree_last_external_token(Subtree);
const ExternalScannerState *ts_subtree_external_scanner_state(Subtree self);
bool ts_subtree_external_scanner_state_eq(Subtree, Subtree);
#define SUBTREE_GET(self, name) (self.data.is_inline ? self.data.name : self.ptr->name)
@ -331,6 +335,10 @@ static inline bool ts_subtree_has_external_tokens(Subtree self) {
return self.data.is_inline ? false : self.ptr->has_external_tokens;
}
// Read the `has_external_scanner_state_change` flag of a subtree.
// Inline subtrees do not carry this flag, so they always report false.
static inline bool ts_subtree_has_external_scanner_state_change(Subtree self) {
  if (self.data.is_inline) {
    return false;
  }
  return self.ptr->has_external_scanner_state_change;
}
// Read the `depends_on_column` flag of a subtree.
// Inline subtrees do not carry this flag, so they always report false.
static inline bool ts_subtree_depends_on_column(Subtree self) {
  if (self.data.is_inline) {
    return false;
  }
  return self.ptr->depends_on_column;
}

View file

@ -14,11 +14,9 @@ OPTIONS
-a Compile C code with the Clang static analyzer
-l Run only the corpus tests for the given language
-e Run only the corpus tests whose name contain the given string
-t Run only the given trial number of randomized test
-i Run the given number of iterations of randomized tests (default 10)
-s Set the seed used to control random behavior
@ -36,7 +34,7 @@ export RUST_BACKTRACE=full
mode=normal
test_flags=""
while getopts "adDghl:e:s:t:" option; do
while getopts "adDghl:e:s:i:" option; do
case ${option} in
h)
usage
@ -62,6 +60,9 @@ while getopts "adDghl:e:s:t:" option; do
s)
export TREE_SITTER_SEED=${OPTARG}
;;
i)
export TREE_SITTER_ITERATIONS=${OPTARG}
;;
d)
export TREE_SITTER_LOG=1
;;

View file

@ -1,29 +1,111 @@
==========================================
errors in if statements
==========================================
=============================================
incomplete condition in if statement
=============================================
if a is:
print b
print c
print b
print c
print d
---
(module
(if_statement (identifier) (ERROR) (block
(print_statement (identifier))
(print_statement (identifier)))))
(if_statement
condition: (identifier)
(ERROR)
consequence: (block
(print_statement argument: (identifier))
(print_statement argument: (identifier))))
(print_statement argument: (identifier)))
==========================================
errors in function definitions
extra colon in function definition
==========================================
def a()::
b
c
d
---
(module
(function_definition (identifier) (parameters) (ERROR) (block
(expression_statement (identifier))
(expression_statement (identifier)))))
(function_definition
name: (identifier)
parameters: (parameters)
(ERROR)
body: (block
(expression_statement (identifier))
(expression_statement (identifier))))
(expression_statement (identifier)))
========================================================
stray if keyword in function definition
========================================================
def a():
if
---
(module
(function_definition
name: (identifier)
parameters: (parameters)
(ERROR)
body: (block)))
========================================================
incomplete if statement in function definition
========================================================
def a():
if a
---
(module
(function_definition
name: (identifier)
parameters: (parameters)
(ERROR (identifier))
body: (block)))
========================================================
incomplete expression before triple-quoted string
========================================================
def a():
b.
"""
c
"""
---
(module
(function_definition
name: (identifier)
parameters: (parameters)
(ERROR (identifier))
body: (block
(expression_statement (string)))))
===========================================
incomplete definition in class definition
===========================================
class A:
def
b
---
(module
(class_definition
name: (identifier)
(ERROR)
body: (block))
(expression_statement
(identifier)))