Merge pull request #1668 from tree-sitter/remember-lookahead-bytes-on-error-detection
Remember lookahead bytes on error detection
commit a494d6aa28
5 changed files with 165 additions and 114 deletions
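In short: when error detection pauses a stack version, the parser now stores the entire lookahead Subtree, including the "lookahead bytes" the lexer recorded on it, instead of only its TSSymbol. The sketch below illustrates why that value matters for incremental parsing; node_is_reusable and its parameters are illustrative stand-ins, not part of the tree-sitter API.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

// Hypothetical helper: a node cached from the old tree can be reused during an
// incremental re-parse only if no edit falls inside the byte range the lexer
// examined while recognizing it. That range extends past the node's end by its
// recorded "lookahead bytes", the value this pull request keeps through error
// detection.
static bool node_is_reusable(
  uint32_t node_start_byte,
  uint32_t node_size_bytes,
  uint32_t lookahead_bytes,
  uint32_t edit_start_byte
) {
  uint32_t scanned_end = node_start_byte + node_size_bytes + lookahead_bytes;
  return edit_start_byte >= scanned_end;
}

int main(void) {
  // An edit at byte 10 falls inside the scanned range of a 1-byte node that
  // starts at byte 9 and carries 2 lookahead bytes (scanned through byte 12),
  // so reuse is refused and the token must be re-lexed.
  printf("reusable: %d\n", node_is_reusable(9, 1, 2, 10));
  return 0;
}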
@@ -1,12 +1,17 @@
use super::helpers::{
    allocations,
    edits::invert_edit,
    edits::ReadRecorder,
    fixtures::{get_language, get_test_grammar, get_test_language},
};
use crate::generate::generate_parser_for_grammar;
use crate::parse::{perform_edit, Edit};
use std::sync::atomic::{AtomicUsize, Ordering};
use std::{thread, time};
use crate::{
    generate::generate_parser_for_grammar,
    parse::{perform_edit, Edit},
};
use std::{
    sync::atomic::{AtomicUsize, Ordering},
    thread, time,
};
use tree_sitter::{IncludedRangesError, InputEdit, LogType, Parser, Point, Range};

#[test]
@@ -491,6 +496,44 @@ h + i
    );
}

#[test]
fn test_parsing_after_detecting_error_in_the_middle_of_a_string_token() {
    let mut parser = Parser::new();
    parser.set_language(get_language("python")).unwrap();

    let mut source = b"a = b, 'c, d'".to_vec();
    let tree = parser.parse(&source, None).unwrap();
    assert_eq!(
        tree.root_node().to_sexp(),
        "(module (expression_statement (assignment left: (identifier) right: (expression_list (identifier) (string)))))"
    );

    // Delete a suffix of the source code, starting in the middle of the string
    // literal, after some whitespace. With this deletion, the remaining string
    // content: "c, " looks like two valid python tokens: an identifier and a comma.
    // When this edit is undone, in order to correctly recover the original tree, the
    // parser needs to remember that before matching the `c` as an identifier, it
    // looked ahead several bytes, trying to find the closing quotation mark in
    // order to match the "string content" node.
    let edit_ix = std::str::from_utf8(&source).unwrap().find("d'").unwrap();
    let edit = Edit {
        position: edit_ix,
        deleted_length: source.len() - edit_ix,
        inserted_text: Vec::new(),
    };
    let undo = invert_edit(&source, &edit);

    let mut tree2 = tree.clone();
    perform_edit(&mut tree2, &mut source, &edit);
    tree2 = parser.parse(&source, Some(&tree2)).unwrap();
    assert!(tree2.root_node().has_error());

    let mut tree3 = tree2.clone();
    perform_edit(&mut tree3, &mut source, &undo);
    tree3 = parser.parse(&source, Some(&tree3)).unwrap();
    assert_eq!(tree3.root_node().to_sexp(), tree.root_node().to_sexp(),);
}

// Thread safety

#[test]
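The comment in the new test above describes the scenario in prose; the arithmetic below works through the same bytes. It is plain illustration, not tree-sitter API usage, and the exact value the library records may differ.

#include <stdio.h>
#include <string.h>

int main(void) {
  // After the test's edit, the buffer holds "a = b, 'c, " (the trailing "d'"
  // was deleted). The lexer that eventually emits `c` as an identifier first
  // scanned toward the end of input looking for a closing quote.
  const char *edited = "a = b, 'c, ";
  size_t ident_start = strchr(edited, 'c') - edited; // byte offset of `c`
  size_t ident_end = ident_start + 1;                // the identifier is one byte long
  size_t scanned_end = strlen(edited);               // the scan reached end of input
  printf("identifier occupies bytes %zu..%zu\n", ident_start, ident_end);
  printf("bytes examined past the identifier: %zu\n", scanned_end - ident_end);
  return 0;
}

Undoing the edit re-inserts text inside that examined range, which is why the identifier node must not be silently reused when the original tree is recovered.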

lib/src/parser.c (199 lines changed)
@@ -1060,87 +1060,6 @@ static bool ts_parser__do_all_potential_reductions(
  return can_shift_lookahead_symbol;
}

static void ts_parser__handle_error(
  TSParser *self,
  StackVersion version,
  TSSymbol lookahead_symbol
) {
  uint32_t previous_version_count = ts_stack_version_count(self->stack);

  // Perform any reductions that can happen in this state, regardless of the lookahead. After
  // skipping one or more invalid tokens, the parser might find a token that would have allowed
  // a reduction to take place.
  ts_parser__do_all_potential_reductions(self, version, 0);
  uint32_t version_count = ts_stack_version_count(self->stack);
  Length position = ts_stack_position(self->stack, version);

  // Push a discontinuity onto the stack. Merge all of the stack versions that
  // were created in the previous step.
  bool did_insert_missing_token = false;
  for (StackVersion v = version; v < version_count;) {
    if (!did_insert_missing_token) {
      TSStateId state = ts_stack_state(self->stack, v);
      for (TSSymbol missing_symbol = 1;
           missing_symbol < self->language->token_count;
           missing_symbol++) {
        TSStateId state_after_missing_symbol = ts_language_next_state(
          self->language, state, missing_symbol
        );
        if (state_after_missing_symbol == 0 || state_after_missing_symbol == state) {
          continue;
        }

        if (ts_language_has_reduce_action(
          self->language,
          state_after_missing_symbol,
          lookahead_symbol
        )) {
          // In case the parser is currently outside of any included range, the lexer will
          // snap to the beginning of the next included range. The missing token's padding
          // must be assigned to position it within the next included range.
          ts_lexer_reset(&self->lexer, position);
          ts_lexer_mark_end(&self->lexer);
          Length padding = length_sub(self->lexer.token_end_position, position);

          StackVersion version_with_missing_tree = ts_stack_copy_version(self->stack, v);
          Subtree missing_tree = ts_subtree_new_missing_leaf(
            &self->tree_pool, missing_symbol, padding, self->language
          );
          ts_stack_push(
            self->stack, version_with_missing_tree,
            missing_tree, false,
            state_after_missing_symbol
          );

          if (ts_parser__do_all_potential_reductions(
            self, version_with_missing_tree,
            lookahead_symbol
          )) {
            LOG(
              "recover_with_missing symbol:%s, state:%u",
              SYM_NAME(missing_symbol),
              ts_stack_state(self->stack, version_with_missing_tree)
            );
            did_insert_missing_token = true;
            break;
          }
        }
      }
    }

    ts_stack_push(self->stack, v, NULL_SUBTREE, false, ERROR_STATE);
    v = (v == version) ? previous_version_count : v + 1;
  }

  for (unsigned i = previous_version_count; i < version_count; i++) {
    bool did_merge = ts_stack_merge(self->stack, version, previous_version_count);
    assert(did_merge);
  }

  ts_stack_record_summary(self->stack, version, MAX_SUMMARY_DEPTH);
  LOG_STACK();
}

static bool ts_parser__recover_to_state(
  TSParser *self,
  StackVersion version,
@@ -1368,6 +1287,98 @@ static void ts_parser__recover(
  }
}

static void ts_parser__handle_error(
  TSParser *self,
  StackVersion version,
  Subtree lookahead
) {
  uint32_t previous_version_count = ts_stack_version_count(self->stack);

  // Perform any reductions that can happen in this state, regardless of the lookahead. After
  // skipping one or more invalid tokens, the parser might find a token that would have allowed
  // a reduction to take place.
  ts_parser__do_all_potential_reductions(self, version, 0);
  uint32_t version_count = ts_stack_version_count(self->stack);
  Length position = ts_stack_position(self->stack, version);

  // Push a discontinuity onto the stack. Merge all of the stack versions that
  // were created in the previous step.
  bool did_insert_missing_token = false;
  for (StackVersion v = version; v < version_count;) {
    if (!did_insert_missing_token) {
      TSStateId state = ts_stack_state(self->stack, v);
      for (TSSymbol missing_symbol = 1;
           missing_symbol < self->language->token_count;
           missing_symbol++) {
        TSStateId state_after_missing_symbol = ts_language_next_state(
          self->language, state, missing_symbol
        );
        if (state_after_missing_symbol == 0 || state_after_missing_symbol == state) {
          continue;
        }

        if (ts_language_has_reduce_action(
          self->language,
          state_after_missing_symbol,
          ts_subtree_leaf_symbol(lookahead)
        )) {
          // In case the parser is currently outside of any included range, the lexer will
          // snap to the beginning of the next included range. The missing token's padding
          // must be assigned to position it within the next included range.
          ts_lexer_reset(&self->lexer, position);
          ts_lexer_mark_end(&self->lexer);
          Length padding = length_sub(self->lexer.token_end_position, position);

          StackVersion version_with_missing_tree = ts_stack_copy_version(self->stack, v);
          Subtree missing_tree = ts_subtree_new_missing_leaf(
            &self->tree_pool, missing_symbol, padding, self->language
          );
          ts_stack_push(
            self->stack, version_with_missing_tree,
            missing_tree, false,
            state_after_missing_symbol
          );

          if (ts_parser__do_all_potential_reductions(
            self, version_with_missing_tree,
            ts_subtree_leaf_symbol(lookahead)
          )) {
            LOG(
              "recover_with_missing symbol:%s, state:%u",
              SYM_NAME(missing_symbol),
              ts_stack_state(self->stack, version_with_missing_tree)
            );
            did_insert_missing_token = true;
            break;
          }
        }
      }
    }

    ts_stack_push(self->stack, v, NULL_SUBTREE, false, ERROR_STATE);
    v = (v == version) ? previous_version_count : v + 1;
  }

  for (unsigned i = previous_version_count; i < version_count; i++) {
    bool did_merge = ts_stack_merge(self->stack, version, previous_version_count);
    assert(did_merge);
  }

  ts_stack_record_summary(self->stack, version, MAX_SUMMARY_DEPTH);

  // Begin recovery with the current lookahead node, rather than waiting for the
  // next turn of the parse loop. This ensures that the tree accounts for the
  // current lookahead token's "lookahead bytes" value, which describes how far
  // the lexer needed to look ahead beyond the content of the token in order to
  // recognize it.
  if (ts_subtree_child_count(lookahead) > 0) {
    ts_parser__breakdown_lookahead(self, &lookahead, ERROR_STATE, &self->reusable_node);
  }
  ts_parser__recover(self, version, lookahead);

  LOG_STACK();
}

static bool ts_parser__advance(
  TSParser *self,
  StackVersion version,
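The closing lines of the new ts_parser__handle_error are the heart of the change: recovery starts immediately with the lookahead node in hand, and a lookahead that still has children (a node reused from the old tree) is first broken down so that a single leaf token, carrying its own lookahead bytes, leads the recovery. Below is a rough conceptual sketch of that decomposition, using a toy node type rather than tree-sitter's Subtree.

#include <stdint.h>

// Toy stand-in for a syntax node; illustrative only.
typedef struct ToyNode {
  struct ToyNode **children;
  uint32_t child_count;
  uint32_t lookahead_bytes; // extra bytes the lexer examined past this token
} ToyNode;

// Conceptually, breaking down a non-leaf lookahead descends its leftmost edge
// until a single leaf token sits at the front.
static const ToyNode *toy_leading_leaf(const ToyNode *node) {
  while (node->child_count > 0) {
    node = node->children[0];
  }
  return node;
}

int main(void) {
  ToyNode leaf = {0, 0, 3};
  ToyNode *children[] = {&leaf};
  ToyNode parent = {children, 1, 0};
  // The leading leaf's lookahead_bytes (3 here) is what recovery must preserve.
  return toy_leading_leaf(&parent)->lookahead_bytes == 3 ? 0 : 1;
}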
@@ -1510,23 +1521,18 @@ static bool ts_parser__advance(
      // on the current parse state.
      if (!lookahead.ptr) {
        needs_lex = true;
        continue;
      } else {
        ts_language_table_entry(
          self->language,
          state,
          ts_subtree_leaf_symbol(lookahead),
          &table_entry
        );
      }

      ts_language_table_entry(
        self->language,
        state,
        ts_subtree_leaf_symbol(lookahead),
        &table_entry
      );
      continue;
    }

    if (!lookahead.ptr) {
      ts_stack_pause(self->stack, version, ts_builtin_sym_end);
      return true;
    }

    // If there were no parse actions for the current lookahead token, then
    // it is not valid in this state. If the current lookahead token is a
    // keyword, then switch to treating it as the normal word token if that
@@ -1576,8 +1582,7 @@ static bool ts_parser__advance(
      // version advances successfully, then this version can simply be removed.
      // But if all versions end up paused, then error recovery is needed.
      LOG("detect_error");
      ts_stack_pause(self->stack, version, ts_subtree_leaf_symbol(lookahead));
      ts_subtree_release(&self->tree_pool, lookahead);
      ts_stack_pause(self->stack, version, lookahead);
      return true;
    }
  }
@@ -1660,8 +1665,8 @@ static unsigned ts_parser__condense_stack(TSParser *self) {
    if (!has_unpaused_version && self->accept_count < MAX_VERSION_COUNT) {
      LOG("resume version:%u", i);
      min_error_cost = ts_stack_error_cost(self->stack, i);
      TSSymbol lookahead_symbol = ts_stack_resume(self->stack, i);
      ts_parser__handle_error(self, i, lookahead_symbol);
      Subtree lookahead = ts_stack_resume(self->stack, i);
      ts_parser__handle_error(self, i, lookahead);
      has_unpaused_version = true;
    } else {
      ts_stack_remove_version(self->stack, i);
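Worth noting across the two hunks above: the protocol around pausing changes from copying a symbol to transferring ownership of the node. Previously ts_parser__advance paused a version with ts_subtree_leaf_symbol(lookahead) and released the subtree on the spot, so the lookahead bytes it carried were no longer available when ts_parser__condense_stack resumed that version; now the subtree itself is stored on the paused stack head and handed straight to ts_parser__handle_error. A minimal ownership sketch with toy types (ToyStackHead and ToyLookahead are illustrative, not tree-sitter's structs):

#include <stdbool.h>
#include <stddef.h>

typedef struct { int refcount; } ToyLookahead;

typedef struct {
  ToyLookahead *lookahead_when_paused; // NULL while the version is active
  bool paused;
} ToyStackHead;

// Pausing moves ownership of the lookahead into the stack head; the caller
// must no longer release it here.
static void toy_pause(ToyStackHead *head, ToyLookahead *lookahead) {
  head->paused = true;
  head->lookahead_when_paused = lookahead;
}

// Resuming moves ownership back to the caller, which passes the node directly
// into error handling.
static ToyLookahead *toy_resume(ToyStackHead *head) {
  ToyLookahead *result = head->lookahead_when_paused;
  head->lookahead_when_paused = NULL;
  head->paused = false;
  return result;
}

int main(void) {
  ToyLookahead token = {1};
  ToyStackHead head = {NULL, false};
  toy_pause(&head, &token);
  return toy_resume(&head) == &token ? 0 : 1; // the same node survives the pause
}

This is also why the stack changes below release lookahead_when_paused in stack_head_delete and initialize it to NULL_SUBTREE wherever a head is created.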

lib/src/stack.c
@@ -53,10 +53,10 @@ typedef enum {

typedef struct {
  StackNode *node;
  Subtree last_external_token;
  StackSummary *summary;
  unsigned node_count_at_last_error;
  TSSymbol lookahead_when_paused;
  Subtree last_external_token;
  Subtree lookahead_when_paused;
  StackStatus status;
} StackHead;

@@ -256,6 +256,9 @@ static void stack_head_delete(
    if (self->last_external_token.ptr) {
      ts_subtree_release(subtree_pool, self->last_external_token);
    }
    if (self->lookahead_when_paused.ptr) {
      ts_subtree_release(subtree_pool, self->lookahead_when_paused);
    }
    if (self->summary) {
      array_delete(self->summary);
      ts_free(self->summary);
@@ -274,7 +277,7 @@ static StackVersion ts_stack__add_version(
    .node_count_at_last_error = self->heads.contents[original_version].node_count_at_last_error,
    .last_external_token = self->heads.contents[original_version].last_external_token,
    .status = StackStatusActive,
    .lookahead_when_paused = 0,
    .lookahead_when_paused = NULL_SUBTREE,
  };
  array_push(&self->heads, head);
  stack_node_retain(node);
@@ -703,7 +706,7 @@ void ts_stack_halt(Stack *self, StackVersion version) {
  array_get(&self->heads, version)->status = StackStatusHalted;
}

void ts_stack_pause(Stack *self, StackVersion version, TSSymbol lookahead) {
void ts_stack_pause(Stack *self, StackVersion version, Subtree lookahead) {
  StackHead *head = array_get(&self->heads, version);
  head->status = StackStatusPaused;
  head->lookahead_when_paused = lookahead;
@@ -722,12 +725,12 @@ bool ts_stack_is_paused(const Stack *self, StackVersion version) {
  return array_get(&self->heads, version)->status == StackStatusPaused;
}

TSSymbol ts_stack_resume(Stack *self, StackVersion version) {
Subtree ts_stack_resume(Stack *self, StackVersion version) {
  StackHead *head = array_get(&self->heads, version);
  assert(head->status == StackStatusPaused);
  TSSymbol result = head->lookahead_when_paused;
  Subtree result = head->lookahead_when_paused;
  head->status = StackStatusActive;
  head->lookahead_when_paused = 0;
  head->lookahead_when_paused = NULL_SUBTREE;
  return result;
}

@@ -739,9 +742,9 @@ void ts_stack_clear(Stack *self) {
  array_clear(&self->heads);
  array_push(&self->heads, ((StackHead) {
    .node = self->base_node,
    .last_external_token = NULL_SUBTREE,
    .status = StackStatusActive,
    .lookahead_when_paused = 0,
    .last_external_token = NULL_SUBTREE,
    .lookahead_when_paused = NULL_SUBTREE,
  }));
}


lib/src/stack.h
@@ -99,9 +99,9 @@ bool ts_stack_merge(Stack *, StackVersion, StackVersion);
// Determine whether the given two stack versions can be merged.
bool ts_stack_can_merge(Stack *, StackVersion, StackVersion);

TSSymbol ts_stack_resume(Stack *, StackVersion);
Subtree ts_stack_resume(Stack *, StackVersion);

void ts_stack_pause(Stack *, StackVersion, TSSymbol);
void ts_stack_pause(Stack *, StackVersion, Subtree);

void ts_stack_halt(Stack *, StackVersion);


test/fixtures/error_corpus/c_errors.txt (vendored, 4 lines changed)
@@ -128,8 +128,8 @@ int main() {
    (declaration (primitive_type) (init_declarator
      (identifier)
      (parenthesized_expression
        (number_literal)
        (ERROR (number_literal))))))))
        (ERROR (number_literal))
        (number_literal)))))))

========================================
Extra identifiers in declarations