Merge pull request #1668 from tree-sitter/remember-lookahead-bytes-on-error-detection

Remember lookahead bytes on error detection
This commit is contained in:
Max Brunsfeld 2022-02-26 14:59:40 -08:00 committed by GitHub
commit a494d6aa28
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 165 additions and 114 deletions

View file

@ -1,12 +1,17 @@
use super::helpers::{
allocations,
edits::invert_edit,
edits::ReadRecorder,
fixtures::{get_language, get_test_grammar, get_test_language},
};
use crate::generate::generate_parser_for_grammar;
use crate::parse::{perform_edit, Edit};
use std::sync::atomic::{AtomicUsize, Ordering};
use std::{thread, time};
use crate::{
generate::generate_parser_for_grammar,
parse::{perform_edit, Edit},
};
use std::{
sync::atomic::{AtomicUsize, Ordering},
thread, time,
};
use tree_sitter::{IncludedRangesError, InputEdit, LogType, Parser, Point, Range};
#[test]
@ -491,6 +496,44 @@ h + i
);
}
#[test]
fn test_parsing_after_detecting_error_in_the_middle_of_a_string_token() {
let mut parser = Parser::new();
parser.set_language(get_language("python")).unwrap();
let mut source = b"a = b, 'c, d'".to_vec();
let tree = parser.parse(&source, None).unwrap();
assert_eq!(
tree.root_node().to_sexp(),
"(module (expression_statement (assignment left: (identifier) right: (expression_list (identifier) (string)))))"
);
// Delete a suffix of the source code, starting in the middle of the string
// literal, after some whitespace. With this deletion, the remaining string
// content: "c, " looks like two valid python tokens: an identifier and a comma.
// When this edit is undone, in order correctly recover the orginal tree, the
// parser needs to remember that before matching the `c` as an identifier, it
// lookahead ahead several bytes, trying to find the closing quotation mark in
// order to match the "string content" node.
let edit_ix = std::str::from_utf8(&source).unwrap().find("d'").unwrap();
let edit = Edit {
position: edit_ix,
deleted_length: source.len() - edit_ix,
inserted_text: Vec::new(),
};
let undo = invert_edit(&source, &edit);
let mut tree2 = tree.clone();
perform_edit(&mut tree2, &mut source, &edit);
tree2 = parser.parse(&source, Some(&tree2)).unwrap();
assert!(tree2.root_node().has_error());
let mut tree3 = tree2.clone();
perform_edit(&mut tree3, &mut source, &undo);
tree3 = parser.parse(&source, Some(&tree3)).unwrap();
assert_eq!(tree3.root_node().to_sexp(), tree.root_node().to_sexp(),);
}
// Thread safety
#[test]

View file

@ -1060,87 +1060,6 @@ static bool ts_parser__do_all_potential_reductions(
return can_shift_lookahead_symbol;
}
static void ts_parser__handle_error(
TSParser *self,
StackVersion version,
TSSymbol lookahead_symbol
) {
uint32_t previous_version_count = ts_stack_version_count(self->stack);
// Perform any reductions that can happen in this state, regardless of the lookahead. After
// skipping one or more invalid tokens, the parser might find a token that would have allowed
// a reduction to take place.
ts_parser__do_all_potential_reductions(self, version, 0);
uint32_t version_count = ts_stack_version_count(self->stack);
Length position = ts_stack_position(self->stack, version);
// Push a discontinuity onto the stack. Merge all of the stack versions that
// were created in the previous step.
bool did_insert_missing_token = false;
for (StackVersion v = version; v < version_count;) {
if (!did_insert_missing_token) {
TSStateId state = ts_stack_state(self->stack, v);
for (TSSymbol missing_symbol = 1;
missing_symbol < self->language->token_count;
missing_symbol++) {
TSStateId state_after_missing_symbol = ts_language_next_state(
self->language, state, missing_symbol
);
if (state_after_missing_symbol == 0 || state_after_missing_symbol == state) {
continue;
}
if (ts_language_has_reduce_action(
self->language,
state_after_missing_symbol,
lookahead_symbol
)) {
// In case the parser is currently outside of any included range, the lexer will
// snap to the beginning of the next included range. The missing token's padding
// must be assigned to position it within the next included range.
ts_lexer_reset(&self->lexer, position);
ts_lexer_mark_end(&self->lexer);
Length padding = length_sub(self->lexer.token_end_position, position);
StackVersion version_with_missing_tree = ts_stack_copy_version(self->stack, v);
Subtree missing_tree = ts_subtree_new_missing_leaf(
&self->tree_pool, missing_symbol, padding, self->language
);
ts_stack_push(
self->stack, version_with_missing_tree,
missing_tree, false,
state_after_missing_symbol
);
if (ts_parser__do_all_potential_reductions(
self, version_with_missing_tree,
lookahead_symbol
)) {
LOG(
"recover_with_missing symbol:%s, state:%u",
SYM_NAME(missing_symbol),
ts_stack_state(self->stack, version_with_missing_tree)
);
did_insert_missing_token = true;
break;
}
}
}
}
ts_stack_push(self->stack, v, NULL_SUBTREE, false, ERROR_STATE);
v = (v == version) ? previous_version_count : v + 1;
}
for (unsigned i = previous_version_count; i < version_count; i++) {
bool did_merge = ts_stack_merge(self->stack, version, previous_version_count);
assert(did_merge);
}
ts_stack_record_summary(self->stack, version, MAX_SUMMARY_DEPTH);
LOG_STACK();
}
static bool ts_parser__recover_to_state(
TSParser *self,
StackVersion version,
@ -1368,6 +1287,98 @@ static void ts_parser__recover(
}
}
static void ts_parser__handle_error(
TSParser *self,
StackVersion version,
Subtree lookahead
) {
uint32_t previous_version_count = ts_stack_version_count(self->stack);
// Perform any reductions that can happen in this state, regardless of the lookahead. After
// skipping one or more invalid tokens, the parser might find a token that would have allowed
// a reduction to take place.
ts_parser__do_all_potential_reductions(self, version, 0);
uint32_t version_count = ts_stack_version_count(self->stack);
Length position = ts_stack_position(self->stack, version);
// Push a discontinuity onto the stack. Merge all of the stack versions that
// were created in the previous step.
bool did_insert_missing_token = false;
for (StackVersion v = version; v < version_count;) {
if (!did_insert_missing_token) {
TSStateId state = ts_stack_state(self->stack, v);
for (TSSymbol missing_symbol = 1;
missing_symbol < self->language->token_count;
missing_symbol++) {
TSStateId state_after_missing_symbol = ts_language_next_state(
self->language, state, missing_symbol
);
if (state_after_missing_symbol == 0 || state_after_missing_symbol == state) {
continue;
}
if (ts_language_has_reduce_action(
self->language,
state_after_missing_symbol,
ts_subtree_leaf_symbol(lookahead)
)) {
// In case the parser is currently outside of any included range, the lexer will
// snap to the beginning of the next included range. The missing token's padding
// must be assigned to position it within the next included range.
ts_lexer_reset(&self->lexer, position);
ts_lexer_mark_end(&self->lexer);
Length padding = length_sub(self->lexer.token_end_position, position);
StackVersion version_with_missing_tree = ts_stack_copy_version(self->stack, v);
Subtree missing_tree = ts_subtree_new_missing_leaf(
&self->tree_pool, missing_symbol, padding, self->language
);
ts_stack_push(
self->stack, version_with_missing_tree,
missing_tree, false,
state_after_missing_symbol
);
if (ts_parser__do_all_potential_reductions(
self, version_with_missing_tree,
ts_subtree_leaf_symbol(lookahead)
)) {
LOG(
"recover_with_missing symbol:%s, state:%u",
SYM_NAME(missing_symbol),
ts_stack_state(self->stack, version_with_missing_tree)
);
did_insert_missing_token = true;
break;
}
}
}
}
ts_stack_push(self->stack, v, NULL_SUBTREE, false, ERROR_STATE);
v = (v == version) ? previous_version_count : v + 1;
}
for (unsigned i = previous_version_count; i < version_count; i++) {
bool did_merge = ts_stack_merge(self->stack, version, previous_version_count);
assert(did_merge);
}
ts_stack_record_summary(self->stack, version, MAX_SUMMARY_DEPTH);
// Begin recovery with the current lookahead node, rather than waiting for the
// next turn of the parse loop. This ensures that the tree accounts for the the
// current lookahead token's "lookahead bytes" value, which describes how far
// the lexer needed to look ahead beyond the content of the token in order to
// recognize it.
if (ts_subtree_child_count(lookahead) > 0) {
ts_parser__breakdown_lookahead(self, &lookahead, ERROR_STATE, &self->reusable_node);
}
ts_parser__recover(self, version, lookahead);
LOG_STACK();
}
static bool ts_parser__advance(
TSParser *self,
StackVersion version,
@ -1510,23 +1521,18 @@ static bool ts_parser__advance(
// on the current parse state.
if (!lookahead.ptr) {
needs_lex = true;
continue;
} else {
ts_language_table_entry(
self->language,
state,
ts_subtree_leaf_symbol(lookahead),
&table_entry
);
}
ts_language_table_entry(
self->language,
state,
ts_subtree_leaf_symbol(lookahead),
&table_entry
);
continue;
}
if (!lookahead.ptr) {
ts_stack_pause(self->stack, version, ts_builtin_sym_end);
return true;
}
// If there were no parse actions for the current lookahead token, then
// it is not valid in this state. If the current lookahead token is a
// keyword, then switch to treating it as the normal word token if that
@ -1576,8 +1582,7 @@ static bool ts_parser__advance(
// version advances successfully, then this version can simply be removed.
// But if all versions end up paused, then error recovery is needed.
LOG("detect_error");
ts_stack_pause(self->stack, version, ts_subtree_leaf_symbol(lookahead));
ts_subtree_release(&self->tree_pool, lookahead);
ts_stack_pause(self->stack, version, lookahead);
return true;
}
}
@ -1660,8 +1665,8 @@ static unsigned ts_parser__condense_stack(TSParser *self) {
if (!has_unpaused_version && self->accept_count < MAX_VERSION_COUNT) {
LOG("resume version:%u", i);
min_error_cost = ts_stack_error_cost(self->stack, i);
TSSymbol lookahead_symbol = ts_stack_resume(self->stack, i);
ts_parser__handle_error(self, i, lookahead_symbol);
Subtree lookahead = ts_stack_resume(self->stack, i);
ts_parser__handle_error(self, i, lookahead);
has_unpaused_version = true;
} else {
ts_stack_remove_version(self->stack, i);

View file

@ -53,10 +53,10 @@ typedef enum {
typedef struct {
StackNode *node;
Subtree last_external_token;
StackSummary *summary;
unsigned node_count_at_last_error;
TSSymbol lookahead_when_paused;
Subtree last_external_token;
Subtree lookahead_when_paused;
StackStatus status;
} StackHead;
@ -256,6 +256,9 @@ static void stack_head_delete(
if (self->last_external_token.ptr) {
ts_subtree_release(subtree_pool, self->last_external_token);
}
if (self->lookahead_when_paused.ptr) {
ts_subtree_release(subtree_pool, self->lookahead_when_paused);
}
if (self->summary) {
array_delete(self->summary);
ts_free(self->summary);
@ -274,7 +277,7 @@ static StackVersion ts_stack__add_version(
.node_count_at_last_error = self->heads.contents[original_version].node_count_at_last_error,
.last_external_token = self->heads.contents[original_version].last_external_token,
.status = StackStatusActive,
.lookahead_when_paused = 0,
.lookahead_when_paused = NULL_SUBTREE,
};
array_push(&self->heads, head);
stack_node_retain(node);
@ -703,7 +706,7 @@ void ts_stack_halt(Stack *self, StackVersion version) {
array_get(&self->heads, version)->status = StackStatusHalted;
}
void ts_stack_pause(Stack *self, StackVersion version, TSSymbol lookahead) {
void ts_stack_pause(Stack *self, StackVersion version, Subtree lookahead) {
StackHead *head = array_get(&self->heads, version);
head->status = StackStatusPaused;
head->lookahead_when_paused = lookahead;
@ -722,12 +725,12 @@ bool ts_stack_is_paused(const Stack *self, StackVersion version) {
return array_get(&self->heads, version)->status == StackStatusPaused;
}
TSSymbol ts_stack_resume(Stack *self, StackVersion version) {
Subtree ts_stack_resume(Stack *self, StackVersion version) {
StackHead *head = array_get(&self->heads, version);
assert(head->status == StackStatusPaused);
TSSymbol result = head->lookahead_when_paused;
Subtree result = head->lookahead_when_paused;
head->status = StackStatusActive;
head->lookahead_when_paused = 0;
head->lookahead_when_paused = NULL_SUBTREE;
return result;
}
@ -739,9 +742,9 @@ void ts_stack_clear(Stack *self) {
array_clear(&self->heads);
array_push(&self->heads, ((StackHead) {
.node = self->base_node,
.last_external_token = NULL_SUBTREE,
.status = StackStatusActive,
.lookahead_when_paused = 0,
.last_external_token = NULL_SUBTREE,
.lookahead_when_paused = NULL_SUBTREE,
}));
}

View file

@ -99,9 +99,9 @@ bool ts_stack_merge(Stack *, StackVersion, StackVersion);
// Determine whether the given two stack versions can be merged.
bool ts_stack_can_merge(Stack *, StackVersion, StackVersion);
TSSymbol ts_stack_resume(Stack *, StackVersion);
Subtree ts_stack_resume(Stack *, StackVersion);
void ts_stack_pause(Stack *, StackVersion, TSSymbol);
void ts_stack_pause(Stack *, StackVersion, Subtree);
void ts_stack_halt(Stack *, StackVersion);

View file

@ -128,8 +128,8 @@ int main() {
(declaration (primitive_type) (init_declarator
(identifier)
(parenthesized_expression
(number_literal)
(ERROR (number_literal))))))))
(ERROR (number_literal))
(number_literal)))))))
========================================
Extra identifiers in declarations