Start work on error recovery
- In the runtime, make parse errors part of the parse tree
- Add an error state to lexers, in which they can accept any token
This commit is contained in:
parent
4520d6e1a2
commit
e58a6d8ba7
18 changed files with 622 additions and 528 deletions
|
|
@ -14,105 +14,111 @@ namespace tree_sitter {
|
|||
using rules::CharacterSet;
|
||||
|
||||
namespace build_tables {
|
||||
static int NOT_FOUND = -1;
|
||||
static int NOT_FOUND = -2;
|
||||
static Symbol START("start", rules::SymbolTypeAuxiliary);
|
||||
static Symbol END_OF_INPUT("end", rules::SymbolTypeAuxiliary);
|
||||
|
||||
class TableBuilder {
|
||||
const PreparedGrammar grammar;
|
||||
const PreparedGrammar lex_grammar;
|
||||
map<const ParseItemSet, size_t> parse_state_indices;
|
||||
map<const LexItemSet, size_t> lex_state_indices;
|
||||
map<const ParseItemSet, ParseStateId> parse_state_ids;
|
||||
map<const LexItemSet, LexStateId> lex_state_ids;
|
||||
ParseTable parse_table;
|
||||
LexTable lex_table;
|
||||
|
||||
long parse_state_index_for_item_set(const ParseItemSet &item_set) const {
|
||||
auto entry = parse_state_indices.find(item_set);
|
||||
return (entry == parse_state_indices.end()) ? NOT_FOUND : entry->second;
|
||||
long parse_state_id_for_item_set(const ParseItemSet &item_set) const {
|
||||
auto entry = parse_state_ids.find(item_set);
|
||||
return (entry == parse_state_ids.end()) ? NOT_FOUND : entry->second;
|
||||
}
|
||||
|
||||
long lex_state_index_for_item_set(const LexItemSet &item_set) const {
|
||||
auto entry = lex_state_indices.find(item_set);
|
||||
return (entry == lex_state_indices.end()) ? NOT_FOUND : entry->second;
|
||||
long lex_state_id_for_item_set(const LexItemSet &item_set) const {
|
||||
auto entry = lex_state_ids.find(item_set);
|
||||
return (entry == lex_state_ids.end()) ? NOT_FOUND : entry->second;
|
||||
}
|
||||
|
||||
void add_shift_actions(const ParseItemSet &item_set, size_t state_index) {
|
||||
void add_shift_actions(const ParseItemSet &item_set, ParseStateId state_id) {
|
||||
for (auto transition : sym_transitions(item_set, grammar)) {
|
||||
Symbol symbol = transition.first;
|
||||
ParseItemSet item_set = transition.second;
|
||||
size_t new_state_index = add_parse_state(item_set);
|
||||
parse_table.add_action(state_index, symbol, ParseAction::Shift(new_state_index));
|
||||
ParseStateId new_state_id = add_parse_state(item_set);
|
||||
parse_table.add_action(state_id, symbol, ParseAction::Shift(new_state_id));
|
||||
}
|
||||
}
|
||||
|
||||
void add_advance_actions(const LexItemSet &item_set, size_t state_index) {
|
||||
void add_advance_actions(const LexItemSet &item_set, LexStateId state_id) {
|
||||
for (auto transition : char_transitions(item_set, grammar)) {
|
||||
CharacterSet rule = transition.first;
|
||||
LexItemSet item_set = transition.second;
|
||||
size_t new_state_index = add_lex_state(item_set);
|
||||
lex_table.add_action(state_index, rule, LexAction::Advance(new_state_index));
|
||||
LexStateId new_state_id = add_lex_state(item_set);
|
||||
lex_table.add_action(state_id, rule, LexAction::Advance(new_state_id));
|
||||
}
|
||||
}
|
||||
|
||||
void add_accept_token_actions(const LexItemSet &item_set, size_t state_index) {
|
||||
void add_accept_token_actions(const LexItemSet &item_set, LexStateId state_id) {
|
||||
for (LexItem item : item_set) {
|
||||
if (item.is_done()) {
|
||||
lex_table.add_default_action(state_index, LexAction::Accept(item.lhs));
|
||||
lex_table.add_default_action(state_id, LexAction::Accept(item.lhs));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void add_reduce_actions(const ParseItemSet &item_set, size_t state_index) {
|
||||
void add_reduce_actions(const ParseItemSet &item_set, ParseStateId state_id) {
|
||||
for (ParseItem item : item_set) {
|
||||
if (item.is_done()) {
|
||||
ParseAction action = (item.lhs == START) ?
|
||||
ParseAction::Accept() :
|
||||
ParseAction::Reduce(item.lhs, item.consumed_symbols);
|
||||
parse_table.add_action(state_index, item.lookahead_sym, action);
|
||||
parse_table.add_action(state_id, item.lookahead_sym, action);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void assign_lex_state(size_t state_index) {
|
||||
ParseState &state = parse_table.states[state_index];
|
||||
void assign_lex_state(ParseStateId state_id) {
|
||||
ParseState &state = parse_table.states[state_id];
|
||||
LexItemSet item_set;
|
||||
for (auto &symbol : state.expected_inputs()) {
|
||||
if (symbol == END_OF_INPUT)
|
||||
item_set.insert(LexItem(symbol, make_shared<CharacterSet>(std::set<rules::CharacterRange>{ '\0' })));
|
||||
if (lex_grammar.has_definition(symbol))
|
||||
item_set.insert(LexItem(symbol, lex_grammar.rule(symbol)));
|
||||
}
|
||||
|
||||
state.lex_state_index = add_lex_state(item_set);
|
||||
state.lex_state_id = add_lex_state(item_set);
|
||||
}
|
||||
|
||||
size_t add_lex_state(const LexItemSet &item_set) {
|
||||
auto state_index = lex_state_index_for_item_set(item_set);
|
||||
if (state_index == NOT_FOUND) {
|
||||
state_index = lex_table.add_state();
|
||||
lex_state_indices[item_set] = state_index;
|
||||
add_advance_actions(item_set, state_index);
|
||||
add_accept_token_actions(item_set, state_index);
|
||||
LexStateId add_lex_state(const LexItemSet &item_set) {
|
||||
auto state_id = lex_state_id_for_item_set(item_set);
|
||||
if (state_id == NOT_FOUND) {
|
||||
state_id = lex_table.add_state();
|
||||
lex_state_ids[item_set] = state_id;
|
||||
add_advance_actions(item_set, state_id);
|
||||
add_accept_token_actions(item_set, state_id);
|
||||
}
|
||||
return state_index;
|
||||
return state_id;
|
||||
}
|
||||
|
||||
size_t add_parse_state(const ParseItemSet &item_set) {
|
||||
auto state_index = parse_state_index_for_item_set(item_set);
|
||||
if (state_index == NOT_FOUND) {
|
||||
state_index = parse_table.add_state();
|
||||
parse_state_indices[item_set] = state_index;
|
||||
ParseStateId add_parse_state(const ParseItemSet &item_set) {
|
||||
auto state_id = parse_state_id_for_item_set(item_set);
|
||||
if (state_id == NOT_FOUND) {
|
||||
state_id = parse_table.add_state();
|
||||
parse_state_ids[item_set] = state_id;
|
||||
|
||||
add_shift_actions(item_set, state_index);
|
||||
add_reduce_actions(item_set, state_index);
|
||||
assign_lex_state(state_index);
|
||||
add_shift_actions(item_set, state_id);
|
||||
add_reduce_actions(item_set, state_id);
|
||||
assign_lex_state(state_id);
|
||||
}
|
||||
return state_index;
|
||||
return state_id;
|
||||
}
|
||||
|
||||
void add_error_lex_state() {
|
||||
LexItemSet error_item_set;
|
||||
for (auto &pair : lex_grammar.rules)
|
||||
error_item_set.insert(LexItem(pair.first, pair.second));
|
||||
add_advance_actions(error_item_set, LexTable::ERROR_STATE_ID);
|
||||
add_accept_token_actions(error_item_set, LexTable::ERROR_STATE_ID);
|
||||
}
|
||||
|
||||
// void dump_item_sets() {
|
||||
// std::vector<const ParseItemSet *> item_sets(parse_state_indices.size());
|
||||
// for (auto &pair : parse_state_indices)
|
||||
// std::vector<const ParseItemSet *> item_sets(parse_state_ids.size());
|
||||
// for (auto &pair : parse_state_ids)
|
||||
// item_sets[pair.second] = &pair.first;
|
||||
//
|
||||
// for (int i = 0; i < item_sets.size(); i++) {
|
||||
|
|
@ -135,6 +141,7 @@ namespace tree_sitter {
|
|||
auto item = ParseItem(START, make_shared<Symbol>(grammar.start_rule_name), {}, END_OF_INPUT);
|
||||
ParseItemSet item_set = item_set_closure(ParseItemSet({ item }), grammar);
|
||||
add_parse_state(item_set);
|
||||
add_error_lex_state();
|
||||
return pair<ParseTable, LexTable>(parse_table, lex_table);
|
||||
}
|
||||
};
|
||||
|
|
|
|||
|
|
@ -164,17 +164,13 @@ namespace tree_sitter {
|
|||
return input;
|
||||
}
|
||||
|
||||
string lex_error_call(const set<rules::CharacterSet> &expected_inputs) {
|
||||
rules::CharacterSet expected_set;
|
||||
for (auto &rule : expected_inputs)
|
||||
expected_set.add_set(rule);
|
||||
|
||||
string result = "LEX_ERROR(" + to_string(expected_set.ranges.size()) + ", EXPECT({";
|
||||
string parse_error_call(const set<rules::Symbol> &expected_inputs) {
|
||||
string result = "PARSE_ERROR(" + to_string(expected_inputs.size()) + ", EXPECT({";
|
||||
bool started = false;
|
||||
for (auto &range : expected_set.ranges) {
|
||||
for (auto &symbol : expected_inputs) {
|
||||
if (started) result += ", ";
|
||||
started = true;
|
||||
result += "\"" + escape_string(range.to_string()) + "\"";
|
||||
result += symbol_id(symbol);
|
||||
}
|
||||
result += "}));";
|
||||
return result;
|
||||
|
|
@ -183,7 +179,7 @@ namespace tree_sitter {
|
|||
string code_for_lex_actions(const set<LexAction> &actions, const set<rules::CharacterSet> &expected_inputs) {
|
||||
auto action = actions.begin();
|
||||
if (action == actions.end()) {
|
||||
return lex_error_call(expected_inputs);
|
||||
return "LEX_ERROR();";
|
||||
} else {
|
||||
switch (action->type) {
|
||||
case LexActionTypeAdvance:
|
||||
|
|
@ -198,11 +194,12 @@ namespace tree_sitter {
|
|||
|
||||
string code_for_parse_state(const ParseState &parse_state) {
|
||||
string body = "";
|
||||
auto expected_inputs = parse_state.expected_inputs();
|
||||
for (auto pair : parse_state.actions)
|
||||
body += _case(symbol_id(pair.first), code_for_parse_actions(pair.second, parse_state.expected_inputs()));
|
||||
body += _default("PARSE_PANIC();");
|
||||
body += _case(symbol_id(pair.first), code_for_parse_actions(pair.second, expected_inputs));
|
||||
body += _default(parse_error_call(expected_inputs));
|
||||
return
|
||||
string("SET_LEX_STATE(") + to_string(parse_state.lex_state_index) + ");\n" +
|
||||
string("SET_LEX_STATE(") + to_string(parse_state.lex_state_id) + ");\n" +
|
||||
_switch("LOOKAHEAD_SYM()", body);
|
||||
}
|
||||
|
||||
|
|
@ -227,6 +224,7 @@ namespace tree_sitter {
|
|||
string body = "";
|
||||
for (int i = 0; i < lex_table.states.size(); i++)
|
||||
body += _case(std::to_string(i), switch_on_lookahead_char(lex_table.states[i]));
|
||||
body += _case("ts_lex_state_error", switch_on_lookahead_char(lex_table.error_state));
|
||||
body += _default("LEX_PANIC();");
|
||||
return _switch("LEX_STATE()", body);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -58,16 +58,25 @@ namespace tree_sitter {
|
|||
return result;
|
||||
}
|
||||
|
||||
size_t LexTable::add_state() {
|
||||
LexStateId LexTable::add_state() {
|
||||
states.push_back(LexState());
|
||||
return states.size() - 1;
|
||||
}
|
||||
|
||||
void LexTable::add_action(size_t state_index, CharacterSet match, LexAction action) {
|
||||
states[state_index].actions[match].insert(action);
|
||||
// Resolve a state id to the LexState it names. Any negative id selects the
// table's separate error_state; non-negative ids index into table->states.
LexState & state(LexTable *table, LexStateId id) {
    return (id < 0) ? table->error_state : table->states[id];
}
|
||||
|
||||
void LexTable::add_default_action(size_t state_index, LexAction action) {
|
||||
states[state_index].default_actions.insert(action);
|
||||
// Record `action` for the character set `match` in the state named by `id`
// (a negative id addresses the error state, see state()).
void LexTable::add_action(LexStateId id, CharacterSet match, LexAction action) {
    LexState &target = state(this, id);
    target.actions[match].insert(action);
}
|
||||
|
||||
// Record `action` as a default action of the state named by `id`
// (a negative id addresses the error state, see state()).
void LexTable::add_default_action(LexStateId id, LexAction action) {
    LexState &target = state(this, id);
    target.default_actions.insert(action);
}
|
||||
|
||||
const LexStateId LexTable::ERROR_STATE_ID = -1;
|
||||
}
|
||||
|
|
@ -51,13 +51,17 @@ namespace tree_sitter {
|
|||
std::set<rules::CharacterSet> expected_inputs() const;
|
||||
};
|
||||
|
||||
typedef long int LexStateId;
|
||||
|
||||
class LexTable {
|
||||
public:
|
||||
size_t add_state();
|
||||
void add_action(size_t state_index, rules::CharacterSet rule, LexAction action);
|
||||
void add_default_action(size_t state_index, LexAction action);
|
||||
static const LexStateId ERROR_STATE_ID;
|
||||
LexStateId add_state();
|
||||
void add_action(LexStateId state_id, rules::CharacterSet rule, LexAction action);
|
||||
void add_default_action(LexStateId state_id, LexAction action);
|
||||
|
||||
std::vector<LexState> states;
|
||||
LexState error_state;
|
||||
};
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -58,7 +58,7 @@ namespace tree_sitter {
|
|||
}
|
||||
}
|
||||
|
||||
ParseState::ParseState() : lex_state_index(-1) {}
|
||||
ParseState::ParseState() : lex_state_id(-1) {}
|
||||
|
||||
set<Symbol> ParseState::expected_inputs() const {
|
||||
set<Symbol> result;
|
||||
|
|
@ -86,13 +86,13 @@ namespace tree_sitter {
|
|||
return stream;
|
||||
}
|
||||
|
||||
size_t ParseTable::add_state() {
|
||||
ParseStateId ParseTable::add_state() {
|
||||
states.push_back(ParseState());
|
||||
return states.size() - 1;
|
||||
}
|
||||
|
||||
void ParseTable::add_action(size_t state_index, Symbol symbol, ParseAction action) {
|
||||
void ParseTable::add_action(ParseStateId id, Symbol symbol, ParseAction action) {
|
||||
symbols.insert(symbol);
|
||||
states[state_index].actions[symbol].insert(action);
|
||||
states[id].actions[symbol].insert(action);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@
|
|||
#include <vector>
|
||||
#include <set>
|
||||
#include "rules/symbol.h"
|
||||
#include "./lex_table.h"
|
||||
|
||||
namespace tree_sitter {
|
||||
typedef enum {
|
||||
|
|
@ -52,15 +53,17 @@ namespace tree_sitter {
|
|||
ParseState();
|
||||
std::map<rules::Symbol, std::set<ParseAction>> actions;
|
||||
std::set<rules::Symbol> expected_inputs() const;
|
||||
size_t lex_state_index;
|
||||
LexStateId lex_state_id;
|
||||
};
|
||||
|
||||
typedef unsigned long int ParseStateId;
|
||||
|
||||
std::ostream& operator<<(std::ostream &stream, const ParseState &state);
|
||||
|
||||
class ParseTable {
|
||||
public:
|
||||
size_t add_state();
|
||||
void add_action(size_t state_index, rules::Symbol symbol, ParseAction action);
|
||||
void add_action(ParseStateId state_id, rules::Symbol symbol, ParseAction action);
|
||||
|
||||
std::vector<ParseState> states;
|
||||
std::set<rules::Symbol> symbols;
|
||||
|
|
|
|||
|
|
@ -3,8 +3,9 @@
|
|||
struct ts_document {
|
||||
ts_parse_fn *parse_fn;
|
||||
const char **symbol_names;
|
||||
ts_error error;
|
||||
ts_tree *tree;
|
||||
const ts_tree *tree;
|
||||
size_t error_count;
|
||||
ts_tree **errors;
|
||||
};
|
||||
|
||||
ts_document * ts_document_make() {
|
||||
|
|
@ -21,18 +22,18 @@ void ts_document_set_parser(ts_document *document, ts_parse_config config) {
|
|||
}
|
||||
|
||||
void ts_document_set_text(ts_document *document, const char *text) {
|
||||
ts_parse_result result = document->parse_fn(text);
|
||||
document->tree = result.tree;
|
||||
document->error = result.error;
|
||||
const ts_tree * result = document->parse_fn(text);
|
||||
document->tree = result;
|
||||
document->errors = NULL;
|
||||
}
|
||||
|
||||
ts_tree * ts_document_tree(const ts_document *document) {
|
||||
const ts_tree * ts_document_tree(const ts_document *document) {
|
||||
return document->tree;
|
||||
}
|
||||
|
||||
const char * ts_document_string(const ts_document *document) {
|
||||
if (document->error.expected_inputs != NULL) {
|
||||
return ts_error_string(&document->error);
|
||||
if (document->error_count > 0) {
|
||||
return ts_tree_error_string(document->errors[0], document->symbol_names);
|
||||
} else {
|
||||
return ts_tree_string(document->tree, document->symbol_names);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,16 +0,0 @@
|
|||
#include "tree_sitter/runtime.h"
|
||||
#include <string>
|
||||
#include "string.h"
|
||||
|
||||
using std::string;
|
||||
|
||||
const char * ts_error_string(const ts_error *error) {
|
||||
string result = string("Unexpected character '") + error->lookahead_char + "'. Expected:";
|
||||
for (int i = 0; i < error->expected_input_count; i++) {
|
||||
result += string(" ") + error->expected_inputs[i];
|
||||
}
|
||||
|
||||
char *stuff = (char *)malloc(result.size() * sizeof(char));
|
||||
strcpy(stuff, result.c_str());
|
||||
return stuff;
|
||||
}
|
||||
|
|
@ -3,18 +3,43 @@
|
|||
#include <string.h>
|
||||
|
||||
using std::string;
|
||||
using std::to_string;
|
||||
|
||||
ts_tree * ts_tree_make(ts_symbol value, size_t child_count, ts_tree **children) {
|
||||
const ts_symbol ts_symbol_error = -1;
|
||||
|
||||
/*
 * Create a tree node for `symbol` that has no children.
 *
 * The node starts with a reference count of zero. It is allocated with
 * calloc() rather than `new` because ts_tree_release() disposes of nodes
 * with free(); pairing `new` with free() is undefined behaviour.
 *
 * Returns NULL if the allocation fails.
 */
ts_tree * ts_tree_make_leaf(ts_symbol symbol) {
    ts_tree *result = (ts_tree *)calloc(1, sizeof(ts_tree));
    if (!result) return NULL;

    result->ref_count = 0;
    result->symbol = symbol;
    result->data.children.count = 0;
    result->data.children.contents = NULL;
    return result;
}
|
||||
|
||||
/*
 * Create an interior tree node for `symbol` with the given children.
 *
 * The `children` array pointer is stored as-is (it is not copied), and
 * every child is retained once. The new node itself starts with a
 * reference count of zero.
 *
 * The node is allocated with calloc() rather than `new` because
 * ts_tree_release() disposes of nodes with free(); pairing `new` with
 * free() is undefined behaviour.
 *
 * Returns NULL if the allocation fails; in that case no child is retained.
 */
ts_tree * ts_tree_make_node(ts_symbol symbol, size_t child_count, ts_tree **children) {
    ts_tree *result = (ts_tree *)calloc(1, sizeof(ts_tree));
    if (!result) return NULL;

    result->ref_count = 0;
    result->symbol = symbol;
    result->data.children.count = child_count;
    result->data.children.contents = children;

    // size_t index: child_count is unsigned, so avoid a signed/unsigned compare.
    for (size_t i = 0; i < child_count; i++)
        ts_tree_retain(children[i]);
    return result;
}
|
||||
|
||||
/*
 * Create a tree node that represents a parse error.
 *
 * lookahead_char        - the character that was encountered.
 * expected_input_count  - number of entries in `expected_inputs`.
 * expected_inputs       - symbols that would have been accepted. The pointer
 *                         is stored, not copied.
 *                         NOTE(review): the array must therefore outlive the
 *                         node - confirm against callers.
 *
 * The node is tagged with ts_symbol_error. Without that tag,
 * ts_tree_children(), ts_tree_child_count() and ts_tree_equals() would treat
 * it as an ordinary node and read the `children` member of the union instead
 * of `error`. The expected-input array is recorded so that
 * ts_tree_error_string() has something to iterate over.
 *
 * The node is allocated with calloc() rather than `new` because
 * ts_tree_release() disposes of nodes with free().
 *
 * Returns NULL if the allocation fails.
 */
ts_tree * ts_tree_make_error(char lookahead_char, size_t expected_input_count, const ts_symbol *expected_inputs) {
    ts_tree *result = (ts_tree *)calloc(1, sizeof(ts_tree));
    if (!result) return NULL;

    result->ref_count = 0;
    result->symbol = ts_symbol_error;
    result->data.error.lookahead_char = lookahead_char;
    result->data.error.expected_input_count = expected_input_count;
    result->data.error.expected_inputs = expected_inputs;
    return result;
}
|
||||
|
||||
void ts_tree_retain(ts_tree *tree) {
|
||||
tree->ref_count++;
|
||||
}
|
||||
|
|
@ -22,28 +47,46 @@ void ts_tree_retain(ts_tree *tree) {
|
|||
void ts_tree_release(ts_tree *tree) {
|
||||
tree->ref_count--;
|
||||
if (tree->ref_count == 0) {
|
||||
for (int i = 0; i < tree->child_count; i++)
|
||||
ts_tree_release(tree->children[i]);
|
||||
ts_tree **children = tree->data.children.contents;
|
||||
for (int i = 0; i < ts_tree_child_count(tree); i++)
|
||||
ts_tree_release(children[i]);
|
||||
// free(children);
|
||||
free(tree);
|
||||
}
|
||||
}
|
||||
|
||||
int ts_tree_equals(const ts_tree *node1, const ts_tree *node2) {
|
||||
if (node1->value != node2->value) return 0;
|
||||
if (node1->child_count != node2->child_count) return 0;
|
||||
for (int i = 0; i < node1->child_count; i++) {
|
||||
ts_tree *child1 = node1->children[i];
|
||||
ts_tree *child2 = node2->children[i];
|
||||
if (!ts_tree_equals(child1, child2)) return 0;
|
||||
if (node1->symbol != node2->symbol) return 0;
|
||||
if (node1->symbol == ts_symbol_error) {
|
||||
// check error equality
|
||||
} else {
|
||||
if (node1->data.children.count != node2->data.children.count)
|
||||
return 0;
|
||||
for (int i = 0; i < node1->data.children.count; i++) {
|
||||
ts_tree *child1 = node1->data.children.contents[i];
|
||||
ts_tree *child2 = node2->data.children.contents[i];
|
||||
if (!ts_tree_equals(child1, child2))
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
 * Return the node's child array, or NULL for an error node (whose union
 * payload holds error details rather than children).
 */
ts_tree ** ts_tree_children(const ts_tree *tree) {
    const bool is_error = (tree->symbol == ts_symbol_error);
    if (!is_error)
        return tree->data.children.contents;
    return NULL;
}
|
||||
|
||||
size_t ts_tree_child_count(const ts_tree *tree) {
|
||||
if (tree->symbol == ts_symbol_error) return 0;
|
||||
return tree->data.children.count;
|
||||
}
|
||||
|
||||
static string __tree_to_string(const ts_tree *tree, const char **symbol_names) {
|
||||
if (!tree) return "#<null-tree>";
|
||||
string result = string("(") + symbol_names[tree->value];
|
||||
for (int i = 0; i < tree->child_count; i++)
|
||||
result += " " + __tree_to_string(tree->children[i], symbol_names);
|
||||
string result = string("(") + symbol_names[tree->symbol];
|
||||
for (int i = 0; i < tree->data.children.count; i++)
|
||||
result += " " + __tree_to_string(tree->data.children.contents[i], symbol_names);
|
||||
return result + ")";
|
||||
}
|
||||
|
||||
|
|
@ -53,3 +96,15 @@ char * ts_tree_string(const ts_tree *tree, const char **symbol_names) {
|
|||
strcpy(result, value.c_str());
|
||||
return result;
|
||||
}
|
||||
|
||||
char * ts_tree_error_string(const ts_tree *tree, const char **symbol_names) {
|
||||
string result = string("Unexpected character '") + tree->data.error.lookahead_char + "'. Expected:";
|
||||
for (int i = 0; i < tree->data.error.expected_input_count; i++) {
|
||||
ts_symbol symbol = tree->data.error.expected_inputs[i];
|
||||
result += string(" ") + symbol_names[symbol];
|
||||
}
|
||||
|
||||
char *stuff = (char *)malloc(result.size() * sizeof(char));
|
||||
strcpy(stuff, result.c_str());
|
||||
return stuff;
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue