diff --git a/examples/grammars/json.hpp b/examples/grammars/json.hpp index cb194e7d..a57a2db7 100644 --- a/examples/grammars/json.hpp +++ b/examples/grammars/json.hpp @@ -26,10 +26,10 @@ namespace test_grammars { sym("number") }) }, { "object", seq({ _sym("left_brace"), - comma_sep(seq({ + comma_sep(err(seq({ sym("string"), _sym("colon"), - sym("value") })), + sym("value") }))), _sym("right_brace"), }) }, { "array", seq({ _sym("left_bracket"), diff --git a/examples/parsers/arithmetic.c b/examples/parsers/arithmetic.c index 7664dfa7..fd574ef8 100644 --- a/examples/parsers/arithmetic.c +++ b/examples/parsers/arithmetic.c @@ -26,6 +26,13 @@ SYMBOL_NAMES { "token2", }; +static const ts_symbol * ts_recover(ts_state state, ts_state *to_state, size_t *count) { + switch (state) { + default: + RECOVER(0, 0, EXPECT({})); + } +} + LEX_FN() { START_LEXER(); switch (LEX_STATE()) { diff --git a/examples/parsers/json.c b/examples/parsers/json.c index 402569ec..ce03de93 100644 --- a/examples/parsers/json.c +++ b/examples/parsers/json.c @@ -34,6 +34,21 @@ SYMBOL_NAMES { "repeat_helper2", }; +static const ts_symbol * ts_recover(ts_state state, ts_state *to_state, size_t *count) { + switch (state) { + case 3: + RECOVER(52, 2, EXPECT({ts_sym_comma, ts_sym_right_brace})); + case 8: + RECOVER(47, 2, EXPECT({ts_sym_comma, ts_sym_right_brace})); + case 13: + RECOVER(44, 2, EXPECT({ts_sym_comma, ts_sym_right_brace})); + case 25: + RECOVER(32, 2, EXPECT({ts_sym_comma, ts_sym_right_brace})); + default: + RECOVER(0, 0, EXPECT({})); + } +} + LEX_FN() { START_LEXER(); switch (LEX_STATE()) { @@ -224,7 +239,7 @@ PARSE_FN() { case ts_sym_left_brace: SHIFT(3); case ts_sym_left_bracket: - SHIFT(44); + SHIFT(55); default: PARSE_ERROR(7, EXPECT({ts_sym_array, ts_sym_number, ts_sym_object, ts_sym_string, ts_sym_value, ts_sym_left_brace, ts_sym_left_bracket})); } @@ -250,9 +265,11 @@ PARSE_FN() { case ts_sym_string: SHIFT(4); case ts_sym_right_brace: - SHIFT(43); + SHIFT(51); + case ts_builtin_sym_error: + SHIFT(52); default: - PARSE_ERROR(2, EXPECT({ts_sym_string, ts_sym_right_brace})); + PARSE_ERROR(3, EXPECT({ts_sym_string, ts_sym_right_brace, ts_builtin_sym_error})); } case 4: SET_LEX_STATE(19); @@ -300,7 +317,7 @@ PARSE_FN() { case ts_sym_right_brace: REDUCE(ts_aux_sym_repeat_helper2, 0, COLLAPSE({})); case ts_aux_sym_repeat_helper2: - SHIFT(41); + SHIFT(49); default: PARSE_ERROR(3, EXPECT({ts_sym_comma, ts_sym_right_brace, ts_aux_sym_repeat_helper2})); } @@ -309,8 +326,10 @@ PARSE_FN() { switch (LOOKAHEAD_SYM()) { case ts_sym_string: SHIFT(9); + case ts_builtin_sym_error: + SHIFT(47); default: - PARSE_ERROR(1, EXPECT({ts_sym_string})); + PARSE_ERROR(2, EXPECT({ts_sym_string, ts_builtin_sym_error})); } case 9: SET_LEX_STATE(19); @@ -366,9 +385,11 @@ PARSE_FN() { case ts_sym_string: SHIFT(14); case ts_sym_right_brace: - SHIFT(40); + SHIFT(43); + case ts_builtin_sym_error: + SHIFT(44); default: - PARSE_ERROR(2, EXPECT({ts_sym_string, ts_sym_right_brace})); + PARSE_ERROR(3, EXPECT({ts_sym_string, ts_sym_right_brace, ts_builtin_sym_error})); } case 14: SET_LEX_STATE(19); @@ -444,9 +465,9 @@ PARSE_FN() { case ts_sym_left_brace: SHIFT(25); case ts_sym_left_bracket: - SHIFT(32); + SHIFT(35); case ts_sym_right_bracket: - SHIFT(39); + SHIFT(42); default: PARSE_ERROR(8, EXPECT({ts_sym_array, ts_sym_number, ts_sym_object, ts_sym_string, ts_sym_value, ts_sym_left_brace, ts_sym_left_bracket, ts_sym_right_bracket})); } @@ -468,7 +489,7 @@ PARSE_FN() { case ts_sym_right_bracket: REDUCE(ts_aux_sym_repeat_helper1, 0, COLLAPSE({})); case ts_aux_sym_repeat_helper1: - SHIFT(37); + SHIFT(40); default: PARSE_ERROR(3, EXPECT({ts_sym_comma, ts_sym_right_bracket, ts_aux_sym_repeat_helper1})); } @@ -488,7 +509,7 @@ PARSE_FN() { case ts_sym_left_brace: SHIFT(25); case ts_sym_left_bracket: - SHIFT(32); + SHIFT(35); default: PARSE_ERROR(7, EXPECT({ts_sym_array, ts_sym_number, ts_sym_object, ts_sym_string, ts_sym_value, ts_sym_left_brace, ts_sym_left_bracket})); } @@ -519,8 +540,10 @@ PARSE_FN() { SHIFT(26); case ts_sym_right_brace: SHIFT(31); + case ts_builtin_sym_error: + SHIFT(32); default: - PARSE_ERROR(2, EXPECT({ts_sym_string, ts_sym_right_brace})); + PARSE_ERROR(3, EXPECT({ts_sym_string, ts_sym_right_brace, ts_builtin_sym_error})); } case 26: SET_LEX_STATE(19); @@ -591,6 +614,36 @@ PARSE_FN() { PARSE_ERROR(2, EXPECT({ts_sym_comma, ts_sym_right_bracket})); } case 32: + SET_LEX_STATE(2); + switch (LOOKAHEAD_SYM()) { + case ts_sym_comma: + SHIFT(8); + case ts_sym_right_brace: + REDUCE(ts_aux_sym_repeat_helper2, 0, COLLAPSE({})); + case ts_aux_sym_repeat_helper2: + SHIFT(33); + default: + PARSE_ERROR(3, EXPECT({ts_sym_comma, ts_sym_right_brace, ts_aux_sym_repeat_helper2})); + } + case 33: + SET_LEX_STATE(5); + switch (LOOKAHEAD_SYM()) { + case ts_sym_right_brace: + SHIFT(34); + default: + PARSE_ERROR(1, EXPECT({ts_sym_right_brace})); + } + case 34: + SET_LEX_STATE(6); + switch (LOOKAHEAD_SYM()) { + case ts_sym_comma: + REDUCE(ts_sym_object, 4, COLLAPSE({1, 0, 1, 1})); + case ts_sym_right_bracket: + REDUCE(ts_sym_object, 4, COLLAPSE({1, 0, 1, 1})); + default: + PARSE_ERROR(2, EXPECT({ts_sym_comma, ts_sym_right_bracket})); + } + case 35: SET_LEX_STATE(22); switch (LOOKAHEAD_SYM()) { case ts_sym_array: @@ -602,17 +655,17 @@ PARSE_FN() { case ts_sym_string: SHIFT(20); case ts_sym_value: - SHIFT(33); + SHIFT(36); case ts_sym_left_brace: SHIFT(25); case ts_sym_left_bracket: - SHIFT(32); + SHIFT(35); case ts_sym_right_bracket: - SHIFT(36); + SHIFT(39); default: PARSE_ERROR(8, EXPECT({ts_sym_array, ts_sym_number, ts_sym_object, ts_sym_string, ts_sym_value, ts_sym_left_brace, ts_sym_left_bracket, ts_sym_right_bracket})); } - case 33: + case 36: SET_LEX_STATE(6); switch (LOOKAHEAD_SYM()) { case ts_sym_comma: @@ -620,38 +673,10 @@ PARSE_FN() { case ts_sym_right_bracket: REDUCE(ts_aux_sym_repeat_helper1, 0, COLLAPSE({})); case ts_aux_sym_repeat_helper1: - SHIFT(34); + SHIFT(37); default: PARSE_ERROR(3, EXPECT({ts_sym_comma, ts_sym_right_bracket, ts_aux_sym_repeat_helper1})); } - case 34: - SET_LEX_STATE(8); - switch (LOOKAHEAD_SYM()) { - case ts_sym_right_bracket: - SHIFT(35); - default: - PARSE_ERROR(1, EXPECT({ts_sym_right_bracket})); - } - case 35: - SET_LEX_STATE(6); - switch (LOOKAHEAD_SYM()) { - case ts_sym_comma: - REDUCE(ts_sym_array, 4, COLLAPSE({1, 0, 1, 1})); - case ts_sym_right_bracket: - REDUCE(ts_sym_array, 4, COLLAPSE({1, 0, 1, 1})); - default: - PARSE_ERROR(2, EXPECT({ts_sym_comma, ts_sym_right_bracket})); - } - case 36: - SET_LEX_STATE(6); - switch (LOOKAHEAD_SYM()) { - case ts_sym_comma: - REDUCE(ts_sym_array, 2, COLLAPSE({1, 1})); - case ts_sym_right_bracket: - REDUCE(ts_sym_array, 2, COLLAPSE({1, 1})); - default: - PARSE_ERROR(2, EXPECT({ts_sym_comma, ts_sym_right_bracket})); - } case 37: SET_LEX_STATE(8); switch (LOOKAHEAD_SYM()) { @@ -661,26 +686,54 @@ PARSE_FN() { PARSE_ERROR(1, EXPECT({ts_sym_right_bracket})); } case 38: - SET_LEX_STATE(2); + SET_LEX_STATE(6); switch (LOOKAHEAD_SYM()) { case ts_sym_comma: REDUCE(ts_sym_array, 4, COLLAPSE({1, 0, 1, 1})); - case ts_sym_right_brace: + case ts_sym_right_bracket: REDUCE(ts_sym_array, 4, COLLAPSE({1, 0, 1, 1})); default: - PARSE_ERROR(2, EXPECT({ts_sym_comma, ts_sym_right_brace})); + PARSE_ERROR(2, EXPECT({ts_sym_comma, ts_sym_right_bracket})); } case 39: - SET_LEX_STATE(2); + SET_LEX_STATE(6); switch (LOOKAHEAD_SYM()) { case ts_sym_comma: REDUCE(ts_sym_array, 2, COLLAPSE({1, 1})); - case ts_sym_right_brace: + case ts_sym_right_bracket: REDUCE(ts_sym_array, 2, COLLAPSE({1, 1})); default: - PARSE_ERROR(2, EXPECT({ts_sym_comma, ts_sym_right_brace})); + PARSE_ERROR(2, EXPECT({ts_sym_comma, ts_sym_right_bracket})); } case 40: + SET_LEX_STATE(8); + switch (LOOKAHEAD_SYM()) { + case ts_sym_right_bracket: + SHIFT(41); + default: + PARSE_ERROR(1, EXPECT({ts_sym_right_bracket})); + } + case 41: + SET_LEX_STATE(2); + switch (LOOKAHEAD_SYM()) { + case ts_sym_comma: + REDUCE(ts_sym_array, 4, COLLAPSE({1, 0, 1, 1})); + case ts_sym_right_brace: + REDUCE(ts_sym_array, 4, COLLAPSE({1, 0, 1, 1})); + default: + PARSE_ERROR(2, EXPECT({ts_sym_comma, ts_sym_right_brace})); + } + case 42: + SET_LEX_STATE(2); + switch (LOOKAHEAD_SYM()) { + case ts_sym_comma: + REDUCE(ts_sym_array, 2, COLLAPSE({1, 1})); + case ts_sym_right_brace: + REDUCE(ts_sym_array, 2, COLLAPSE({1, 1})); + default: + PARSE_ERROR(2, EXPECT({ts_sym_comma, ts_sym_right_brace})); + } + case 43: SET_LEX_STATE(2); switch (LOOKAHEAD_SYM()) { case ts_sym_comma: @@ -690,15 +743,65 @@ PARSE_FN() { default: PARSE_ERROR(2, EXPECT({ts_sym_comma, ts_sym_right_brace})); } - case 41: + case 44: + SET_LEX_STATE(2); + switch (LOOKAHEAD_SYM()) { + case ts_sym_comma: + SHIFT(8); + case ts_sym_right_brace: + REDUCE(ts_aux_sym_repeat_helper2, 0, COLLAPSE({})); + case ts_aux_sym_repeat_helper2: + SHIFT(45); + default: + PARSE_ERROR(3, EXPECT({ts_sym_comma, ts_sym_right_brace, ts_aux_sym_repeat_helper2})); + } + case 45: SET_LEX_STATE(5); switch (LOOKAHEAD_SYM()) { case ts_sym_right_brace: - SHIFT(42); + SHIFT(46); default: PARSE_ERROR(1, EXPECT({ts_sym_right_brace})); } - case 42: + case 46: + SET_LEX_STATE(2); + switch (LOOKAHEAD_SYM()) { + case ts_sym_comma: + REDUCE(ts_sym_object, 4, COLLAPSE({1, 0, 1, 1})); + case ts_sym_right_brace: + REDUCE(ts_sym_object, 4, COLLAPSE({1, 0, 1, 1})); + default: + PARSE_ERROR(2, EXPECT({ts_sym_comma, ts_sym_right_brace})); + } + case 47: + SET_LEX_STATE(2); + switch (LOOKAHEAD_SYM()) { + case ts_sym_comma: + SHIFT(8); + case ts_sym_right_brace: + REDUCE(ts_aux_sym_repeat_helper2, 0, COLLAPSE({})); + case ts_aux_sym_repeat_helper2: + SHIFT(48); + default: + PARSE_ERROR(3, EXPECT({ts_sym_comma, ts_sym_right_brace, ts_aux_sym_repeat_helper2})); + } + case 48: + SET_LEX_STATE(5); + switch (LOOKAHEAD_SYM()) { + case ts_sym_right_brace: + REDUCE(ts_aux_sym_repeat_helper2, 3, COLLAPSE({1, 0, 1})); + default: + PARSE_ERROR(1, EXPECT({ts_sym_right_brace})); + } + case 49: + SET_LEX_STATE(5); + switch (LOOKAHEAD_SYM()) { + case ts_sym_right_brace: + SHIFT(50); + default: + PARSE_ERROR(1, EXPECT({ts_sym_right_brace})); + } + case 50: SET_LEX_STATE(0); switch (LOOKAHEAD_SYM()) { case ts_aux_sym_end: @@ -706,7 +809,7 @@ PARSE_FN() { default: PARSE_ERROR(1, EXPECT({ts_aux_sym_end})); } - case 43: + case 51: SET_LEX_STATE(0); switch (LOOKAHEAD_SYM()) { case ts_aux_sym_end: @@ -714,7 +817,35 @@ PARSE_FN() { default: PARSE_ERROR(1, EXPECT({ts_aux_sym_end})); } - case 44: + case 52: + SET_LEX_STATE(2); + switch (LOOKAHEAD_SYM()) { + case ts_sym_comma: + SHIFT(8); + case ts_sym_right_brace: + REDUCE(ts_aux_sym_repeat_helper2, 0, COLLAPSE({})); + case ts_aux_sym_repeat_helper2: + SHIFT(53); + default: + PARSE_ERROR(3, EXPECT({ts_sym_comma, ts_sym_right_brace, ts_aux_sym_repeat_helper2})); + } + case 53: + SET_LEX_STATE(5); + switch (LOOKAHEAD_SYM()) { + case ts_sym_right_brace: + SHIFT(54); + default: + PARSE_ERROR(1, EXPECT({ts_sym_right_brace})); + } + case 54: + SET_LEX_STATE(0); + switch (LOOKAHEAD_SYM()) { + case ts_aux_sym_end: + REDUCE(ts_sym_object, 4, COLLAPSE({1, 0, 1, 1})); + default: + PARSE_ERROR(1, EXPECT({ts_aux_sym_end})); + } + case 55: SET_LEX_STATE(22); switch (LOOKAHEAD_SYM()) { case ts_sym_array: @@ -726,17 +857,17 @@ PARSE_FN() { case ts_sym_string: SHIFT(20); case ts_sym_value: - SHIFT(45); + SHIFT(56); case ts_sym_left_brace: SHIFT(25); case ts_sym_left_bracket: - SHIFT(32); + SHIFT(35); case ts_sym_right_bracket: - SHIFT(48); + SHIFT(59); default: PARSE_ERROR(8, EXPECT({ts_sym_array, ts_sym_number, ts_sym_object, ts_sym_string, ts_sym_value, ts_sym_left_brace, ts_sym_left_bracket, ts_sym_right_bracket})); } - case 45: + case 56: SET_LEX_STATE(6); switch (LOOKAHEAD_SYM()) { case ts_sym_comma: @@ -744,19 +875,19 @@ PARSE_FN() { case ts_sym_right_bracket: REDUCE(ts_aux_sym_repeat_helper1, 0, COLLAPSE({})); case ts_aux_sym_repeat_helper1: - SHIFT(46); + SHIFT(57); default: PARSE_ERROR(3, EXPECT({ts_sym_comma, ts_sym_right_bracket, ts_aux_sym_repeat_helper1})); } - case 46: + case 57: SET_LEX_STATE(8); switch (LOOKAHEAD_SYM()) { case ts_sym_right_bracket: - SHIFT(47); + SHIFT(58); default: PARSE_ERROR(1, EXPECT({ts_sym_right_bracket})); } - case 47: + case 58: SET_LEX_STATE(0); switch (LOOKAHEAD_SYM()) { case ts_aux_sym_end: @@ -764,7 +895,7 @@ PARSE_FN() { default: PARSE_ERROR(1, EXPECT({ts_aux_sym_end})); } - case 48: + case 59: SET_LEX_STATE(0); switch (LOOKAHEAD_SYM()) { case ts_aux_sym_end: diff --git a/include/tree_sitter/compiler.h b/include/tree_sitter/compiler.h index 6f0821ac..8f317a4c 100644 --- a/include/tree_sitter/compiler.h +++ b/include/tree_sitter/compiler.h @@ -21,6 +21,7 @@ namespace tree_sitter { rule_ptr _sym(const std::string &name); rule_ptr pattern(const std::string &value); rule_ptr str(const std::string &value); + rule_ptr err(const rule_ptr &rule); } class Grammar { diff --git a/include/tree_sitter/parser.h b/include/tree_sitter/parser.h index 8b3dba7b..4c6b5d82 100644 --- a/include/tree_sitter/parser.h +++ b/include/tree_sitter/parser.h @@ -48,6 +48,7 @@ typedef struct { } ts_parser; static void ts_lex(ts_parser *parser); +static const ts_symbol * ts_recover(ts_state state, ts_state *to_state, size_t *count); static ts_parser ts_parser_make(const char *input) { ts_parser result = { @@ -67,22 +68,26 @@ static char ts_parser_lookahead_char(const ts_parser *parser) { static ts_symbol ts_parser_lookahead_sym(const ts_parser *parser) { ts_tree *node = parser->lookahead_node; - return node ? node->symbol : ts_symbol_error; + return node ? node->symbol : ts_builtin_sym_error; } static ts_state ts_parser_parse_state(const ts_parser *parser) { if (parser->stack_size == 0) return 0; return parser->stack[parser->stack_size - 1].state; } + +static void ts_parser_push(ts_parser *parser, ts_state state, ts_tree *node) { + ts_stack_entry *entry = (parser->stack + parser->stack_size); + entry->state = state; + entry->node = node; + parser->stack_size++; +} static void ts_parser_shift(ts_parser *parser, ts_state parse_state) { DEBUG_PARSE("shift: %d \n", parse_state); - ts_stack_entry *entry = (parser->stack + parser->stack_size); - entry->state = parse_state; - entry->node = parser->lookahead_node; + ts_parser_push(parser, parse_state, parser->lookahead_node); parser->lookahead_node = parser->prev_lookahead_node; parser->prev_lookahead_node = NULL; - parser->stack_size++; } static void ts_parser_reduce(ts_parser *parser, ts_symbol symbol, int immediate_child_count, const int *collapse_flags) { @@ -144,8 +149,29 @@ static void ts_parser_skip_whitespace(ts_parser *parser) { static int ts_parser_handle_error(ts_parser *parser, size_t count, const ts_symbol *expected_symbols) { parser->error_mode = 1; ts_tree *error = ts_tree_make_error(ts_parser_lookahead_char(parser), count, expected_symbols); - parser->stack[0].node = error; - return 0; + while (1) { + parser->lookahead_node = NULL; + parser->lex_state = ts_lex_state_error; + ts_lex(parser); + + for (long i = parser->stack_size - 1; i >= 0; i--) { + ts_state state = parser->stack[i].state; + ts_state to_state; + size_t count; + const ts_symbol *symbols = ts_recover(state, &to_state, &count); + for (int j = 0; j < count; j++) { + if (symbols[j] == ts_parser_lookahead_sym(parser)) { + parser->stack_size = i + 1; + ts_parser_push(parser, to_state, error); + return 1; + } + } + } + if (!ts_parser_lookahead_char(parser)) { + parser->stack[0].node = error; + return 0; + } + } } #pragma mark - DSL @@ -229,6 +255,14 @@ printf("Lex error: unexpected state %d", LEX_STATE()); #define PARSE_PANIC() \ printf("Parse error: unexpected state %d", PARSE_STATE()); +#define RECOVER(new_state, symbol_count, values) \ +{ \ + *count = symbol_count; \ + *to_state = new_state; \ + static ts_symbol symbols[] = values; \ + return symbols; \ +} + #define EXPECT(...) __VA_ARGS__ #define COLLAPSE(...) __VA_ARGS__ diff --git a/include/tree_sitter/runtime.h b/include/tree_sitter/runtime.h index 3930769a..51c4880f 100644 --- a/include/tree_sitter/runtime.h +++ b/include/tree_sitter/runtime.h @@ -8,7 +8,7 @@ extern "C" { #include typedef int ts_symbol; -extern const ts_symbol ts_symbol_error; +static const ts_symbol ts_builtin_sym_error = -1; typedef struct ts_tree { ts_symbol symbol; diff --git a/spec/main.cpp b/spec/main.cpp index 1562a5fd..2d163e64 100644 --- a/spec/main.cpp +++ b/spec/main.cpp @@ -6,7 +6,7 @@ int main(int argc, char *argv[]) "", "--no-color", "--only=" - "", +// "reports errors inside of nested objects", }; return bandit::run(4, const_cast(args)); } \ No newline at end of file diff --git a/spec/runtime/json_spec.cpp b/spec/runtime/json_spec.cpp index 8296eab3..ab8e241a 100644 --- a/spec/runtime/json_spec.cpp +++ b/spec/runtime/json_spec.cpp @@ -53,9 +53,11 @@ describe("json", []() { it("reports errors in the top-level node", [&]() { ts_document_set_text(doc, "["); AssertThat(string(ts_document_string(doc)), Equals("(ERROR)")); - - ts_document_set_text(doc, "{ \"key1\": 1, "); - AssertThat(string(ts_document_string(doc)), Equals("(ERROR)")); + }); + + it("reports errors inside of nested objects", [&]() { + ts_document_set_text(doc, "{ \"key1\": 1, 5 }"); + AssertThat(string(ts_document_string(doc)), Equals("(value (object (string) (value (number)) (ERROR)))")); }); }); }); diff --git a/src/compiler/build_tables/first_set.cpp b/src/compiler/build_tables/first_set.cpp index bf06134e..5d4df4b6 100644 --- a/src/compiler/build_tables/first_set.cpp +++ b/src/compiler/build_tables/first_set.cpp @@ -11,17 +11,17 @@ namespace tree_sitter { using namespace rules; namespace build_tables { + set set_union(const set &left, const set &right) { + set result = left; + result.insert(right.begin(), right.end()); + return result; + } + class FirstSet : public RuleFn> { const PreparedGrammar grammar; public: FirstSet(const PreparedGrammar &grammar) : grammar(grammar) {} - set set_union(const set &left, const set &right) { - set result = left; - result.insert(right.begin(), right.end()); - return result; - } - void visit(const Symbol *rule) { if (grammar.has_definition(*rule)) { value = apply(grammar.rule(*rule)); @@ -46,5 +46,15 @@ namespace tree_sitter { set first_set(const rule_ptr &rule, const PreparedGrammar &grammar) { return FirstSet(grammar).apply(rule); } + + set first_set(const ParseItemSet &item_set, const PreparedGrammar &grammar) { + set result; + for (auto &item : item_set) { + result = set_union(result, first_set(item.rule, grammar)); + if (rule_can_be_blank(item.rule, grammar)) + result.insert(item.lookahead_sym); + } + return result; + } } } \ No newline at end of file diff --git a/src/compiler/build_tables/first_set.h b/src/compiler/build_tables/first_set.h index 860bf572..cab3cbbf 100644 --- a/src/compiler/build_tables/first_set.h +++ b/src/compiler/build_tables/first_set.h @@ -2,6 +2,7 @@ #define __tree_sitter__first_set__ #include "rules/symbol.h" +#include "item.h" #include namespace tree_sitter { @@ -20,7 +21,7 @@ namespace tree_sitter { * Returns the set of terminal symbols that can appear at * the beginning of any item in the given set. */ -// std::set first_set(const ParseItemSet &item_set, const PreparedGrammar &grammar); + std::set first_set(const ParseItemSet &item_set, const PreparedGrammar &grammar); } } diff --git a/src/compiler/build_tables/follow_sets.h b/src/compiler/build_tables/follow_sets.h index d715be3b..e3b46970 100644 --- a/src/compiler/build_tables/follow_sets.h +++ b/src/compiler/build_tables/follow_sets.h @@ -15,7 +15,7 @@ namespace tree_sitter { * Returns a map of non-terminal symbols to sets of terminal symbols. * The keys are the non-terminals which may appear first in the given * item. The values are the sets of terminals which can appear immediately - * after the corresponding non-terminals. + * after the corresponding non-terminals. */ std::map> follow_sets(const ParseItem &item, const PreparedGrammar &grammar); diff --git a/src/compiler/build_tables/perform.cpp b/src/compiler/build_tables/perform.cpp index ea338863..ef1a5244 100644 --- a/src/compiler/build_tables/perform.cpp +++ b/src/compiler/build_tables/perform.cpp @@ -4,6 +4,10 @@ #include "item_set_closure.h" #include "item_set_transitions.h" #include "tree_sitter/compiler.h" +#include "rules/built_in_symbols.h" +#include "first_set.h" + +#include "stream_methods.h" namespace tree_sitter { using std::pair; @@ -43,6 +47,10 @@ namespace tree_sitter { ParseItemSet item_set = transition.second; ParseStateId new_state_id = add_parse_state(item_set); parse_table.add_action(state_id, symbol, ParseAction::Shift(new_state_id)); + + if (symbol == rules::ERROR) { + parse_table.error_table.insert({ state_id, { new_state_id, first_set(transition.second, grammar) } }); + } } } @@ -119,7 +127,7 @@ namespace tree_sitter { add_advance_actions(error_item_set, LexTable::ERROR_STATE_ID); add_accept_token_actions(error_item_set, LexTable::ERROR_STATE_ID); } - + // void dump_item_sets() { // std::vector item_sets(parse_state_ids.size()); // for (auto &pair : parse_state_ids) @@ -134,7 +142,7 @@ namespace tree_sitter { // } // } // } - + public: TableBuilder(const PreparedGrammar &grammar, const PreparedGrammar &lex_grammar) : diff --git a/src/compiler/generate_code/c_code.cpp b/src/compiler/generate_code/c_code.cpp index fa6a9584..c36c148c 100644 --- a/src/compiler/generate_code/c_code.cpp +++ b/src/compiler/generate_code/c_code.cpp @@ -2,6 +2,7 @@ #include #include #include +#include "built_in_symbols.h" namespace tree_sitter { using std::string; @@ -85,7 +86,12 @@ namespace tree_sitter { {} string symbol_id(rules::Symbol symbol) { - if (symbol.is_auxiliary()) + if (symbol.is_built_in()) { + if (symbol == rules::ERROR) + return "ts_builtin_sym_error"; + else + return "unexpected_built_in_sym!"; + } else if (symbol.is_auxiliary()) return "ts_aux_sym_" + symbol.name; else return "ts_sym_" + symbol.name; @@ -232,14 +238,16 @@ namespace tree_sitter { string symbol_enum() { string result = "enum {\n"; for (auto symbol : parse_table.symbols) - result += indent(symbol_id(symbol)) + ",\n"; + if (!symbol.is_built_in()) + result += indent(symbol_id(symbol)) + ",\n"; return result + "};"; } string rule_names_list() { string result = "SYMBOL_NAMES {\n"; for (auto symbol : parse_table.symbols) - result += indent(string("\"") + symbol.name) + "\",\n"; + if (!symbol.is_built_in()) + result += indent(string("\"") + symbol.name) + "\",\n"; return result + "};"; } @@ -247,6 +255,35 @@ namespace tree_sitter { return "#include \"tree_sitter/parser.h\""; } + string recover_case(ParseStateId state, set symbols) { + string result = "RECOVER(" + to_string(state) + ", " + to_string(symbols.size()) + ", EXPECT({"; + bool started = false; + for (auto &symbol : symbols) { + if (started) { + result += ", "; + } + result += symbol_id(symbol); + started = true; + } + return result + "}));"; + } + + string recover_function() { + string cases; + for (auto &pair : parse_table.error_table) { + auto pair_for_state = pair.second; + cases += _case(to_string(pair.first), recover_case(pair_for_state.first, pair_for_state.second)); + } + cases += _default(recover_case(0, set())); + + string body = _switch("state", cases); + return join({ + "static const ts_symbol * ts_recover(ts_state state, ts_state *to_state, size_t *count) {", + indent(body), + "}" + }); + } + string lex_function() { return join({ "LEX_FN() {", @@ -276,6 +313,7 @@ namespace tree_sitter { includes(), symbol_enum(), rule_names_list(), + recover_function(), lex_function(), parse_function(), parse_config_struct(), diff --git a/src/compiler/parse_table.h b/src/compiler/parse_table.h index 0ffb32d4..05218439 100644 --- a/src/compiler/parse_table.h +++ b/src/compiler/parse_table.h @@ -67,6 +67,7 @@ namespace tree_sitter { std::vector states; std::set symbols; + std::map>> error_table; }; } diff --git a/src/compiler/rules/built_in_symbols.cpp b/src/compiler/rules/built_in_symbols.cpp new file mode 100644 index 00000000..3d2dbd33 --- /dev/null +++ b/src/compiler/rules/built_in_symbols.cpp @@ -0,0 +1,7 @@ +#include "built_in_symbols.h" + +namespace tree_sitter { + namespace rules { + const Symbol ERROR("error", SymbolTypeBuiltIn); + } +} \ No newline at end of file diff --git a/src/compiler/rules/built_in_symbols.h b/src/compiler/rules/built_in_symbols.h new file mode 100644 index 00000000..4c774ac5 --- /dev/null +++ b/src/compiler/rules/built_in_symbols.h @@ -0,0 +1,12 @@ +#ifndef __tree_sitter__built_in_symbols__ +#define __tree_sitter__built_in_symbols__ + +#include "./symbol.h" + +namespace tree_sitter { + namespace rules { + extern const Symbol ERROR; + } +} + +#endif diff --git a/src/compiler/rules/rules.cpp b/src/compiler/rules/rules.cpp index 55d7222a..f1a363d9 100644 --- a/src/compiler/rules/rules.cpp +++ b/src/compiler/rules/rules.cpp @@ -8,6 +8,7 @@ #include "./pattern.h" #include "./character_set.h" #include "./repeat.h" +#include "./built_in_symbols.h" namespace tree_sitter { using std::make_shared; @@ -47,5 +48,9 @@ namespace tree_sitter { rule_ptr str(const string &value) { return make_shared(value); } + + rule_ptr err(const rule_ptr &rule) { + return choice({ rule, ERROR.copy() }); + } } } diff --git a/src/compiler/rules/symbol.cpp b/src/compiler/rules/symbol.cpp index ec8e05f7..310945be 100644 --- a/src/compiler/rules/symbol.cpp +++ b/src/compiler/rules/symbol.cpp @@ -35,6 +35,8 @@ namespace tree_sitter { return string("#"; case SymbolTypeAuxiliary: return string("#"; + case SymbolTypeBuiltIn: + return string("#"; } } @@ -44,6 +46,10 @@ namespace tree_sitter { return (name < other.name); } + bool Symbol::is_built_in() const { + return type == SymbolTypeBuiltIn; + } + bool Symbol::is_auxiliary() const { return type == SymbolTypeAuxiliary; } diff --git a/src/compiler/rules/symbol.h b/src/compiler/rules/symbol.h index 7242cfca..cd0c9b72 100644 --- a/src/compiler/rules/symbol.h +++ b/src/compiler/rules/symbol.h @@ -9,7 +9,8 @@ namespace tree_sitter { typedef enum { SymbolTypeNormal, SymbolTypeHidden, - SymbolTypeAuxiliary + SymbolTypeAuxiliary, + SymbolTypeBuiltIn } SymbolType; class Symbol : public Rule { @@ -26,6 +27,7 @@ namespace tree_sitter { void accept(Visitor &visitor) const; bool operator<(const Symbol &other) const; + bool is_built_in() const; bool is_hidden() const; bool is_auxiliary() const; diff --git a/src/runtime/tree.cpp b/src/runtime/tree.cpp index f7e5e010..7cd3574d 100644 --- a/src/runtime/tree.cpp +++ b/src/runtime/tree.cpp @@ -5,8 +5,6 @@ using std::string; using std::to_string; -const ts_symbol ts_symbol_error = -1; - ts_tree * ts_tree_make_leaf(ts_symbol symbol) { ts_tree *result = new ts_tree(); result->ref_count = 0; @@ -33,7 +31,7 @@ ts_tree * ts_tree_make_node(ts_symbol symbol, size_t child_count, ts_tree **chil ts_tree * ts_tree_make_error(char lookahead_char, size_t expected_input_count, const ts_symbol *expected_inputs) { ts_tree *result = new ts_tree(); - result->symbol = ts_symbol_error; + result->symbol = ts_builtin_sym_error; result->data.error = { .lookahead_char = lookahead_char, .expected_input_count = expected_input_count, @@ -58,7 +56,7 @@ void ts_tree_release(ts_tree *tree) { int ts_tree_equals(const ts_tree *node1, const ts_tree *node2) { if (node1->symbol != node2->symbol) return 0; - if (node1->symbol == ts_symbol_error) { + if (node1->symbol == ts_builtin_sym_error) { // check error equality } else { if (node1->data.children.count != node2->data.children.count) @@ -74,18 +72,18 @@ int ts_tree_equals(const ts_tree *node1, const ts_tree *node2) { } ts_tree ** ts_tree_children(const ts_tree *tree) { - if (tree->symbol == ts_symbol_error) return NULL; + if (tree->symbol == ts_builtin_sym_error) return NULL; return tree->data.children.contents; } size_t ts_tree_child_count(const ts_tree *tree) { - if (tree->symbol == ts_symbol_error) return 0; + if (tree->symbol == ts_builtin_sym_error) return 0; return tree->data.children.count; } static string __tree_to_string(const ts_tree *tree, const char **symbol_names) { if (!tree) return "#"; - if (tree->symbol == ts_symbol_error) return "(ERROR)"; + if (tree->symbol == ts_builtin_sym_error) return "(ERROR)"; string result = string("(") + symbol_names[tree->symbol]; for (int i = 0; i < tree->data.children.count; i++) result += " " + __tree_to_string(tree->data.children.contents[i], symbol_names); diff --git a/tree_sitter.xcodeproj/project.pbxproj b/tree_sitter.xcodeproj/project.pbxproj index adcf7e25..15eaaec7 100644 --- a/tree_sitter.xcodeproj/project.pbxproj +++ b/tree_sitter.xcodeproj/project.pbxproj @@ -14,6 +14,7 @@ 12130614182C3A1700FCF928 /* seq.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12130612182C3A1700FCF928 /* seq.cpp */; }; 12130617182C3D2900FCF928 /* string.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12130615182C3D2900FCF928 /* string.cpp */; }; 1214930E181E200B008E9BDA /* main.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 121492E9181E200B008E9BDA /* main.cpp */; }; + 122587AF18BDD28B00A68B84 /* built_in_symbols.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 122587AD18BDD28B00A68B84 /* built_in_symbols.cpp */; }; 122587B118BDD79600A68B84 /* follow_sets_spec.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 122587B018BDD79600A68B84 /* follow_sets_spec.cpp */; }; 1225CC6418765693000D4723 /* prepare_grammar_spec.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1225CC6318765693000D4723 /* prepare_grammar_spec.cpp */; }; 1236A7C518B287DC00593ABB /* character_range.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1236A7C318B287DC00593ABB /* character_range.cpp */; }; @@ -96,6 +97,8 @@ 121492E9181E200B008E9BDA /* main.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = main.cpp; path = spec/main.cpp; sourceTree = SOURCE_ROOT; }; 121492EA181E200B008E9BDA /* rules_spec.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = rules_spec.cpp; path = spec/compiler/rules/rules_spec.cpp; sourceTree = SOURCE_ROOT; }; 121D8B3018795CC0003CF44B /* parser.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = parser.h; sourceTree = ""; }; + 122587AD18BDD28B00A68B84 /* built_in_symbols.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = built_in_symbols.cpp; sourceTree = ""; }; + 122587AE18BDD28B00A68B84 /* built_in_symbols.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = built_in_symbols.h; sourceTree = ""; }; 122587B018BDD79600A68B84 /* follow_sets_spec.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = follow_sets_spec.cpp; sourceTree = ""; }; 1225CC6318765693000D4723 /* prepare_grammar_spec.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = prepare_grammar_spec.cpp; sourceTree = ""; }; 1236A7C318B287DC00593ABB /* character_range.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; lineEnding = 0; path = character_range.cpp; sourceTree = ""; xcLanguageSpecificationIdentifier = xcode.lang.cpp; }; @@ -201,6 +204,8 @@ 12130604182C348F00FCF928 /* character_set.h */, 1213060C182C398300FCF928 /* choice.cpp */, 1213060D182C398300FCF928 /* choice.h */, + 122587AD18BDD28B00A68B84 /* built_in_symbols.cpp */, + 122587AE18BDD28B00A68B84 /* built_in_symbols.h */, 27A340F3EEB184C040521323 /* pattern.cpp */, 27A3438C4FA59A3882E8493B /* pattern.h */, 12D136A2183678A2005F3369 /* repeat.cpp */, @@ -532,6 +537,7 @@ 12EDCFC318820A70005A7A07 /* item_set_transitions.cpp in Sources */, 12FD4064185E75290041A84E /* compile_examples.cpp in Sources */, 12EDCFAF18820387005A7A07 /* parse_table.cpp in Sources */, + 122587AF18BDD28B00A68B84 /* built_in_symbols.cpp in Sources */, 1214930E181E200B008E9BDA /* main.cpp in Sources */, 12F9A651182DD6BC00FAF50C /* grammar.cpp in Sources */, 12D136A4183678A2005F3369 /* repeat.cpp in Sources */,