From 5c1a0982df1a82838129d977ed521d667b26d770 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sat, 15 Feb 2014 15:43:32 -0800 Subject: [PATCH] Change repeat to mean zero-or-more --- include/parser.h | 4 +- spec/compiler/prepare_grammar_spec.cpp | 8 +- spec/fixtures/grammars/json.cpp | 8 +- spec/fixtures/parsers/json.c | 290 ++++++++++-------- spec/runtime/json_spec.cpp | 15 +- src/compiler/build_tables/first_set.cpp | 8 +- src/compiler/build_tables/follow_sets.cpp | 2 +- src/compiler/build_tables/perform.cpp | 18 ++ .../build_tables/rule_can_be_blank.cpp | 7 + src/compiler/build_tables/rule_can_be_blank.h | 3 + .../prepare_grammar/expand_repeats.cpp | 9 +- todo.md | 2 +- 12 files changed, 219 insertions(+), 155 deletions(-) diff --git a/include/parser.h b/include/parser.h index 4dde9eec..bf351c0f 100644 --- a/include/parser.h +++ b/include/parser.h @@ -9,8 +9,8 @@ extern "C" { #include #include -// #define TS_DEBUG_PARSE -// #define TS_DEBUG_LEX +//#define TS_DEBUG_PARSE +//#define TS_DEBUG_LEX #ifdef TS_DEBUG_LEX #define DEBUG_LEX(...) 
fprintf(stderr, __VA_ARGS__) diff --git a/spec/compiler/prepare_grammar_spec.cpp b/spec/compiler/prepare_grammar_spec.cpp index e915d391..dfec69af 100644 --- a/spec/compiler/prepare_grammar_spec.cpp +++ b/spec/compiler/prepare_grammar_spec.cpp @@ -105,12 +105,12 @@ describe("preparing a grammar", []() { sym("y") }) }, }, { - { "repeat_helper1", seq({ - seq({ sym("a"), sym("b") }), - choice({ + { "repeat_helper1", choice({ + seq({ + seq({ sym("a"), sym("b") }), aux_sym("repeat_helper1"), - blank(), }), + blank(), }) } }))); }); diff --git a/spec/fixtures/grammars/json.cpp b/spec/fixtures/grammars/json.cpp index 14e000c7..f623f9a0 100644 --- a/spec/fixtures/grammars/json.cpp +++ b/spec/fixtures/grammars/json.cpp @@ -5,12 +5,12 @@ using namespace tree_sitter; using namespace rules; static rule_ptr comma_sep(const rule_ptr &rule) { - return seq({ - rule, - choice({ + return choice({ + seq({ + rule, repeat(seq({ aux_sym("comma"), rule })), - blank(), }), + blank(), }); } diff --git a/spec/fixtures/parsers/json.c b/spec/fixtures/parsers/json.c index 2bbc83a4..14f6614c 100644 --- a/spec/fixtures/parsers/json.c +++ b/spec/fixtures/parsers/json.c @@ -147,6 +147,24 @@ static void ts_lex(TSParser *parser) { case 20: ACCEPT_TOKEN(ts_aux_colon); case 21: + if (LOOKAHEAD_CHAR() == '\"') + ADVANCE(10); + if (LOOKAHEAD_CHAR() == '}') + ADVANCE(4); + LEX_ERROR(2, EXPECT({"\"", "}"})); + case 22: + if (LOOKAHEAD_CHAR() == '\"') + ADVANCE(10); + if ('0' <= LOOKAHEAD_CHAR() && LOOKAHEAD_CHAR() <= '9') + ADVANCE(16); + if (LOOKAHEAD_CHAR() == '[') + ADVANCE(17); + if (LOOKAHEAD_CHAR() == ']') + ADVANCE(7); + if (LOOKAHEAD_CHAR() == '{') + ADVANCE(18); + LEX_ERROR(5, EXPECT({"\"", "0-9", "[", "]", "{"})); + case 23: if (LOOKAHEAD_CHAR() == '\"') ADVANCE(10); LEX_ERROR(1, EXPECT({"\""})); @@ -200,6 +218,8 @@ static TSParseResult ts_parse(const char *input) { switch (LOOKAHEAD_SYM()) { case ts_symbol_string: SHIFT(4); + case ts_aux_right_brace: + SHIFT(43); default: PARSE_PANIC(); } @@ 
-227,7 +247,7 @@ static TSParseResult ts_parse(const char *input) { case ts_aux_left_brace: SHIFT(13); case ts_aux_left_bracket: - SHIFT(20); + SHIFT(19); default: PARSE_PANIC(); } @@ -249,12 +269,12 @@ static TSParseResult ts_parse(const char *input) { case ts_aux_repeat_helper2: SHIFT(41); case ts_aux_right_brace: - SHIFT(43); + REDUCE(ts_aux_repeat_helper2, 0, COLLAPSE({})); default: PARSE_PANIC(); } case 8: - SET_LEX_STATE(21); + SET_LEX_STATE(23); switch (LOOKAHEAD_SYM()) { case ts_symbol_string: SHIFT(9); @@ -285,7 +305,7 @@ static TSParseResult ts_parse(const char *input) { case ts_aux_left_brace: SHIFT(13); case ts_aux_left_bracket: - SHIFT(20); + SHIFT(19); default: PARSE_PANIC(); } @@ -297,7 +317,7 @@ static TSParseResult ts_parse(const char *input) { case ts_aux_repeat_helper2: SHIFT(12); case ts_aux_right_brace: - REDUCE(ts_aux_repeat_helper2, 4, COLLAPSE({1, 0, 1, 0})); + REDUCE(ts_aux_repeat_helper2, 0, COLLAPSE({})); default: PARSE_PANIC(); } @@ -314,6 +334,8 @@ static TSParseResult ts_parse(const char *input) { switch (LOOKAHEAD_SYM()) { case ts_symbol_string: SHIFT(14); + case ts_aux_right_brace: + SHIFT(40); default: PARSE_PANIC(); } @@ -341,7 +363,7 @@ static TSParseResult ts_parse(const char *input) { case ts_aux_left_brace: SHIFT(13); case ts_aux_left_bracket: - SHIFT(20); + SHIFT(19); default: PARSE_PANIC(); } @@ -353,7 +375,7 @@ static TSParseResult ts_parse(const char *input) { case ts_aux_repeat_helper2: SHIFT(17); case ts_aux_right_brace: - SHIFT(19); + REDUCE(ts_aux_repeat_helper2, 0, COLLAPSE({})); default: PARSE_PANIC(); } @@ -376,32 +398,34 @@ static TSParseResult ts_parse(const char *input) { PARSE_PANIC(); } case 19: - SET_LEX_STATE(2); + SET_LEX_STATE(22); switch (LOOKAHEAD_SYM()) { - case ts_aux_comma: - REDUCE(ts_symbol_object, 5, COLLAPSE({1, 0, 1, 0, 1})); - case ts_aux_right_brace: - REDUCE(ts_symbol_object, 5, COLLAPSE({1, 0, 1, 0, 1})); + case ts_symbol_array: + SHIFT(20); + case ts_symbol_number: + SHIFT(20); + case 
ts_symbol_object: + SHIFT(20); + case ts_symbol_string: + SHIFT(20); + case ts_symbol_value: + SHIFT(21); + case ts_aux_left_brace: + SHIFT(25); + case ts_aux_left_bracket: + SHIFT(32); + case ts_aux_right_bracket: + SHIFT(39); default: PARSE_PANIC(); } case 20: - SET_LEX_STATE(9); + SET_LEX_STATE(6); switch (LOOKAHEAD_SYM()) { - case ts_symbol_array: - SHIFT(21); - case ts_symbol_number: - SHIFT(21); - case ts_symbol_object: - SHIFT(21); - case ts_symbol_string: - SHIFT(21); - case ts_symbol_value: - SHIFT(22); - case ts_aux_left_brace: - SHIFT(26); - case ts_aux_left_bracket: - SHIFT(33); + case ts_aux_comma: + REDUCE(ts_symbol_value, 1, COLLAPSE({0})); + case ts_aux_right_bracket: + REDUCE(ts_symbol_value, 1, COLLAPSE({0})); default: PARSE_PANIC(); } @@ -409,57 +433,47 @@ static TSParseResult ts_parse(const char *input) { SET_LEX_STATE(6); switch (LOOKAHEAD_SYM()) { case ts_aux_comma: - REDUCE(ts_symbol_value, 1, COLLAPSE({0})); + SHIFT(22); + case ts_aux_repeat_helper1: + SHIFT(37); case ts_aux_right_bracket: - REDUCE(ts_symbol_value, 1, COLLAPSE({0})); + REDUCE(ts_aux_repeat_helper1, 0, COLLAPSE({})); default: PARSE_PANIC(); } case 22: - SET_LEX_STATE(6); + SET_LEX_STATE(9); switch (LOOKAHEAD_SYM()) { - case ts_aux_comma: + case ts_symbol_array: + SHIFT(20); + case ts_symbol_number: + SHIFT(20); + case ts_symbol_object: + SHIFT(20); + case ts_symbol_string: + SHIFT(20); + case ts_symbol_value: SHIFT(23); - case ts_aux_repeat_helper1: - SHIFT(38); - case ts_aux_right_bracket: - SHIFT(40); + case ts_aux_left_brace: + SHIFT(25); + case ts_aux_left_bracket: + SHIFT(32); default: PARSE_PANIC(); } case 23: - SET_LEX_STATE(9); + SET_LEX_STATE(6); switch (LOOKAHEAD_SYM()) { - case ts_symbol_array: - SHIFT(21); - case ts_symbol_number: - SHIFT(21); - case ts_symbol_object: - SHIFT(21); - case ts_symbol_string: - SHIFT(21); - case ts_symbol_value: + case ts_aux_comma: + SHIFT(22); + case ts_aux_repeat_helper1: SHIFT(24); - case ts_aux_left_brace: - SHIFT(26); - case 
ts_aux_left_bracket: - SHIFT(33); + case ts_aux_right_bracket: + REDUCE(ts_aux_repeat_helper1, 0, COLLAPSE({})); default: PARSE_PANIC(); } case 24: - SET_LEX_STATE(6); - switch (LOOKAHEAD_SYM()) { - case ts_aux_comma: - SHIFT(23); - case ts_aux_repeat_helper1: - SHIFT(25); - case ts_aux_right_bracket: - REDUCE(ts_aux_repeat_helper1, 2, COLLAPSE({1, 0})); - default: - PARSE_PANIC(); - } - case 25: SET_LEX_STATE(8); switch (LOOKAHEAD_SYM()) { case ts_aux_right_bracket: @@ -467,23 +481,25 @@ static TSParseResult ts_parse(const char *input) { default: PARSE_PANIC(); } - case 26: + case 25: SET_LEX_STATE(21); switch (LOOKAHEAD_SYM()) { case ts_symbol_string: + SHIFT(26); + case ts_aux_right_brace: + SHIFT(31); + default: + PARSE_PANIC(); + } + case 26: + SET_LEX_STATE(19); + switch (LOOKAHEAD_SYM()) { + case ts_aux_colon: SHIFT(27); default: PARSE_PANIC(); } case 27: - SET_LEX_STATE(19); - switch (LOOKAHEAD_SYM()) { - case ts_aux_colon: - SHIFT(28); - default: - PARSE_PANIC(); - } - case 28: SET_LEX_STATE(9); switch (LOOKAHEAD_SYM()) { case ts_symbol_array: @@ -495,31 +511,41 @@ static TSParseResult ts_parse(const char *input) { case ts_symbol_string: SHIFT(6); case ts_symbol_value: - SHIFT(29); + SHIFT(28); case ts_aux_left_brace: SHIFT(13); case ts_aux_left_bracket: - SHIFT(20); + SHIFT(19); default: PARSE_PANIC(); } - case 29: + case 28: SET_LEX_STATE(2); switch (LOOKAHEAD_SYM()) { case ts_aux_comma: SHIFT(8); case ts_aux_repeat_helper2: - SHIFT(30); + SHIFT(29); case ts_aux_right_brace: - SHIFT(32); + REDUCE(ts_aux_repeat_helper2, 0, COLLAPSE({})); + default: + PARSE_PANIC(); + } + case 29: + SET_LEX_STATE(5); + switch (LOOKAHEAD_SYM()) { + case ts_aux_right_brace: + SHIFT(30); default: PARSE_PANIC(); } case 30: - SET_LEX_STATE(5); + SET_LEX_STATE(6); switch (LOOKAHEAD_SYM()) { - case ts_aux_right_brace: - SHIFT(31); + case ts_aux_comma: + REDUCE(ts_symbol_object, 6, COLLAPSE({1, 0, 1, 0, 1, 1})); + case ts_aux_right_bracket: + REDUCE(ts_symbol_object, 6, 
COLLAPSE({1, 0, 1, 0, 1, 1})); default: PARSE_PANIC(); } @@ -527,59 +553,61 @@ static TSParseResult ts_parse(const char *input) { SET_LEX_STATE(6); switch (LOOKAHEAD_SYM()) { case ts_aux_comma: - REDUCE(ts_symbol_object, 6, COLLAPSE({1, 0, 1, 0, 1, 1})); + REDUCE(ts_symbol_object, 2, COLLAPSE({1, 1})); case ts_aux_right_bracket: - REDUCE(ts_symbol_object, 6, COLLAPSE({1, 0, 1, 0, 1, 1})); + REDUCE(ts_symbol_object, 2, COLLAPSE({1, 1})); default: PARSE_PANIC(); } case 32: - SET_LEX_STATE(6); + SET_LEX_STATE(22); switch (LOOKAHEAD_SYM()) { - case ts_aux_comma: - REDUCE(ts_symbol_object, 5, COLLAPSE({1, 0, 1, 0, 1})); + case ts_symbol_array: + SHIFT(20); + case ts_symbol_number: + SHIFT(20); + case ts_symbol_object: + SHIFT(20); + case ts_symbol_string: + SHIFT(20); + case ts_symbol_value: + SHIFT(33); + case ts_aux_left_brace: + SHIFT(25); + case ts_aux_left_bracket: + SHIFT(32); case ts_aux_right_bracket: - REDUCE(ts_symbol_object, 5, COLLAPSE({1, 0, 1, 0, 1})); + SHIFT(36); default: PARSE_PANIC(); } case 33: - SET_LEX_STATE(9); + SET_LEX_STATE(6); switch (LOOKAHEAD_SYM()) { - case ts_symbol_array: - SHIFT(21); - case ts_symbol_number: - SHIFT(21); - case ts_symbol_object: - SHIFT(21); - case ts_symbol_string: - SHIFT(21); - case ts_symbol_value: + case ts_aux_comma: + SHIFT(22); + case ts_aux_repeat_helper1: SHIFT(34); - case ts_aux_left_brace: - SHIFT(26); - case ts_aux_left_bracket: - SHIFT(33); + case ts_aux_right_bracket: + REDUCE(ts_aux_repeat_helper1, 0, COLLAPSE({})); default: PARSE_PANIC(); } case 34: - SET_LEX_STATE(6); + SET_LEX_STATE(8); switch (LOOKAHEAD_SYM()) { - case ts_aux_comma: - SHIFT(23); - case ts_aux_repeat_helper1: - SHIFT(35); case ts_aux_right_bracket: - SHIFT(37); + SHIFT(35); default: PARSE_PANIC(); } case 35: - SET_LEX_STATE(8); + SET_LEX_STATE(6); switch (LOOKAHEAD_SYM()) { + case ts_aux_comma: + REDUCE(ts_symbol_array, 4, COLLAPSE({1, 0, 1, 1})); case ts_aux_right_bracket: - SHIFT(36); + REDUCE(ts_symbol_array, 4, COLLAPSE({1, 0, 1, 
1})); default: PARSE_PANIC(); } @@ -587,27 +615,27 @@ static TSParseResult ts_parse(const char *input) { SET_LEX_STATE(6); switch (LOOKAHEAD_SYM()) { case ts_aux_comma: - REDUCE(ts_symbol_array, 4, COLLAPSE({1, 0, 1, 1})); + REDUCE(ts_symbol_array, 2, COLLAPSE({1, 1})); case ts_aux_right_bracket: - REDUCE(ts_symbol_array, 4, COLLAPSE({1, 0, 1, 1})); + REDUCE(ts_symbol_array, 2, COLLAPSE({1, 1})); default: PARSE_PANIC(); } case 37: - SET_LEX_STATE(6); + SET_LEX_STATE(8); switch (LOOKAHEAD_SYM()) { - case ts_aux_comma: - REDUCE(ts_symbol_array, 3, COLLAPSE({1, 0, 1})); case ts_aux_right_bracket: - REDUCE(ts_symbol_array, 3, COLLAPSE({1, 0, 1})); + SHIFT(38); default: PARSE_PANIC(); } case 38: - SET_LEX_STATE(8); + SET_LEX_STATE(2); switch (LOOKAHEAD_SYM()) { - case ts_aux_right_bracket: - SHIFT(39); + case ts_aux_comma: + REDUCE(ts_symbol_array, 4, COLLAPSE({1, 0, 1, 1})); + case ts_aux_right_brace: + REDUCE(ts_symbol_array, 4, COLLAPSE({1, 0, 1, 1})); default: PARSE_PANIC(); } @@ -615,9 +643,9 @@ static TSParseResult ts_parse(const char *input) { SET_LEX_STATE(2); switch (LOOKAHEAD_SYM()) { case ts_aux_comma: - REDUCE(ts_symbol_array, 4, COLLAPSE({1, 0, 1, 1})); + REDUCE(ts_symbol_array, 2, COLLAPSE({1, 1})); case ts_aux_right_brace: - REDUCE(ts_symbol_array, 4, COLLAPSE({1, 0, 1, 1})); + REDUCE(ts_symbol_array, 2, COLLAPSE({1, 1})); default: PARSE_PANIC(); } @@ -625,9 +653,9 @@ static TSParseResult ts_parse(const char *input) { SET_LEX_STATE(2); switch (LOOKAHEAD_SYM()) { case ts_aux_comma: - REDUCE(ts_symbol_array, 3, COLLAPSE({1, 0, 1})); + REDUCE(ts_symbol_object, 2, COLLAPSE({1, 1})); case ts_aux_right_brace: - REDUCE(ts_symbol_array, 3, COLLAPSE({1, 0, 1})); + REDUCE(ts_symbol_object, 2, COLLAPSE({1, 1})); default: PARSE_PANIC(); } @@ -651,27 +679,29 @@ static TSParseResult ts_parse(const char *input) { SET_LEX_STATE(0); switch (LOOKAHEAD_SYM()) { case ts_aux_end: - REDUCE(ts_symbol_object, 5, COLLAPSE({1, 0, 1, 0, 1})); + REDUCE(ts_symbol_object, 2, 
COLLAPSE({1, 1})); default: PARSE_PANIC(); } case 44: - SET_LEX_STATE(9); + SET_LEX_STATE(22); switch (LOOKAHEAD_SYM()) { case ts_symbol_array: - SHIFT(21); + SHIFT(20); case ts_symbol_number: - SHIFT(21); + SHIFT(20); case ts_symbol_object: - SHIFT(21); + SHIFT(20); case ts_symbol_string: - SHIFT(21); + SHIFT(20); case ts_symbol_value: SHIFT(45); case ts_aux_left_brace: - SHIFT(26); + SHIFT(25); case ts_aux_left_bracket: - SHIFT(33); + SHIFT(32); + case ts_aux_right_bracket: + SHIFT(48); default: PARSE_PANIC(); } @@ -679,11 +709,11 @@ static TSParseResult ts_parse(const char *input) { SET_LEX_STATE(6); switch (LOOKAHEAD_SYM()) { case ts_aux_comma: - SHIFT(23); + SHIFT(22); case ts_aux_repeat_helper1: SHIFT(46); case ts_aux_right_bracket: - SHIFT(48); + REDUCE(ts_aux_repeat_helper1, 0, COLLAPSE({})); default: PARSE_PANIC(); } @@ -707,7 +737,7 @@ static TSParseResult ts_parse(const char *input) { SET_LEX_STATE(0); switch (LOOKAHEAD_SYM()) { case ts_aux_end: - REDUCE(ts_symbol_array, 3, COLLAPSE({1, 0, 1})); + REDUCE(ts_symbol_array, 2, COLLAPSE({1, 1})); default: PARSE_PANIC(); } diff --git a/spec/runtime/json_spec.cpp b/spec/runtime/json_spec.cpp index 67c5b15a..c6d72ae2 100644 --- a/spec/runtime/json_spec.cpp +++ b/spec/runtime/json_spec.cpp @@ -25,14 +25,23 @@ describe("json", []() { }); it("parses objects", [&]() { - TSDocumentSetText(document, "{\"key1\":1,\"key2\":2}"); - AssertThat(string(TSDocumentToString(document)), Equals("(value (object (string) (value (number)) (string) (value (number))))")); - + TSDocumentSetText(document, "{}"); + AssertThat(string(TSDocumentToString(document)), Equals("(value (object))")); + TSDocumentSetText(document, "{\"key1\":1}"); AssertThat(string(TSDocumentToString(document)), Equals("(value (object (string) (value (number))))")); + + TSDocumentSetText(document, "{\"key1\":1,\"key2\":2}"); + AssertThat(string(TSDocumentToString(document)), Equals("(value (object (string) (value (number)) (string) (value (number))))")); }); 
it("parses arrays", [&]() { + TSDocumentSetText(document, "[]"); + AssertThat(string(TSDocumentToString(document)), Equals("(value (array))")); + + TSDocumentSetText(document, "[5]"); + AssertThat(string(TSDocumentToString(document)), Equals("(value (array (value (number))))")); + TSDocumentSetText(document, "[1,2,3]"); AssertThat(string(TSDocumentToString(document)), Equals("(value (array (value (number)) (value (number)) (value (number))))")); }); diff --git a/src/compiler/build_tables/first_set.cpp b/src/compiler/build_tables/first_set.cpp index 850786e2..a756e5d4 100644 --- a/src/compiler/build_tables/first_set.cpp +++ b/src/compiler/build_tables/first_set.cpp @@ -32,15 +32,9 @@ namespace tree_sitter { value = set_union(apply(rule->left, grammar), apply(rule->right, grammar)); } - bool can_be_blank(const rule_ptr &rule) { - if (rule_can_be_blank(rule)) return true; - auto symbol = std::dynamic_pointer_cast(rule); - return (symbol.get() && grammar.has_definition(*symbol) && rule_can_be_blank(grammar.rule(*symbol))); - } - void visit(const Seq *rule) { value = apply(rule->left, grammar); - if (can_be_blank(rule->left)) { + if (rule_can_be_blank(rule->left, grammar)) { value = set_union(value, apply(rule->right, grammar)); } } diff --git a/src/compiler/build_tables/follow_sets.cpp b/src/compiler/build_tables/follow_sets.cpp index 9db5b352..a32dba01 100644 --- a/src/compiler/build_tables/follow_sets.cpp +++ b/src/compiler/build_tables/follow_sets.cpp @@ -21,7 +21,7 @@ namespace tree_sitter { rule_ptr next_rule = pair.second; if (grammar.has_definition(symbol)) { set following_non_terminals = first_set(next_rule, grammar); - if (rule_can_be_blank(next_rule)) + if (rule_can_be_blank(next_rule, grammar)) following_non_terminals.insert(item.lookahead_sym); result.insert({ symbol, following_non_terminals }); } diff --git a/src/compiler/build_tables/perform.cpp b/src/compiler/build_tables/perform.cpp index 65314d95..c483d80c 100644 --- 
a/src/compiler/build_tables/perform.cpp +++ b/src/compiler/build_tables/perform.cpp @@ -5,6 +5,8 @@ #include "rules.h" #include "grammar.h" +#include "stream_methods.h" + namespace tree_sitter { using std::pair; using std::string; @@ -109,6 +111,22 @@ namespace tree_sitter { return state_index; } + // TODO - remove + void dump_item_sets() { + std::vector item_sets(parse_state_indices.size()); + for (auto &pair : parse_state_indices) + item_sets[pair.second] = &pair.first; + + for (int i = 0; i < item_sets.size(); i++) { + std::cout << "\n\n" << i; + for (auto &item : *item_sets[i]) { + cout << "\n" << item.lhs; + cout << "\n " << item.rule; + cout << "\n " << item.lookahead_sym.name; + } + } + } + public: TableBuilder(const Grammar &grammar, const Grammar &lex_grammar) : diff --git a/src/compiler/build_tables/rule_can_be_blank.cpp b/src/compiler/build_tables/rule_can_be_blank.cpp index 6cf0f7cc..2e636cca 100644 --- a/src/compiler/build_tables/rule_can_be_blank.cpp +++ b/src/compiler/build_tables/rule_can_be_blank.cpp @@ -1,4 +1,5 @@ #include "rule_can_be_blank.h" +#include "grammar.h" #include "rules.h" namespace tree_sitter { @@ -35,5 +36,11 @@ namespace tree_sitter { rule->accept(visitor); return visitor.value; } + + bool rule_can_be_blank(const rule_ptr &rule, const Grammar &grammar) { + if (rule_can_be_blank(rule)) return true; + auto symbol = std::dynamic_pointer_cast(rule); + return (symbol.get() && grammar.has_definition(*symbol) && rule_can_be_blank(grammar.rule(*symbol), grammar)); + } } } diff --git a/src/compiler/build_tables/rule_can_be_blank.h b/src/compiler/build_tables/rule_can_be_blank.h index 5d235ee9..ab0a2579 100644 --- a/src/compiler/build_tables/rule_can_be_blank.h +++ b/src/compiler/build_tables/rule_can_be_blank.h @@ -4,8 +4,11 @@ #include "rule.h" namespace tree_sitter { + class Grammar; + namespace build_tables { bool rule_can_be_blank(const rules::rule_ptr &rule); + bool rule_can_be_blank(const rules::rule_ptr &rule, const Grammar 
&grammar); } } diff --git a/src/compiler/prepare_grammar/expand_repeats.cpp b/src/compiler/prepare_grammar/expand_repeats.cpp index 53db30de..5f297050 100644 --- a/src/compiler/prepare_grammar/expand_repeats.cpp +++ b/src/compiler/prepare_grammar/expand_repeats.cpp @@ -19,9 +19,12 @@ namespace tree_sitter { } rule_ptr make_repeat_helper(string name, const rule_ptr &rule) { - return seq({ - rule, - choice({ aux_sym(name), blank() }) + return choice({ + seq({ + rule, + aux_sym(name), + }), + blank(), }); } diff --git a/todo.md b/todo.md index 5ac62183..a7900550 100644 --- a/todo.md +++ b/todo.md @@ -2,8 +2,8 @@ TODO ==== ## correct batch parsing +- allow spaces between symbols by default - add comments to generated C code giving an example string for each token -- change the meaning of 'repeat' from 1-or-more to 0-or-more - fix any memory leaks - add special lexical behavior for indentation-aware languages