From 67fa81d0793b24e59e4d05f52c6896c84dd7f125 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 24 Jan 2014 18:25:56 -0800 Subject: [PATCH] Convert repeat rules into pairs of recursive rules --- spec/compiler/expand_repeats_spec.cpp | 36 ++ .../extract_tokens_spec.cpp} | 8 +- spec/fixtures/parsers/arithmetic.c | 72 ++-- spec/fixtures/parsers/json.c | 318 +++++++++--------- spec/runtime/json_spec.cpp | 3 + src/compiler/build_tables/item.cpp | 2 + .../prepare_grammar/expand_repeats.cpp | 58 ++++ src/compiler/prepare_grammar/expand_repeats.h | 12 + src/compiler/prepare_grammar/perform.cpp | 6 +- tree_sitter.xcodeproj/project.pbxproj | 31 +- 10 files changed, 328 insertions(+), 218 deletions(-) create mode 100644 spec/compiler/expand_repeats_spec.cpp rename spec/compiler/{prepare_grammar_spec.cpp => prepare_grammar/extract_tokens_spec.cpp} (88%) create mode 100644 src/compiler/prepare_grammar/expand_repeats.cpp create mode 100644 src/compiler/prepare_grammar/expand_repeats.h diff --git a/spec/compiler/expand_repeats_spec.cpp b/spec/compiler/expand_repeats_spec.cpp new file mode 100644 index 00000000..9e280e47 --- /dev/null +++ b/spec/compiler/expand_repeats_spec.cpp @@ -0,0 +1,36 @@ +#include "spec_helper.h" +#include "prepare_grammar/expand_repeats.h" + +START_TEST + +using prepare_grammar::expand_repeats; +using namespace rules; + +describe("expanding repeat rules in a grammar", []() { + it("replaces repeat rules with pairs of recursive rules", [&]() { + Grammar result = expand_repeats(Grammar({ + { "rule1", seq({ + sym("x"), + repeat(seq({ sym("a"), sym("b") })), + sym("y") + }) }, + })); + + AssertThat(result, Equals(Grammar({ + { "rule1", seq({ + sym("x"), + sym("repeat_helper1"), + sym("y") + }) }, + { "repeat_helper1", seq({ + seq({ sym("a"), sym("b") }), + choice({ + sym("repeat_helper1") , + blank() + }), + }) } + }))); + }); +}); + +END_TEST \ No newline at end of file diff --git a/spec/compiler/prepare_grammar_spec.cpp b/spec/compiler/prepare_grammar/extract_tokens_spec.cpp similarity index 88% rename from spec/compiler/prepare_grammar_spec.cpp rename to spec/compiler/prepare_grammar/extract_tokens_spec.cpp index cef0ecbc..9631e0b1 100644 --- a/spec/compiler/prepare_grammar_spec.cpp +++ b/spec/compiler/prepare_grammar/extract_tokens_spec.cpp @@ -1,14 +1,14 @@ #include "spec_helper.h" -#include "prepare_grammar/perform.h" +#include "prepare_grammar/extract_tokens.h" START_TEST -using prepare_grammar::perform; +using prepare_grammar::extract_tokens; using namespace rules; describe("preparing a grammar", []() { it("extracts character-based subtrees into a separate grammar", [&]() { - pair result = perform(Grammar({ + pair result = extract_tokens(Grammar({ { "rule1", seq({ character('a'), character('b'), @@ -38,7 +38,7 @@ describe("preparing a grammar", []() { }); it("turns entire rules into tokens when they contain no symbols", [&]() { - auto result = perform(Grammar({ + auto result = extract_tokens(Grammar({ { "rule1", sym("rule2") }, { "rule2", seq({ character('a'), diff --git a/spec/fixtures/parsers/arithmetic.c b/spec/fixtures/parsers/arithmetic.c index 20ba5bed..52cbbd93 100644 --- a/spec/fixtures/parsers/arithmetic.c +++ b/spec/fixtures/parsers/arithmetic.c @@ -3,12 +3,12 @@ enum ts_symbol { ts_symbol_expression, - ts_symbol_term, ts_symbol_factor, + ts_symbol_term, ts_symbol_times, - ts_symbol_plus, ts_symbol_2, ts_symbol_1, + ts_symbol_plus, ts_symbol_number, ts_symbol___END__, ts_symbol_variable, @@ -16,12 +16,12 @@ enum ts_symbol { static const char *ts_symbol_names[] = { "expression", - "term", "factor", + "term", "times", - "plus", "2", "1", + "plus", "number", "__END__", "variable", @@ -75,44 +75,28 @@ static void ts_lex(TSParser *parser) { case 10: if (isalnum(LOOKAHEAD_CHAR())) ADVANCE(13); - if (LOOKAHEAD_CHAR() == '(') + if (isdigit(LOOKAHEAD_CHAR())) ADVANCE(12); - if (isdigit(LOOKAHEAD_CHAR())) + if (LOOKAHEAD_CHAR() == '(') ADVANCE(11); - LEX_ERROR(3, EXPECT({"", "'('", ""})); + LEX_ERROR(3, EXPECT({"'('", "", ""})); case 11: - if (isdigit(LOOKAHEAD_CHAR())) - ADVANCE(11); - ACCEPT_TOKEN(ts_symbol_number); - case 12: ACCEPT_TOKEN(ts_symbol_1); + case 12: + if (isdigit(LOOKAHEAD_CHAR())) + ADVANCE(12); + ACCEPT_TOKEN(ts_symbol_number); case 13: if (isalnum(LOOKAHEAD_CHAR())) ADVANCE(13); ACCEPT_TOKEN(ts_symbol_variable); case 14: - if (isalnum(LOOKAHEAD_CHAR())) - ADVANCE(13); - if (LOOKAHEAD_CHAR() == '(') - ADVANCE(12); - if (isdigit(LOOKAHEAD_CHAR())) - ADVANCE(11); - LEX_ERROR(3, EXPECT({"", "'('", ""})); - case 15: - if (LOOKAHEAD_CHAR() == '+') - ADVANCE(8); - if (LOOKAHEAD_CHAR() == '*') - ADVANCE(3); - if (LOOKAHEAD_CHAR() == ')') - ADVANCE(5); - LEX_ERROR(3, EXPECT({"')'", "'*'", "'+'"})); - case 16: if (LOOKAHEAD_CHAR() == '+') ADVANCE(8); if (LOOKAHEAD_CHAR() == '\0') ADVANCE(1); LEX_ERROR(2, EXPECT({"''", "'+'"})); - case 17: + case 15: if (LOOKAHEAD_CHAR() == '*') ADVANCE(3); if (LOOKAHEAD_CHAR() == '+') @@ -120,14 +104,6 @@ static void ts_lex(TSParser *parser) { if (LOOKAHEAD_CHAR() == '\0') ADVANCE(1); LEX_ERROR(3, EXPECT({"''", "'+'", "'*'"})); - case 18: - if (LOOKAHEAD_CHAR() == '+') - ADVANCE(8); - if (LOOKAHEAD_CHAR() == '*') - ADVANCE(3); - if (LOOKAHEAD_CHAR() == '\0') - ADVANCE(1); - LEX_ERROR(3, EXPECT({"''", "'*'", "'+'"})); default: LEX_PANIC(); } @@ -164,7 +140,7 @@ static TSParseResult ts_parse(const char *input) { PARSE_ERROR(1, EXPECT({"__END__"})); } case 2: - SET_LEX_STATE(16); + SET_LEX_STATE(14); switch (LOOKAHEAD_SYM()) { case ts_symbol___END__: REDUCE(ts_symbol_expression, 1); @@ -338,7 +314,7 @@ static TSParseResult ts_parse(const char *input) { PARSE_ERROR(3, EXPECT({"2", "plus", "times"})); } case 16: - SET_LEX_STATE(15); + SET_LEX_STATE(7); switch (LOOKAHEAD_SYM()) { case ts_symbol_plus: REDUCE(ts_symbol_term, 1); @@ -350,7 +326,7 @@ static TSParseResult ts_parse(const char *input) { PARSE_ERROR(3, EXPECT({"times", "2", "plus"})); } case 17: - SET_LEX_STATE(14); + SET_LEX_STATE(10); switch (LOOKAHEAD_SYM()) { case ts_symbol_factor: SHIFT(22); @@ -448,7 +424,7 @@ static TSParseResult ts_parse(const char *input) { PARSE_ERROR(2, EXPECT({"times", "2"})); } case 26: - SET_LEX_STATE(14); + SET_LEX_STATE(10); switch (LOOKAHEAD_SYM()) { case ts_symbol_factor: SHIFT(31); @@ -540,7 +516,7 @@ static TSParseResult ts_parse(const char *input) { PARSE_ERROR(2, EXPECT({"times", "__END__"})); } case 35: - SET_LEX_STATE(14); + SET_LEX_STATE(10); switch (LOOKAHEAD_SYM()) { case ts_symbol_factor: SHIFT(40); @@ -604,7 +580,7 @@ static TSParseResult ts_parse(const char *input) { PARSE_ERROR(1, EXPECT({"__END__"})); } case 41: - SET_LEX_STATE(17); + SET_LEX_STATE(15); switch (LOOKAHEAD_SYM()) { case ts_symbol_times: REDUCE(ts_symbol_factor, 1); @@ -642,7 +618,7 @@ static TSParseResult ts_parse(const char *input) { PARSE_ERROR(1, EXPECT({"2"})); } case 44: - SET_LEX_STATE(17); + SET_LEX_STATE(15); switch (LOOKAHEAD_SYM()) { case ts_symbol_times: REDUCE(ts_symbol_factor, 3); @@ -654,7 +630,7 @@ static TSParseResult ts_parse(const char *input) { PARSE_ERROR(3, EXPECT({"__END__", "plus", "times"})); } case 45: - SET_LEX_STATE(18); + SET_LEX_STATE(15); switch (LOOKAHEAD_SYM()) { case ts_symbol_plus: REDUCE(ts_symbol_term, 1); @@ -666,7 +642,7 @@ static TSParseResult ts_parse(const char *input) { PARSE_ERROR(3, EXPECT({"times", "__END__", "plus"})); } case 46: - SET_LEX_STATE(14); + SET_LEX_STATE(10); switch (LOOKAHEAD_SYM()) { case ts_symbol_factor: SHIFT(51); @@ -680,7 +656,7 @@ static TSParseResult ts_parse(const char *input) { PARSE_ERROR(4, EXPECT({"variable", "number", "1", "factor"})); } case 47: - SET_LEX_STATE(16); + SET_LEX_STATE(14); switch (LOOKAHEAD_SYM()) { case ts_symbol_plus: REDUCE(ts_symbol_factor, 1); @@ -716,7 +692,7 @@ static TSParseResult ts_parse(const char *input) { PARSE_ERROR(1, EXPECT({"2"})); } case 50: - SET_LEX_STATE(16); + SET_LEX_STATE(14); switch (LOOKAHEAD_SYM()) { case ts_symbol_plus: REDUCE(ts_symbol_factor, 3); @@ -726,7 +702,7 @@ static TSParseResult ts_parse(const char *input) { PARSE_ERROR(2, EXPECT({"__END__", "plus"})); } case 51: - SET_LEX_STATE(16); + SET_LEX_STATE(14); switch (LOOKAHEAD_SYM()) { case ts_symbol_plus: REDUCE(ts_symbol_term, 3); diff --git a/spec/fixtures/parsers/json.c b/spec/fixtures/parsers/json.c index bcb3fce0..898e132f 100644 --- a/spec/fixtures/parsers/json.c +++ b/spec/fixtures/parsers/json.c @@ -2,8 +2,10 @@ #include enum ts_symbol { + ts_symbol_repeat_helper1, ts_symbol_value, ts_symbol_object, + ts_symbol_repeat_helper2, ts_symbol_array, ts_symbol___END__, ts_symbol_number, @@ -18,8 +20,10 @@ enum ts_symbol { }; static const char *ts_symbol_names[] = { + "repeat_helper1", "value", "object", + "repeat_helper2", "array", "__END__", "number", @@ -47,7 +51,7 @@ static void ts_lex(TSParser *parser) { ADVANCE(3); ACCEPT_TOKEN(ts_symbol_2); case 3: - LEX_ERROR(0, EXPECT({})); + ACCEPT_TOKEN(ts_symbol_7); case 4: if (LOOKAHEAD_CHAR() == ']') ADVANCE(5); @@ -55,11 +59,11 @@ static void ts_lex(TSParser *parser) { case 5: ACCEPT_TOKEN(ts_symbol_3); case 6: - if (LOOKAHEAD_CHAR() == ',') + if (LOOKAHEAD_CHAR() == '}') ADVANCE(7); - ACCEPT_TOKEN(ts_symbol_6); + LEX_ERROR(1, EXPECT({"'}'"})); case 7: - LEX_ERROR(0, EXPECT({})); + ACCEPT_TOKEN(ts_symbol_6); case 8: if (LOOKAHEAD_CHAR() == '\"') ADVANCE(12); @@ -91,18 +95,12 @@ static void ts_lex(TSParser *parser) { case 14: ACCEPT_TOKEN(ts_symbol_string); case 15: - if (LOOKAHEAD_CHAR() == '}') - ADVANCE(16); - LEX_ERROR(1, EXPECT({"'}'"})); - case 16: - ACCEPT_TOKEN(ts_symbol_7); - case 17: if (LOOKAHEAD_CHAR() == ':') - ADVANCE(18); + ADVANCE(16); LEX_ERROR(1, EXPECT({"':'"})); - case 18: + case 16: ACCEPT_TOKEN(ts_symbol_5); - case 19: + case 17: if (LOOKAHEAD_CHAR() == '\"') ADVANCE(12); LEX_ERROR(1, EXPECT({"'\"'"})); @@ -119,15 +117,15 @@ static TSParseResult ts_parse(const char *input) { SET_LEX_STATE(8); switch (LOOKAHEAD_SYM()) { case ts_symbol_number: - SHIFT(34); + SHIFT(31); case ts_symbol_string: - SHIFT(34); + SHIFT(31); case ts_symbol_array: - SHIFT(34); + SHIFT(31); case ts_symbol_object: - SHIFT(34); + SHIFT(31); case ts_symbol_4: - SHIFT(28); + SHIFT(25); case ts_symbol_1: SHIFT(2); case ts_symbol_value: @@ -147,17 +145,17 @@ static TSParseResult ts_parse(const char *input) { SET_LEX_STATE(8); switch (LOOKAHEAD_SYM()) { case ts_symbol_number: - SHIFT(14); + SHIFT(19); case ts_symbol_string: - SHIFT(14); + SHIFT(19); case ts_symbol_array: - SHIFT(14); + SHIFT(19); case ts_symbol_object: - SHIFT(14); + SHIFT(19); case ts_symbol_4: - SHIFT(7); + SHIFT(8); case ts_symbol_value: - SHIFT(25); + SHIFT(22); case ts_symbol_1: SHIFT(3); default: @@ -167,15 +165,15 @@ static TSParseResult ts_parse(const char *input) { SET_LEX_STATE(8); switch (LOOKAHEAD_SYM()) { case ts_symbol_number: - SHIFT(14); + SHIFT(19); case ts_symbol_string: - SHIFT(14); + SHIFT(19); case ts_symbol_array: - SHIFT(14); + SHIFT(19); case ts_symbol_object: - SHIFT(14); + SHIFT(19); case ts_symbol_4: - SHIFT(7); + SHIFT(8); case ts_symbol_value: SHIFT(4); case ts_symbol_1: @@ -186,10 +184,14 @@ static TSParseResult ts_parse(const char *input) { case 4: SET_LEX_STATE(2); switch (LOOKAHEAD_SYM()) { + case ts_symbol_7: + SHIFT(7); case ts_symbol_2: SHIFT(5); + case ts_symbol_repeat_helper1: + SHIFT(5); default: - PARSE_ERROR(1, EXPECT({"2"})); + PARSE_ERROR(3, EXPECT({"repeat_helper1", "2", "7"})); } case 5: SET_LEX_STATE(4); @@ -202,60 +204,62 @@ static TSParseResult ts_parse(const char *input) { case 6: SET_LEX_STATE(2); switch (LOOKAHEAD_SYM()) { + case ts_symbol_7: + REDUCE(ts_symbol_array, 4); case ts_symbol_2: REDUCE(ts_symbol_array, 4); default: - PARSE_ERROR(1, EXPECT({"2"})); + PARSE_ERROR(2, EXPECT({"2", "7"})); } case 7: - SET_LEX_STATE(19); + SET_LEX_STATE(8); switch (LOOKAHEAD_SYM()) { + case ts_symbol_number: + SHIFT(19); case ts_symbol_string: + SHIFT(19); + case ts_symbol_array: + SHIFT(19); + case ts_symbol_object: + SHIFT(19); + case ts_symbol_4: SHIFT(8); + case ts_symbol_value: + SHIFT(20); + case ts_symbol_1: + SHIFT(3); default: - PARSE_ERROR(1, EXPECT({"string"})); + PARSE_ERROR(7, EXPECT({"1", "value", "object", "array", "4", "string", "number"})); } case 8: SET_LEX_STATE(17); switch (LOOKAHEAD_SYM()) { - case ts_symbol_5: + case ts_symbol_string: SHIFT(9); default: - PARSE_ERROR(1, EXPECT({"5"})); + PARSE_ERROR(1, EXPECT({"string"})); } case 9: - SET_LEX_STATE(8); + SET_LEX_STATE(15); switch (LOOKAHEAD_SYM()) { - case ts_symbol_number: - SHIFT(21); - case ts_symbol_string: - SHIFT(21); - case ts_symbol_array: - SHIFT(21); - case ts_symbol_object: - SHIFT(21); - case ts_symbol_4: - SHIFT(15); - case ts_symbol_value: - SHIFT(22); - case ts_symbol_1: + case ts_symbol_5: SHIFT(10); default: - PARSE_ERROR(7, EXPECT({"1", "value", "object", "array", "4", "string", "number"})); + PARSE_ERROR(1, EXPECT({"5"})); } case 10: SET_LEX_STATE(8); switch (LOOKAHEAD_SYM()) { case ts_symbol_number: - SHIFT(14); + SHIFT(19); case ts_symbol_string: - SHIFT(14); + SHIFT(19); case ts_symbol_array: - SHIFT(14); + SHIFT(19); case ts_symbol_object: - SHIFT(14); + SHIFT(19); case ts_symbol_4: - SHIFT(7); + SHIFT(8); case ts_symbol_value: SHIFT(11); case ts_symbol_1: @@ -266,144 +270,140 @@ static TSParseResult ts_parse(const char *input) { case 11: SET_LEX_STATE(2); switch (LOOKAHEAD_SYM()) { + case ts_symbol_7: + SHIFT(14); case ts_symbol_2: SHIFT(12); + case ts_symbol_repeat_helper2: + SHIFT(12); default: - PARSE_ERROR(1, EXPECT({"2"})); + PARSE_ERROR(3, EXPECT({"repeat_helper2", "2", "7"})); } case 12: - SET_LEX_STATE(4); - switch (LOOKAHEAD_SYM()) { - case ts_symbol_3: - SHIFT(13); - default: - PARSE_ERROR(1, EXPECT({"3"})); - } - case 13: SET_LEX_STATE(6); switch (LOOKAHEAD_SYM()) { case ts_symbol_6: - REDUCE(ts_symbol_array, 4); + SHIFT(13); default: PARSE_ERROR(1, EXPECT({"6"})); } - case 14: + case 13: SET_LEX_STATE(2); switch (LOOKAHEAD_SYM()) { + case ts_symbol_7: + REDUCE(ts_symbol_object, 6); case ts_symbol_2: - REDUCE(ts_symbol_value, 1); + REDUCE(ts_symbol_object, 6); default: - PARSE_ERROR(1, EXPECT({"2"})); + PARSE_ERROR(2, EXPECT({"2", "7"})); } - case 15: - SET_LEX_STATE(19); + case 14: + SET_LEX_STATE(17); switch (LOOKAHEAD_SYM()) { case ts_symbol_string: - SHIFT(16); + SHIFT(15); default: PARSE_ERROR(1, EXPECT({"string"})); } - case 16: - SET_LEX_STATE(17); + case 15: + SET_LEX_STATE(15); switch (LOOKAHEAD_SYM()) { case ts_symbol_5: - SHIFT(17); + SHIFT(16); default: PARSE_ERROR(1, EXPECT({"5"})); } - case 17: + case 16: SET_LEX_STATE(8); switch (LOOKAHEAD_SYM()) { case ts_symbol_number: - SHIFT(21); + SHIFT(19); case ts_symbol_string: - SHIFT(21); + SHIFT(19); case ts_symbol_array: - SHIFT(21); + SHIFT(19); case ts_symbol_object: - SHIFT(21); + SHIFT(19); case ts_symbol_4: - SHIFT(15); + SHIFT(8); case ts_symbol_value: - SHIFT(18); + SHIFT(17); case ts_symbol_1: - SHIFT(10); + SHIFT(3); default: PARSE_ERROR(7, EXPECT({"1", "value", "object", "array", "4", "string", "number"})); } + case 17: + SET_LEX_STATE(2); + switch (LOOKAHEAD_SYM()) { + case ts_symbol_7: + SHIFT(14); + case ts_symbol_2: + SHIFT(18); + case ts_symbol_repeat_helper2: + SHIFT(18); + default: + PARSE_ERROR(3, EXPECT({"repeat_helper2", "2", "7"})); + } case 18: SET_LEX_STATE(6); switch (LOOKAHEAD_SYM()) { case ts_symbol_6: - SHIFT(19); + REDUCE(ts_symbol_repeat_helper2, 5); default: PARSE_ERROR(1, EXPECT({"6"})); } case 19: - SET_LEX_STATE(15); + SET_LEX_STATE(2); switch (LOOKAHEAD_SYM()) { case ts_symbol_7: - SHIFT(20); - default: - PARSE_ERROR(1, EXPECT({"7"})); - } - case 20: - SET_LEX_STATE(6); - switch (LOOKAHEAD_SYM()) { - case ts_symbol_6: - REDUCE(ts_symbol_object, 6); - default: - PARSE_ERROR(1, EXPECT({"6"})); - } - case 21: - SET_LEX_STATE(6); - switch (LOOKAHEAD_SYM()) { - case ts_symbol_6: + REDUCE(ts_symbol_value, 1); + case ts_symbol_2: REDUCE(ts_symbol_value, 1); default: - PARSE_ERROR(1, EXPECT({"6"})); + PARSE_ERROR(2, EXPECT({"2", "7"})); } - case 22: - SET_LEX_STATE(6); - switch (LOOKAHEAD_SYM()) { - case ts_symbol_6: - SHIFT(23); - default: - PARSE_ERROR(1, EXPECT({"6"})); - } - case 23: - SET_LEX_STATE(15); + case 20: + SET_LEX_STATE(2); switch (LOOKAHEAD_SYM()) { case ts_symbol_7: - SHIFT(24); - default: - PARSE_ERROR(1, EXPECT({"7"})); - } - case 24: - SET_LEX_STATE(2); - switch (LOOKAHEAD_SYM()) { + SHIFT(7); case ts_symbol_2: - REDUCE(ts_symbol_object, 6); + SHIFT(21); + case ts_symbol_repeat_helper1: + SHIFT(21); default: - PARSE_ERROR(1, EXPECT({"2"})); + PARSE_ERROR(3, EXPECT({"repeat_helper1", "2", "7"})); } - case 25: - SET_LEX_STATE(2); - switch (LOOKAHEAD_SYM()) { - case ts_symbol_2: - SHIFT(26); - default: - PARSE_ERROR(1, EXPECT({"2"})); - } - case 26: + case 21: SET_LEX_STATE(4); switch (LOOKAHEAD_SYM()) { case ts_symbol_3: - SHIFT(27); + REDUCE(ts_symbol_repeat_helper1, 3); default: PARSE_ERROR(1, EXPECT({"3"})); } - case 27: + case 22: + SET_LEX_STATE(2); + switch (LOOKAHEAD_SYM()) { + case ts_symbol_7: + SHIFT(7); + case ts_symbol_2: + SHIFT(23); + case ts_symbol_repeat_helper1: + SHIFT(23); + default: + PARSE_ERROR(3, EXPECT({"repeat_helper1", "2", "7"})); + } + case 23: + SET_LEX_STATE(4); + switch (LOOKAHEAD_SYM()) { + case ts_symbol_3: + SHIFT(24); + default: + PARSE_ERROR(1, EXPECT({"3"})); + } + case 24: SET_LEX_STATE(0); switch (LOOKAHEAD_SYM()) { case ts_symbol___END__: @@ -411,59 +411,63 @@ static TSParseResult ts_parse(const char *input) { default: PARSE_ERROR(1, EXPECT({"__END__"})); } - case 28: - SET_LEX_STATE(19); + case 25: + SET_LEX_STATE(17); switch (LOOKAHEAD_SYM()) { case ts_symbol_string: - SHIFT(29); + SHIFT(26); default: PARSE_ERROR(1, EXPECT({"string"})); } - case 29: - SET_LEX_STATE(17); + case 26: + SET_LEX_STATE(15); switch (LOOKAHEAD_SYM()) { case ts_symbol_5: - SHIFT(30); + SHIFT(27); default: PARSE_ERROR(1, EXPECT({"5"})); } - case 30: + case 27: SET_LEX_STATE(8); switch (LOOKAHEAD_SYM()) { case ts_symbol_number: - SHIFT(21); + SHIFT(19); case ts_symbol_string: - SHIFT(21); + SHIFT(19); case ts_symbol_array: - SHIFT(21); + SHIFT(19); case ts_symbol_object: - SHIFT(21); + SHIFT(19); case ts_symbol_4: - SHIFT(15); + SHIFT(8); case ts_symbol_value: - SHIFT(31); + SHIFT(28); case ts_symbol_1: - SHIFT(10); + SHIFT(3); default: PARSE_ERROR(7, EXPECT({"1", "value", "object", "array", "4", "string", "number"})); } - case 31: + case 28: + SET_LEX_STATE(2); + switch (LOOKAHEAD_SYM()) { + case ts_symbol_7: + SHIFT(14); + case ts_symbol_2: + SHIFT(29); + case ts_symbol_repeat_helper2: + SHIFT(29); + default: + PARSE_ERROR(3, EXPECT({"repeat_helper2", "2", "7"})); + } + case 29: SET_LEX_STATE(6); switch (LOOKAHEAD_SYM()) { case ts_symbol_6: - SHIFT(32); + SHIFT(30); default: PARSE_ERROR(1, EXPECT({"6"})); } - case 32: - SET_LEX_STATE(15); - switch (LOOKAHEAD_SYM()) { - case ts_symbol_7: - SHIFT(33); - default: - PARSE_ERROR(1, EXPECT({"7"})); - } - case 33: + case 30: SET_LEX_STATE(0); switch (LOOKAHEAD_SYM()) { case ts_symbol___END__: @@ -471,7 +475,7 @@ static TSParseResult ts_parse(const char *input) { default: PARSE_ERROR(1, EXPECT({"__END__"})); } - case 34: + case 31: SET_LEX_STATE(0); switch (LOOKAHEAD_SYM()) { case ts_symbol___END__: diff --git a/spec/runtime/json_spec.cpp b/spec/runtime/json_spec.cpp index c844e6f8..8eb810da 100644 --- a/spec/runtime/json_spec.cpp +++ b/spec/runtime/json_spec.cpp @@ -21,6 +21,9 @@ describe("json", []() { it("parses objects", [&]() { TSDocumentSetText(document, "{\"key1\":1}"); AssertThat(string(TSDocumentToString(document)), Equals("(value (object (4) (string) (5) (value (number)) (6) (7)))")); + + TSDocumentSetText(document, "{\"key1\":1,\"key2\":2}"); + AssertThat(string(TSDocumentToString(document)), Equals("(value (object (4) (string) (5) (value (number)) (6) (7)))")); }); }); diff --git a/src/compiler/build_tables/item.cpp b/src/compiler/build_tables/item.cpp index c2a9a309..4ac701a1 100644 --- a/src/compiler/build_tables/item.cpp +++ b/src/compiler/build_tables/item.cpp @@ -40,6 +40,7 @@ namespace tree_sitter { bool LexItem::operator<(const LexItem &other) const { if (rule_name < other.rule_name) return true; + if (rule_name > other.rule_name) return false; if (rule->to_string() < other.rule->to_string()) return true; return false; } @@ -50,6 +51,7 @@ namespace tree_sitter { if (rule->to_string() < other.rule->to_string()) return true; if (rule->to_string() > other.rule->to_string()) return false; if (consumed_sym_count < other.consumed_sym_count) return true; + if (consumed_sym_count > other.consumed_sym_count) return false; if (lookahead_sym_name < other.lookahead_sym_name) return true; return false; } diff --git a/src/compiler/prepare_grammar/expand_repeats.cpp b/src/compiler/prepare_grammar/expand_repeats.cpp new file mode 100644 index 00000000..2405f8a8 --- /dev/null +++ b/src/compiler/prepare_grammar/expand_repeats.cpp @@ -0,0 +1,58 @@ +#include "expand_repeats.h" +#include + +using std::string; +using std::to_string; +using std::unordered_map; +using namespace tree_sitter::rules; + +namespace tree_sitter { + namespace prepare_grammar { + class RepeatExpander : rules::Visitor { + public: + rule_ptr value; + unordered_map aux_rules; + + rule_ptr apply(const rule_ptr rule) { + rule->accept(*this); + return value; + } + + rule_ptr make_repeat_helper(string name, const rule_ptr &rule) { + return seq({ + rule, + choice({ sym(name), blank() }) + }); + } + + void visit(const Repeat *rule) { + rule_ptr inner_rule = apply(rule->content); + string helper_rule_name = string("repeat_helper") + to_string(aux_rules.size() + 1); + aux_rules.insert({ helper_rule_name, make_repeat_helper(helper_rule_name, inner_rule) }); + value = sym(helper_rule_name); + } + + void visit(const Seq *rule) { + value = seq({ apply(rule->left), apply(rule->right) }); + } + + void visit(const Choice *rule) { + value = choice({ apply(rule->left), apply(rule->right) }); + } + + void default_visit(const Rule *rule) { + value = rule->copy(); + } + }; + + Grammar expand_repeats(const Grammar &grammar) { + unordered_map result; + RepeatExpander visitor; + for (auto pair : grammar.rules) + result.insert({ pair.first, visitor.apply(pair.second) }); + for (auto pair : visitor.aux_rules) + result.insert(pair); + return Grammar(grammar.start_rule_name, result); + } + } +} \ No newline at end of file diff --git a/src/compiler/prepare_grammar/expand_repeats.h b/src/compiler/prepare_grammar/expand_repeats.h new file mode 100644 index 00000000..69eef554 --- /dev/null +++ b/src/compiler/prepare_grammar/expand_repeats.h @@ -0,0 +1,12 @@ +#ifndef __tree_sitter__expand_repeats__ +#define __tree_sitter__expand_repeats__ + +#include "grammar.h" + +namespace tree_sitter { + namespace prepare_grammar { + Grammar expand_repeats(const Grammar &); + } +} + +#endif diff --git a/src/compiler/prepare_grammar/perform.cpp b/src/compiler/prepare_grammar/perform.cpp index 6bfd4bc5..9606ab76 100644 --- a/src/compiler/prepare_grammar/perform.cpp +++ b/src/compiler/prepare_grammar/perform.cpp @@ -1,12 +1,14 @@ #include "./perform.h" -#include "extract_tokens.h" +#include "./extract_tokens.h" +#include "./expand_repeats.h" using std::pair; namespace tree_sitter { namespace prepare_grammar { pair perform(const Grammar &input_grammar) { - return prepare_grammar::extract_tokens(input_grammar); + auto rule_grammar = expand_repeats(input_grammar); + return prepare_grammar::extract_tokens(rule_grammar); } } } diff --git a/tree_sitter.xcodeproj/project.pbxproj b/tree_sitter.xcodeproj/project.pbxproj index 78d36b43..21a02c40 100644 --- a/tree_sitter.xcodeproj/project.pbxproj +++ b/tree_sitter.xcodeproj/project.pbxproj @@ -14,7 +14,7 @@ 12130614182C3A1700FCF928 /* seq.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12130612182C3A1700FCF928 /* seq.cpp */; }; 12130617182C3D2900FCF928 /* string.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12130615182C3D2900FCF928 /* string.cpp */; }; 1214930E181E200B008E9BDA /* main.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 121492E9181E200B008E9BDA /* main.cpp */; }; - 1225CC6418765693000D4723 /* prepare_grammar_spec.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1225CC6318765693000D4723 /* prepare_grammar_spec.cpp */; }; + 1225CC6418765693000D4723 /* extract_tokens_spec.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1225CC6318765693000D4723 /* extract_tokens_spec.cpp */; }; 1251209B1830145300C9B56A /* rule.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1251209A1830145300C9B56A /* rule.cpp */; }; 125120A4183083BD00C9B56A /* arithmetic.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 125120A3183083BD00C9B56A /* arithmetic.cpp */; }; 12AB465F188BD03E00DE79DF /* follow_sets.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12AB465D188BD03E00DE79DF /* follow_sets.cpp */; }; @@ -25,6 +25,8 @@ 12E75A971891BD32001B8F10 /* json.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12E75A961891BD32001B8F10 /* json.cpp */; }; 12E75A9A1891BF57001B8F10 /* json.c in Sources */ = {isa = PBXBuildFile; fileRef = 12E75A981891BF3B001B8F10 /* json.c */; }; 12E75A9C1891C17D001B8F10 /* json_spec.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12E75A9B1891C17D001B8F10 /* json_spec.cpp */; }; + 12E75AA218930931001B8F10 /* expand_repeats.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12E75AA018930931001B8F10 /* expand_repeats.cpp */; }; + 12E75AA318930982001B8F10 /* expand_repeats_spec.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12E75A9E189308C4001B8F10 /* expand_repeats_spec.cpp */; }; 12EDCF8A187B498C005A7A07 /* tree_spec.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12EDCF89187B498C005A7A07 /* tree_spec.cpp */; }; 12EDCF8D187C6282005A7A07 /* document.c in Sources */ = {isa = PBXBuildFile; fileRef = 12EDCF8C187C6282005A7A07 /* document.c */; }; 12EDCF981881FCD5005A7A07 /* extract_tokens.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12EDCF8F1881FCCA005A7A07 /* extract_tokens.cpp */; }; @@ -95,7 +97,7 @@ 121492E9181E200B008E9BDA /* main.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = main.cpp; path = spec/main.cpp; sourceTree = SOURCE_ROOT; }; 121492EA181E200B008E9BDA /* rules_spec.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = rules_spec.cpp; path = spec/compiler/rules/rules_spec.cpp; sourceTree = SOURCE_ROOT; }; 121D8B3018795CC0003CF44B /* parser.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = parser.h; sourceTree = ""; }; - 1225CC6318765693000D4723 /* prepare_grammar_spec.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = prepare_grammar_spec.cpp; path = compiler/prepare_grammar_spec.cpp; sourceTree = ""; }; + 1225CC6318765693000D4723 /* extract_tokens_spec.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = extract_tokens_spec.cpp; path = prepare_grammar/extract_tokens_spec.cpp; sourceTree = ""; }; 1251209A1830145300C9B56A /* rule.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = rule.cpp; sourceTree = ""; }; 125120A218307FFD00C9B56A /* test_grammars.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = test_grammars.h; path = spec/fixtures/grammars/test_grammars.h; sourceTree = SOURCE_ROOT; }; 125120A3183083BD00C9B56A /* arithmetic.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = arithmetic.cpp; path = spec/fixtures/grammars/arithmetic.cpp; sourceTree = SOURCE_ROOT; }; @@ -115,6 +117,9 @@ 12E75A961891BD32001B8F10 /* json.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = json.cpp; sourceTree = ""; }; 12E75A981891BF3B001B8F10 /* json.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = json.c; sourceTree = ""; }; 12E75A9B1891C17D001B8F10 /* json_spec.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = json_spec.cpp; sourceTree = ""; }; + 12E75A9E189308C4001B8F10 /* expand_repeats_spec.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = expand_repeats_spec.cpp; sourceTree = ""; }; + 12E75AA018930931001B8F10 /* expand_repeats.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = expand_repeats.cpp; path = src/compiler/prepare_grammar/expand_repeats.cpp; sourceTree = SOURCE_ROOT; }; + 12E75AA118930931001B8F10 /* expand_repeats.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = expand_repeats.h; path = src/compiler/prepare_grammar/expand_repeats.h; sourceTree = SOURCE_ROOT; }; 12EDCF89187B498C005A7A07 /* tree_spec.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = tree_spec.cpp; sourceTree = ""; }; 12EDCF8B187C6251005A7A07 /* document.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = document.h; sourceTree = ""; }; 12EDCF8C187C6282005A7A07 /* document.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = document.c; sourceTree = ""; }; @@ -243,8 +248,7 @@ 12BC470618830BC5005AC502 /* first_set_spec.cpp */, 12AB4660188CB3A300DE79DF /* item_set_closure_spec.cpp */, ); - name = build_tables; - path = compiler/build_tables; + path = build_tables; sourceTree = ""; }; 125120A118307FCA00C9B56A /* grammars */ = { @@ -318,9 +322,20 @@ path = spec; sourceTree = ""; }; + 12E75A9D1892D4F1001B8F10 /* prepare_grammar */ = { + isa = PBXGroup; + children = ( + 1225CC6318765693000D4723 /* extract_tokens_spec.cpp */, + 12E75A9E189308C4001B8F10 /* expand_repeats_spec.cpp */, + ); + name = prepare_grammar; + sourceTree = ""; + }; 12ED72A5186FC6D90089229B /* prepare_grammar */ = { isa = PBXGroup; children = ( + 12E75AA018930931001B8F10 /* expand_repeats.cpp */, + 12E75AA118930931001B8F10 /* expand_repeats.h */, 12EDCF8F1881FCCA005A7A07 /* extract_tokens.cpp */, 12EDCF901881FCCA005A7A07 /* extract_tokens.h */, 12EDCF911881FCCA005A7A07 /* perform.cpp */, @@ -384,12 +399,12 @@ 12FD40AE185EE6610041A84E /* compiler */ = { isa = PBXGroup; children = ( + 12E75A9D1892D4F1001B8F10 /* prepare_grammar */, 1213061C182C854F00FCF928 /* build_tables */, 12FD4063185E75290041A84E /* compile_fixtures.cpp */, - 1225CC6318765693000D4723 /* prepare_grammar_spec.cpp */, 12D1369F18357066005F3369 /* rules */, ); - name = compiler; + path = compiler; sourceTree = ""; }; 12FD40AF185EE81D0041A84E /* fixtures */ = { @@ -496,6 +511,7 @@ 12FD40D9185FEEDF0041A84E /* pattern_spec.cpp in Sources */, 12130617182C3D2900FCF928 /* string.cpp in Sources */, 12EDCFC018820880005A7A07 /* item_set_closure.cpp in Sources */, + 12E75AA218930931001B8F10 /* expand_repeats.cpp in Sources */, 12EDCFBD188205BF005A7A07 /* perform_spec.cpp in Sources */, 12EDCFC61882153D005A7A07 /* first_set.cpp in Sources */, 12130611182C3A1100FCF928 /* blank.cpp in Sources */, @@ -516,7 +532,8 @@ 1214930E181E200B008E9BDA /* main.cpp in Sources */, 12F9A651182DD6BC00FAF50C /* grammar.cpp in Sources */, 12D136A4183678A2005F3369 /* repeat.cpp in Sources */, - 1225CC6418765693000D4723 /* prepare_grammar_spec.cpp in Sources */, + 12E75AA318930982001B8F10 /* expand_repeats_spec.cpp in Sources */, + 1225CC6418765693000D4723 /* extract_tokens_spec.cpp in Sources */, 12EDCF9A1881FCD9005A7A07 /* search_for_symbols.cpp in Sources */, 12FD40F3186641C00041A84E /* char_match.cpp in Sources */, 12EDCFB21882039A005A7A07 /* perform.cpp in Sources */,