diff --git a/spec/compiler/build_tables/rule_transitions_spec.cpp b/spec/compiler/build_tables/rule_transitions_spec.cpp index 66f9b936..e977d401 100644 --- a/spec/compiler/build_tables/rule_transitions_spec.cpp +++ b/spec/compiler/build_tables/rule_transitions_spec.cpp @@ -30,6 +30,22 @@ describe("rule transitions", []() { }))); }); + it("handles choices between overlapping character sets", [&]() { + AssertThat( + char_transitions(choice({ + seq({ + character({ {'a', 's'} }), + sym("x") }), + seq({ + character({ { 'm', 'z' } }), + sym("y") }) })), + Equals(transition_map({ + { character({ {'a','l'} }), sym("x") }, + { character({ {'m','s'} }), choice({ sym("x"), sym("y") }) }, + { character({ {'t','z'} }), sym("y") }, + }))); + }); + it("handles sequences", [&]() { AssertThat( sym_transitions(seq({ symbol1, symbol2 })), diff --git a/spec/fixtures/grammars/json.cpp b/spec/fixtures/grammars/json.cpp index 549734d7..5efc608c 100644 --- a/spec/fixtures/grammars/json.cpp +++ b/spec/fixtures/grammars/json.cpp @@ -33,7 +33,13 @@ namespace test_grammars { str("["), comma_sep(sym("value")), str("]"), }) }, - { "string", pattern("\"[^\"]+\"") }, + { "string", seq({ + character('"'), + repeat(choice({ + pattern("[^\"]"), + str("\\\""), + })), + character('"') }) }, { "number", pattern("\\d+") } }); } diff --git a/spec/fixtures/parsers/arithmetic.c b/spec/fixtures/parsers/arithmetic.c index 542aafd7..40c991d2 100644 --- a/spec/fixtures/parsers/arithmetic.c +++ b/spec/fixtures/parsers/arithmetic.c @@ -2,28 +2,28 @@ #include enum ts_symbol { - ts_symbol_factor, - ts_aux_token1, ts_symbol_plus, - ts_aux_token2, + ts_symbol_factor, + ts_symbol_variable, + ts_symbol_term, + ts_symbol_expression, + ts_aux_token1, ts_symbol_number, ts_symbol_times, - ts_symbol_term, - ts_symbol_variable, - ts_symbol_expression, + ts_aux_token2, ts_symbol___END__, }; static const char *ts_symbol_names[] = { - "factor", - "token1", "plus", - "token2", + "factor", + "variable", + "term", + "expression", + "token1", "number", "times", - "term", - "variable", - "expression", + "token2", "__END__", }; @@ -31,58 +31,58 @@ static void ts_lex(TSParser *parser) { START_LEXER(); switch (LEX_STATE()) { case 0: - if ((LOOKAHEAD_CHAR() == '\0')) + if (LOOKAHEAD_CHAR() == '\0') ADVANCE(1); LEX_ERROR(1, EXPECT({""})); case 1: ACCEPT_TOKEN(ts_symbol___END__); case 2: - if ((LOOKAHEAD_CHAR() == '*')) + if (LOOKAHEAD_CHAR() == '*') ADVANCE(3); - if ((LOOKAHEAD_CHAR() == '\0')) + if (LOOKAHEAD_CHAR() == '\0') ADVANCE(1); LEX_ERROR(2, EXPECT({"", "*"})); case 3: ACCEPT_TOKEN(ts_symbol_times); case 4: - if ((LOOKAHEAD_CHAR() == ')')) + if (LOOKAHEAD_CHAR() == ')') ADVANCE(5); LEX_ERROR(1, EXPECT({")"})); case 5: ACCEPT_TOKEN(ts_aux_token2); case 6: - if ((LOOKAHEAD_CHAR() == ')')) + if (LOOKAHEAD_CHAR() == ')') ADVANCE(5); - if ((LOOKAHEAD_CHAR() == '*')) + if (LOOKAHEAD_CHAR() == '*') ADVANCE(3); LEX_ERROR(1, EXPECT({")-*"})); case 7: - if ((LOOKAHEAD_CHAR() == ')')) + if (LOOKAHEAD_CHAR() == ')') ADVANCE(5); - if ((LOOKAHEAD_CHAR() == '*')) + if (LOOKAHEAD_CHAR() == '*') ADVANCE(3); - if ((LOOKAHEAD_CHAR() == '+')) + if (LOOKAHEAD_CHAR() == '+') ADVANCE(8); LEX_ERROR(1, EXPECT({")-+"})); case 8: ACCEPT_TOKEN(ts_symbol_plus); case 9: - if ((LOOKAHEAD_CHAR() == ')')) + if (LOOKAHEAD_CHAR() == ')') ADVANCE(5); - if ((LOOKAHEAD_CHAR() == '+')) + if (LOOKAHEAD_CHAR() == '+') ADVANCE(8); LEX_ERROR(2, EXPECT({")", "+"})); case 10: if (('A' <= LOOKAHEAD_CHAR() && LOOKAHEAD_CHAR() <= 'Z') || ('a' <= LOOKAHEAD_CHAR() && LOOKAHEAD_CHAR() <= 'z')) ADVANCE(13); - if ((LOOKAHEAD_CHAR() == '(')) + if (LOOKAHEAD_CHAR() == '(') ADVANCE(12); - if (('0' <= LOOKAHEAD_CHAR() && LOOKAHEAD_CHAR() <= '9')) + if ('0' <= LOOKAHEAD_CHAR() && LOOKAHEAD_CHAR() <= '9') ADVANCE(11); LEX_ERROR(4, EXPECT({"(", "0-9", "A-Z", "a-z"})); case 11: - if (('0' <= LOOKAHEAD_CHAR() && LOOKAHEAD_CHAR() <= '9')) + if ('0' <= LOOKAHEAD_CHAR() && LOOKAHEAD_CHAR() <= '9') ADVANCE(11); ACCEPT_TOKEN(ts_symbol_number); case 12: @@ -93,17 +93,17 @@ static void ts_lex(TSParser *parser) { ADVANCE(13); ACCEPT_TOKEN(ts_symbol_variable); case 14: - if ((LOOKAHEAD_CHAR() == '+')) + if (LOOKAHEAD_CHAR() == '+') ADVANCE(8); - if ((LOOKAHEAD_CHAR() == '\0')) + if (LOOKAHEAD_CHAR() == '\0') ADVANCE(1); LEX_ERROR(2, EXPECT({"", "+"})); case 15: - if ((LOOKAHEAD_CHAR() == '*')) + if (LOOKAHEAD_CHAR() == '*') ADVANCE(3); - if ((LOOKAHEAD_CHAR() == '+')) + if (LOOKAHEAD_CHAR() == '+') ADVANCE(8); - if ((LOOKAHEAD_CHAR() == '\0')) + if (LOOKAHEAD_CHAR() == '\0') ADVANCE(1); LEX_ERROR(2, EXPECT({"", "*-+"})); default: @@ -118,16 +118,16 @@ static TSParseResult ts_parse(const char *input) { case 0: SET_LEX_STATE(10); switch (LOOKAHEAD_SYM()) { - case ts_symbol_factor: - SHIFT(45); case ts_aux_token1: SHIFT(42); case ts_symbol_number: SHIFT(41); - case ts_symbol_term: - SHIFT(2); + case ts_symbol_factor: + SHIFT(45); case ts_symbol_variable: SHIFT(41); + case ts_symbol_term: + SHIFT(2); case ts_symbol_expression: SHIFT(1); default: @@ -188,14 +188,14 @@ static TSParseResult ts_parse(const char *input) { case 6: SET_LEX_STATE(10); switch (LOOKAHEAD_SYM()) { - case ts_symbol_factor: - SHIFT(16); case ts_aux_token1: SHIFT(13); case ts_symbol_number: SHIFT(12); case ts_symbol_expression: SHIFT(32); + case ts_symbol_factor: + SHIFT(16); case ts_symbol_variable: SHIFT(12); case ts_symbol_term: @@ -250,14 +250,14 @@ static TSParseResult ts_parse(const char *input) { case 11: SET_LEX_STATE(10); switch (LOOKAHEAD_SYM()) { - case ts_symbol_factor: - SHIFT(16); case ts_aux_token1: SHIFT(13); case ts_symbol_number: SHIFT(12); case ts_symbol_expression: SHIFT(23); + case ts_symbol_factor: + SHIFT(16); case ts_symbol_variable: SHIFT(12); case ts_symbol_term: @@ -280,14 +280,14 @@ static TSParseResult ts_parse(const char *input) { case 13: SET_LEX_STATE(10); switch (LOOKAHEAD_SYM()) { - case ts_symbol_factor: - SHIFT(16); case ts_aux_token1: SHIFT(13); case ts_symbol_number: SHIFT(12); case ts_symbol_expression: SHIFT(14); + case ts_symbol_factor: + SHIFT(16); case ts_symbol_variable: SHIFT(12); case ts_symbol_term: @@ -354,14 +354,14 @@ static TSParseResult ts_parse(const char *input) { case 19: SET_LEX_STATE(10); switch (LOOKAHEAD_SYM()) { - case ts_symbol_factor: - SHIFT(16); case ts_aux_token1: SHIFT(13); case ts_symbol_number: SHIFT(12); case ts_symbol_expression: SHIFT(20); + case ts_symbol_factor: + SHIFT(16); case ts_symbol_variable: SHIFT(12); case ts_symbol_term: @@ -450,14 +450,14 @@ static TSParseResult ts_parse(const char *input) { case 28: SET_LEX_STATE(10); switch (LOOKAHEAD_SYM()) { - case ts_symbol_factor: - SHIFT(16); case ts_aux_token1: SHIFT(13); case ts_symbol_number: SHIFT(12); case ts_symbol_expression: SHIFT(29); + case ts_symbol_factor: + SHIFT(16); case ts_symbol_variable: SHIFT(12); case ts_symbol_term: @@ -542,14 +542,14 @@ static TSParseResult ts_parse(const char *input) { case 37: SET_LEX_STATE(10); switch (LOOKAHEAD_SYM()) { - case ts_symbol_factor: - SHIFT(16); case ts_aux_token1: SHIFT(13); case ts_symbol_number: SHIFT(12); case ts_symbol_expression: SHIFT(38); + case ts_symbol_factor: + SHIFT(16); case ts_symbol_variable: SHIFT(12); case ts_symbol_term: @@ -584,10 +584,10 @@ static TSParseResult ts_parse(const char *input) { case 41: SET_LEX_STATE(15); switch (LOOKAHEAD_SYM()) { - case ts_symbol_times: - REDUCE(ts_symbol_factor, 1, COLLAPSE({0})); case ts_symbol_plus: REDUCE(ts_symbol_factor, 1, COLLAPSE({0})); + case ts_symbol_times: + REDUCE(ts_symbol_factor, 1, COLLAPSE({0})); case ts_symbol___END__: REDUCE(ts_symbol_factor, 1, COLLAPSE({0})); default: @@ -596,14 +596,14 @@ static TSParseResult ts_parse(const char *input) { case 42: SET_LEX_STATE(10); switch (LOOKAHEAD_SYM()) { - case ts_symbol_factor: - SHIFT(16); case ts_aux_token1: SHIFT(13); case ts_symbol_number: SHIFT(12); case ts_symbol_expression: SHIFT(43); + case ts_symbol_factor: + SHIFT(16); case ts_symbol_variable: SHIFT(12); case ts_symbol_term: @@ -622,10 +622,10 @@ static TSParseResult ts_parse(const char *input) { case 44: SET_LEX_STATE(15); switch (LOOKAHEAD_SYM()) { - case ts_symbol_times: - REDUCE(ts_symbol_factor, 3, COLLAPSE({1, 0, 1})); case ts_symbol_plus: REDUCE(ts_symbol_factor, 3, COLLAPSE({1, 0, 1})); + case ts_symbol_times: + REDUCE(ts_symbol_factor, 3, COLLAPSE({1, 0, 1})); case ts_symbol___END__: REDUCE(ts_symbol_factor, 3, COLLAPSE({1, 0, 1})); default: @@ -634,10 +634,10 @@ static TSParseResult ts_parse(const char *input) { case 45: SET_LEX_STATE(15); switch (LOOKAHEAD_SYM()) { - case ts_symbol___END__: - REDUCE(ts_symbol_term, 1, COLLAPSE({0})); case ts_symbol_plus: REDUCE(ts_symbol_term, 1, COLLAPSE({0})); + case ts_symbol___END__: + REDUCE(ts_symbol_term, 1, COLLAPSE({0})); case ts_symbol_times: SHIFT(46); default: @@ -670,14 +670,14 @@ static TSParseResult ts_parse(const char *input) { case 48: SET_LEX_STATE(10); switch (LOOKAHEAD_SYM()) { - case ts_symbol_factor: - SHIFT(16); case ts_aux_token1: SHIFT(13); case ts_symbol_number: SHIFT(12); case ts_symbol_expression: SHIFT(49); + case ts_symbol_factor: + SHIFT(16); case ts_symbol_variable: SHIFT(12); case ts_symbol_term: diff --git a/spec/fixtures/parsers/json.c b/spec/fixtures/parsers/json.c index 2ba3939d..76e35a26 100644 --- a/spec/fixtures/parsers/json.c +++ b/spec/fixtures/parsers/json.c @@ -4,116 +4,154 @@ enum ts_symbol { ts_aux_token6, ts_symbol_number, - ts_symbol_object, - ts_aux_token5, - ts_aux_token7, - ts_aux_token4, - ts_aux_repeat_helper2, - ts_aux_token1, - ts_aux_token3, - ts_symbol_value, ts_symbol_string, - ts_aux_token2, + ts_aux_token3, + ts_aux_token5, ts_symbol_array, ts_aux_repeat_helper1, + ts_aux_token7, + ts_aux_token4, ts_symbol___END__, + ts_aux_token2, + ts_aux_repeat_helper2, + ts_aux_token1, + ts_symbol_object, + ts_symbol_value, }; static const char *ts_symbol_names[] = { "token6", "number", - "object", - "token5", - "token7", - "token4", - "repeat_helper2", - "token1", - "token3", - "value", "string", - "token2", + "token3", + "token5", "array", "repeat_helper1", + "token7", + "token4", "__END__", + "token2", + "repeat_helper2", + "token1", + "object", + "value", }; static void ts_lex(TSParser *parser) { START_LEXER(); switch (LEX_STATE()) { case 0: - if ((LOOKAHEAD_CHAR() == '\0')) + if (LOOKAHEAD_CHAR() == '\0') ADVANCE(1); LEX_ERROR(1, EXPECT({""})); case 1: ACCEPT_TOKEN(ts_symbol___END__); case 2: - if ((LOOKAHEAD_CHAR() == ',')) + if (LOOKAHEAD_CHAR() == ',') ADVANCE(3); ACCEPT_TOKEN(ts_aux_token3); case 3: ACCEPT_TOKEN(ts_aux_token2); case 4: - if ((LOOKAHEAD_CHAR() == ']')) + if (LOOKAHEAD_CHAR() == ']') ADVANCE(5); LEX_ERROR(1, EXPECT({"]"})); case 5: ACCEPT_TOKEN(ts_aux_token4); case 6: - if ((LOOKAHEAD_CHAR() == ']')) + if (LOOKAHEAD_CHAR() == ']') ADVANCE(5); - if ((LOOKAHEAD_CHAR() == ',')) + if (LOOKAHEAD_CHAR() == ',') ADVANCE(3); LEX_ERROR(2, EXPECT({",", "]"})); case 7: - if ((LOOKAHEAD_CHAR() == '}')) + if (LOOKAHEAD_CHAR() == '}') ADVANCE(8); LEX_ERROR(1, EXPECT({"}"})); case 8: ACCEPT_TOKEN(ts_aux_token7); case 9: - if ((LOOKAHEAD_CHAR() == '}')) + if (LOOKAHEAD_CHAR() == '}') ADVANCE(8); - if ((LOOKAHEAD_CHAR() == ',')) + if (LOOKAHEAD_CHAR() == ',') ADVANCE(3); LEX_ERROR(2, EXPECT({",", "}"})); case 10: - if ((LOOKAHEAD_CHAR() == '{')) - ADVANCE(16); - if ((LOOKAHEAD_CHAR() == '[')) - ADVANCE(15); - if ((LOOKAHEAD_CHAR() == '\"')) + if (LOOKAHEAD_CHAR() == '[') + ADVANCE(18); + if (LOOKAHEAD_CHAR() == '{') + ADVANCE(19); + if (LOOKAHEAD_CHAR() == '\"') ADVANCE(12); - if (('0' <= LOOKAHEAD_CHAR() && LOOKAHEAD_CHAR() <= '9')) + if ('0' <= LOOKAHEAD_CHAR() && LOOKAHEAD_CHAR() <= '9') ADVANCE(11); LEX_ERROR(4, EXPECT({"\"", "0-9", "[", "{"})); case 11: - if (('0' <= LOOKAHEAD_CHAR() && LOOKAHEAD_CHAR() <= '9')) + if ('0' <= LOOKAHEAD_CHAR() && LOOKAHEAD_CHAR() <= '9') ADVANCE(11); ACCEPT_TOKEN(ts_symbol_number); case 12: - if (!((LOOKAHEAD_CHAR() == '\"'))) + if (LOOKAHEAD_CHAR() == '\\') + ADVANCE(14); + if (']' <= LOOKAHEAD_CHAR() && LOOKAHEAD_CHAR() <= '\\') + ADVANCE(15); + if (!((LOOKAHEAD_CHAR() == '\"') || + (LOOKAHEAD_CHAR() == '\\'))) ADVANCE(13); LEX_ERROR(2, EXPECT({"-!", "#-"})); case 13: - if ((LOOKAHEAD_CHAR() == '\"')) + if (LOOKAHEAD_CHAR() == '\"') + ADVANCE(17); + if (LOOKAHEAD_CHAR() == '\\') ADVANCE(14); - if (!((LOOKAHEAD_CHAR() == '\"'))) + if (']' <= LOOKAHEAD_CHAR() && LOOKAHEAD_CHAR() <= '\\') + ADVANCE(15); + if (!((LOOKAHEAD_CHAR() == '\"') || + (LOOKAHEAD_CHAR() == '\\'))) ADVANCE(13); LEX_ERROR(1, EXPECT({""})); case 14: - ACCEPT_TOKEN(ts_symbol_string); + if (LOOKAHEAD_CHAR() == '\"') + ADVANCE(16); + if (LOOKAHEAD_CHAR() == '\\') + ADVANCE(14); + if (']' <= LOOKAHEAD_CHAR() && LOOKAHEAD_CHAR() <= '\\') + ADVANCE(15); + if (!((LOOKAHEAD_CHAR() == '\"') || + (LOOKAHEAD_CHAR() == '\\'))) + ADVANCE(13); + if ('#' <= LOOKAHEAD_CHAR() && LOOKAHEAD_CHAR() <= '\"') + ADVANCE(13); + LEX_ERROR(2, EXPECT({"", "#-\""})); case 15: - ACCEPT_TOKEN(ts_aux_token1); + if (LOOKAHEAD_CHAR() == '\"') + ADVANCE(13); + LEX_ERROR(1, EXPECT({"\""})); case 16: - ACCEPT_TOKEN(ts_aux_token5); + if (LOOKAHEAD_CHAR() == '\"') + ADVANCE(17); + if (LOOKAHEAD_CHAR() == '\\') + ADVANCE(14); + if (']' <= LOOKAHEAD_CHAR() && LOOKAHEAD_CHAR() <= '\\') + ADVANCE(15); + if (!((LOOKAHEAD_CHAR() == '\"') || + (LOOKAHEAD_CHAR() == '\\'))) + ADVANCE(13); + ACCEPT_TOKEN(ts_symbol_string); case 17: - if ((LOOKAHEAD_CHAR() == ':')) - ADVANCE(18); - LEX_ERROR(1, EXPECT({":"})); + ACCEPT_TOKEN(ts_symbol_string); case 18: - ACCEPT_TOKEN(ts_aux_token6); + ACCEPT_TOKEN(ts_aux_token1); case 19: - if ((LOOKAHEAD_CHAR() == '\"')) + ACCEPT_TOKEN(ts_aux_token5); + case 20: + if (LOOKAHEAD_CHAR() == ':') + ADVANCE(21); + LEX_ERROR(1, EXPECT({":"})); + case 21: + ACCEPT_TOKEN(ts_aux_token6); + case 22: + if (LOOKAHEAD_CHAR() == '\"') ADVANCE(12); LEX_ERROR(1, EXPECT({"\""})); default: @@ -132,16 +170,16 @@ static TSParseResult ts_parse(const char *input) { SHIFT(53); case ts_symbol_array: SHIFT(53); + case ts_symbol_value: + SHIFT(1); case ts_symbol_object: SHIFT(53); - case ts_symbol_number: - SHIFT(53); case ts_aux_token5: SHIFT(47); + case ts_symbol_number: + SHIFT(53); case ts_aux_token1: SHIFT(2); - case ts_symbol_value: - SHIFT(1); default: PARSE_PANIC(); } @@ -160,14 +198,14 @@ static TSParseResult ts_parse(const char *input) { SHIFT(25); case ts_symbol_array: SHIFT(25); + case ts_symbol_value: + SHIFT(44); case ts_symbol_object: SHIFT(25); - case ts_symbol_number: - SHIFT(25); case ts_aux_token5: SHIFT(12); - case ts_symbol_value: - SHIFT(44); + case ts_symbol_number: + SHIFT(25); case ts_aux_token1: SHIFT(3); default: @@ -180,14 +218,14 @@ static TSParseResult ts_parse(const char *input) { SHIFT(25); case ts_symbol_array: SHIFT(25); + case ts_symbol_value: + SHIFT(4); case ts_symbol_object: SHIFT(25); - case ts_symbol_number: - SHIFT(25); case ts_aux_token5: SHIFT(12); - case ts_symbol_value: - SHIFT(4); + case ts_symbol_number: + SHIFT(25); case ts_aux_token1: SHIFT(3); default: @@ -230,14 +268,14 @@ static TSParseResult ts_parse(const char *input) { SHIFT(43); case ts_symbol_array: SHIFT(43); - case ts_symbol_value: - SHIFT(41); case ts_symbol_object: SHIFT(43); - case ts_symbol_number: - SHIFT(43); + case ts_symbol_value: + SHIFT(41); case ts_aux_token5: SHIFT(35); + case ts_symbol_number: + SHIFT(43); case ts_aux_token1: SHIFT(8); default: @@ -250,14 +288,14 @@ static TSParseResult ts_parse(const char *input) { SHIFT(25); case ts_symbol_array: SHIFT(25); + case ts_symbol_value: + SHIFT(9); case ts_symbol_object: SHIFT(25); - case ts_symbol_number: - SHIFT(25); case ts_aux_token5: SHIFT(12); - case ts_symbol_value: - SHIFT(9); + case ts_symbol_number: + SHIFT(25); case ts_aux_token1: SHIFT(3); default: @@ -294,7 +332,7 @@ static TSParseResult ts_parse(const char *input) { PARSE_PANIC(); } case 12: - SET_LEX_STATE(19); + SET_LEX_STATE(22); switch (LOOKAHEAD_SYM()) { case ts_symbol_string: SHIFT(13); @@ -302,7 +340,7 @@ static TSParseResult ts_parse(const char *input) { PARSE_PANIC(); } case 13: - SET_LEX_STATE(17); + SET_LEX_STATE(20); switch (LOOKAHEAD_SYM()) { case ts_aux_token6: SHIFT(14); @@ -316,14 +354,14 @@ static TSParseResult ts_parse(const char *input) { SHIFT(25); case ts_symbol_array: SHIFT(25); - case ts_symbol_value: - SHIFT(15); case ts_symbol_object: SHIFT(25); - case ts_symbol_number: - SHIFT(25); + case ts_symbol_value: + SHIFT(15); case ts_aux_token5: SHIFT(12); + case ts_symbol_number: + SHIFT(25); case ts_aux_token1: SHIFT(3); default: @@ -360,7 +398,7 @@ static TSParseResult ts_parse(const char *input) { PARSE_PANIC(); } case 18: - SET_LEX_STATE(19); + SET_LEX_STATE(22); switch (LOOKAHEAD_SYM()) { case ts_symbol_string: SHIFT(19); @@ -368,7 +406,7 @@ static TSParseResult ts_parse(const char *input) { PARSE_PANIC(); } case 19: - SET_LEX_STATE(17); + SET_LEX_STATE(20); switch (LOOKAHEAD_SYM()) { case ts_aux_token6: SHIFT(20); @@ -382,14 +420,14 @@ static TSParseResult ts_parse(const char *input) { SHIFT(34); case ts_symbol_array: SHIFT(34); - case ts_symbol_value: - SHIFT(32); case ts_symbol_object: SHIFT(34); - case ts_symbol_number: - SHIFT(34); + case ts_symbol_value: + SHIFT(32); case ts_aux_token5: SHIFT(26); + case ts_symbol_number: + SHIFT(34); case ts_aux_token1: SHIFT(21); default: @@ -402,14 +440,14 @@ static TSParseResult ts_parse(const char *input) { SHIFT(25); case ts_symbol_array: SHIFT(25); + case ts_symbol_value: + SHIFT(22); case ts_symbol_object: SHIFT(25); - case ts_symbol_number: - SHIFT(25); case ts_aux_token5: SHIFT(12); - case ts_symbol_value: - SHIFT(22); + case ts_symbol_number: + SHIFT(25); case ts_aux_token1: SHIFT(3); default: @@ -456,7 +494,7 @@ static TSParseResult ts_parse(const char *input) { PARSE_PANIC(); } case 26: - SET_LEX_STATE(19); + SET_LEX_STATE(22); switch (LOOKAHEAD_SYM()) { case ts_symbol_string: SHIFT(27); @@ -464,7 +502,7 @@ static TSParseResult ts_parse(const char *input) { PARSE_PANIC(); } case 27: - SET_LEX_STATE(17); + SET_LEX_STATE(20); switch (LOOKAHEAD_SYM()) { case ts_aux_token6: SHIFT(28); @@ -478,14 +516,14 @@ static TSParseResult ts_parse(const char *input) { SHIFT(25); case ts_symbol_array: SHIFT(25); - case ts_symbol_value: - SHIFT(29); case ts_symbol_object: SHIFT(25); - case ts_symbol_number: - SHIFT(25); + case ts_symbol_value: + SHIFT(29); case ts_aux_token5: SHIFT(12); + case ts_symbol_number: + SHIFT(25); case ts_aux_token1: SHIFT(3); default: @@ -524,10 +562,10 @@ static TSParseResult ts_parse(const char *input) { case 32: SET_LEX_STATE(9); switch (LOOKAHEAD_SYM()) { - case ts_aux_token2: - SHIFT(18); case ts_aux_token7: REDUCE(ts_aux_repeat_helper1, 4, COLLAPSE({1, 0, 1, 0})); + case ts_aux_token2: + SHIFT(18); case ts_aux_repeat_helper1: SHIFT(33); default: @@ -552,7 +590,7 @@ static TSParseResult ts_parse(const char *input) { PARSE_PANIC(); } case 35: - SET_LEX_STATE(19); + SET_LEX_STATE(22); switch (LOOKAHEAD_SYM()) { case ts_symbol_string: SHIFT(36); @@ -560,7 +598,7 @@ static TSParseResult ts_parse(const char *input) { PARSE_PANIC(); } case 36: - SET_LEX_STATE(17); + SET_LEX_STATE(20); switch (LOOKAHEAD_SYM()) { case ts_aux_token6: SHIFT(37); @@ -574,14 +612,14 @@ static TSParseResult ts_parse(const char *input) { SHIFT(25); case ts_symbol_array: SHIFT(25); - case ts_symbol_value: - SHIFT(38); case ts_symbol_object: SHIFT(25); - case ts_symbol_number: - SHIFT(25); + case ts_symbol_value: + SHIFT(38); case ts_aux_token5: SHIFT(12); + case ts_symbol_number: + SHIFT(25); case ts_aux_token1: SHIFT(3); default: @@ -620,10 +658,10 @@ static TSParseResult ts_parse(const char *input) { case 41: SET_LEX_STATE(6); switch (LOOKAHEAD_SYM()) { - case ts_aux_token4: - REDUCE(ts_aux_repeat_helper2, 2, COLLAPSE({1, 0})); case ts_aux_token2: SHIFT(7); + case ts_aux_token4: + REDUCE(ts_aux_repeat_helper2, 2, COLLAPSE({1, 0})); case ts_aux_repeat_helper2: SHIFT(42); default: @@ -676,7 +714,7 @@ static TSParseResult ts_parse(const char *input) { PARSE_PANIC(); } case 47: - SET_LEX_STATE(19); + SET_LEX_STATE(22); switch (LOOKAHEAD_SYM()) { case ts_symbol_string: SHIFT(48); @@ -684,7 +722,7 @@ static TSParseResult ts_parse(const char *input) { PARSE_PANIC(); } case 48: - SET_LEX_STATE(17); + SET_LEX_STATE(20); switch (LOOKAHEAD_SYM()) { case ts_aux_token6: SHIFT(49); @@ -698,14 +736,14 @@ static TSParseResult ts_parse(const char *input) { SHIFT(25); case ts_symbol_array: SHIFT(25); - case ts_symbol_value: - SHIFT(50); case ts_symbol_object: SHIFT(25); - case ts_symbol_number: - SHIFT(25); + case ts_symbol_value: + SHIFT(50); case ts_aux_token5: SHIFT(12); + case ts_symbol_number: + SHIFT(25); case ts_aux_token1: SHIFT(3); default: diff --git a/spec/runtime/json_spec.cpp b/spec/runtime/json_spec.cpp index a415784d..b373dd06 100644 --- a/spec/runtime/json_spec.cpp +++ b/spec/runtime/json_spec.cpp @@ -14,7 +14,7 @@ describe("json", []() { }); it("parses strings", [&]() { - TSDocumentSetText(document, "\"this is a string\""); + TSDocumentSetText(document, "\"this is a \\\"string\\\" \""); AssertThat(string(TSDocumentToString(document)), Equals("(value (string))")); }); diff --git a/src/compiler/build_tables/item_set_transitions.cpp b/src/compiler/build_tables/item_set_transitions.cpp index d3ee0335..6e7d4928 100644 --- a/src/compiler/build_tables/item_set_transitions.cpp +++ b/src/compiler/build_tables/item_set_transitions.cpp @@ -1,8 +1,8 @@ #include "item_set_transitions.h" #include "item_set_closure.h" #include "rule_transitions.h" +#include "merge_transitions.h" -using std::dynamic_pointer_cast; using std::make_shared; using std::shared_ptr; @@ -19,7 +19,7 @@ namespace tree_sitter { transition_map char_transitions(const LexItemSet &item_set, const Grammar &grammar) { transition_map result; - for (LexItem item : item_set) { + for (const LexItem &item : item_set) { transition_map item_transitions; for (auto transition : char_transitions(item.rule)) { auto rule = transition.first; @@ -28,7 +28,7 @@ namespace tree_sitter { item_transitions.add(rule, make_shared(new_item_set)); } - result.merge(item_transitions, [](shared_ptr left, shared_ptr right) -> shared_ptr { + result = merge_char_transitions(result, item_transitions, [](shared_ptr left, shared_ptr right) { return make_shared(merge_sets(*left, *right)); }); } @@ -38,7 +38,7 @@ namespace tree_sitter { transition_map sym_transitions(const ParseItemSet &item_set, const Grammar &grammar) { transition_map result; - for (ParseItem item : item_set) { + for (const ParseItem &item : item_set) { transition_map item_transitions; for (auto transition : sym_transitions(item.rule)) { auto rule = transition.first; @@ -49,7 +49,7 @@ namespace tree_sitter { item_transitions.add(rule, make_shared(new_item_set)); } - result.merge(item_transitions, [](shared_ptr left, shared_ptr right) -> shared_ptr { + result = merge_sym_transitions(result, item_transitions, [](shared_ptr left, shared_ptr right) { return make_shared(merge_sets(*left, *right)); }); } diff --git a/src/compiler/build_tables/merge_transitions.h b/src/compiler/build_tables/merge_transitions.h new file mode 100644 index 00000000..8557402d --- /dev/null +++ b/src/compiler/build_tables/merge_transitions.h @@ -0,0 +1,56 @@ +#ifndef __tree_sitter__merge_transitions__ +#define __tree_sitter__merge_transitions__ + +#include "transition_map.h" +#include "character_set.h" +#include "symbol.h" + +namespace tree_sitter { + namespace build_tables { + template + transition_map + merge_sym_transitions(const transition_map &left, + const transition_map &right, + std::function(std::shared_ptr, std::shared_ptr)> merge_fn) { + transition_map result(left); + for (auto &pair : right) { + auto rule = pair.first; + bool merged = false; + for (auto &existing_pair : result) { + auto existing_rule = existing_pair.first; + if (existing_rule->operator==(*rule)) { + existing_pair.second = merge_fn(existing_pair.second, pair.second); + merged = true; + break; + } + } + if (!merged) + result.add(pair.first, pair.second); + } + return result; + } + + template + transition_map + merge_char_transitions(const transition_map &left, + const transition_map &right, + std::function(std::shared_ptr, std::shared_ptr)> merge_fn) { + transition_map result(left); + for (auto &pair : right) { + auto rule = pair.first; + for (auto &existing_pair : left) { + auto existing_rule = existing_pair.first; + auto intersection = existing_rule->remove_set(*rule); + if (!intersection.is_empty()) { + rule->remove_set(intersection); + result.add(std::make_shared(intersection), merge_fn(existing_pair.second, pair.second)); + } + } + result.add(rule, pair.second); + } + return result; + } + } +} + +#endif diff --git a/src/compiler/build_tables/rule_transitions.cpp b/src/compiler/build_tables/rule_transitions.cpp index 251ee0bc..aa61bdad 100644 --- a/src/compiler/build_tables/rule_transitions.cpp +++ b/src/compiler/build_tables/rule_transitions.cpp @@ -1,5 +1,6 @@ #include "rule_transitions.h" #include "rules.h" +#include "merge_transitions.h" using namespace tree_sitter::rules; @@ -9,6 +10,23 @@ namespace tree_sitter { return typeid(*rule) == typeid(Blank); } + template + transition_map merge_transitions(const transition_map &left, const transition_map &right); + + template<> + transition_map merge_transitions(const transition_map &left, const transition_map &right) { + return merge_char_transitions(left, right, [](rule_ptr left, rule_ptr right) -> rule_ptr { + return choice({ left, right }); + }); + } + + template<> + transition_map merge_transitions(const transition_map &left, const transition_map &right) { + return merge_sym_transitions(left, right, [](rule_ptr left, rule_ptr right) -> rule_ptr { + return choice({ left, right }); + }); + } + template class TransitionsVisitor : public rules::Visitor { public: @@ -23,7 +41,7 @@ namespace tree_sitter { void visit_atom(const Rule *rule) { auto atom = dynamic_cast(rule); if (atom) { - value = transition_map({{ std::make_shared(*atom), blank() }}); + value = transition_map({{ std::make_shared(*atom), blank() }}); } } @@ -37,9 +55,7 @@ namespace tree_sitter { void visit(const Choice *rule) { value = transitions(rule->left); - value.merge(transitions(rule->right), [&](rule_ptr left, rule_ptr right) -> rule_ptr { - return choice({ left, right }); - }); + value = merge_transitions(transitions(rule->left), transitions(rule->right)); } void visit(const Seq *rule) { @@ -50,9 +66,7 @@ namespace tree_sitter { return seq({ left_rule, rule->right }); }); if (rule_can_be_blank(rule->left)) { - value.merge(transitions(rule->right), [&](rule_ptr left, rule_ptr right) -> rule_ptr { - return choice({ left, right }); - }); + value = merge_transitions(value, transitions(rule->right)); } } diff --git a/src/compiler/build_tables/transition_map.h b/src/compiler/build_tables/transition_map.h index 46e097e0..a9311eb4 100644 --- a/src/compiler/build_tables/transition_map.h +++ b/src/compiler/build_tables/transition_map.h @@ -8,13 +8,14 @@ namespace tree_sitter { template class transition_map { - typedef std::shared_ptr TKeyPtr; - typedef std::shared_ptr TValuePtr; - typedef std::pair pair_type; + typedef std::shared_ptr TKeyPtr; + typedef std::shared_ptr TValuePtr; + typedef std::pair pair_type; typedef std::vector contents_type; + contents_type contents; + public: - transition_map() : contents(contents_type()) {}; transition_map(std::vector pairs) : contents(pairs) {}; @@ -33,15 +34,6 @@ namespace tree_sitter { contents.push_back(pair_type(key, value)); } - void merge(const transition_map &other, std::function merge_fn) { - for (pair_type other_pair : other) { - if (pair_type *current_pair = pair_for_key(*other_pair.first)) - current_pair->second = merge_fn(current_pair->second, other_pair.second); - else - add(other_pair.first, other_pair.second); - } - } - TValuePtr operator[](const TKey &key) const { for (auto pair : *this) { if (*pair.first == key) { @@ -52,7 +44,7 @@ namespace tree_sitter { } template - transition_map map(std::function(TValuePtr)> map_fn) { + transition_map map(std::function(TValuePtr)> map_fn) { transition_map result; for (pair_type pair : *this) { auto new_value = map_fn(pair.second); @@ -70,18 +62,6 @@ namespace tree_sitter { const_iterator begin() const { return contents.begin(); } const_iterator end() const { return contents.end(); } size_t size() const { return contents.size(); } - - private: - - pair_type * pair_for_key(const TKey &key) { - for (int i = 0; i < contents.size(); i++) { - pair_type *pair = &contents[i]; - if (*pair->first == key) return pair; - } - return NULL; - } - - contents_type contents; }; template diff --git a/src/compiler/generate_code/c_code.cpp b/src/compiler/generate_code/c_code.cpp index 41b6d925..bcb3bdd3 100644 --- a/src/compiler/generate_code/c_code.cpp +++ b/src/compiler/generate_code/c_code.cpp @@ -98,6 +98,8 @@ namespace tree_sitter { return "\\0"; case '"': return "\\\""; + case '\\': + return "\\\\"; default: return string() + character; } @@ -108,16 +110,20 @@ namespace tree_sitter { if (range.min == range.max) { return lookahead + " == '" + character_code(range.min) + "'"; } else { - return string("'") + range.min + string("' <= ") + lookahead + - " && " + lookahead + " <= '" + range.max + "'"; + return string("'") + character_code(range.min) + string("' <= ") + lookahead + + " && " + lookahead + " <= '" + character_code(range.max) + "'"; } } string condition_for_character_set(const rules::CharacterSet &set) { vector parts; - for (auto &match : set.ranges) - parts.push_back("(" + condition_for_character_range(match) + ")"); - return join(parts, " ||\n "); + if (set.ranges.size() == 1) { + return condition_for_character_range(*set.ranges.begin()); + } else { + for (auto &match : set.ranges) + parts.push_back("(" + condition_for_character_range(match) + ")"); + return join(parts, " ||\n "); + } } string condition_for_character_rule(const rules::CharacterSet &rule) { diff --git a/src/compiler/rules/character_set.cpp b/src/compiler/rules/character_set.cpp index 7db04b79..4d6bd7d6 100644 --- a/src/compiler/rules/character_set.cpp +++ b/src/compiler/rules/character_set.cpp @@ -154,6 +154,10 @@ namespace tree_sitter { return removed_set; } + bool CharacterSet::is_empty() const { + return ranges.empty(); + } + void CharacterSet::add_set(const CharacterSet &other) { for (auto &other_range : other.ranges) { add_range(this, other_range); diff --git a/src/compiler/rules/character_set.h b/src/compiler/rules/character_set.h index d8d33ff9..1870a232 100644 --- a/src/compiler/rules/character_set.h +++ b/src/compiler/rules/character_set.h @@ -38,6 +38,7 @@ namespace tree_sitter { CharacterSet complement() const; CharacterSet intersect(const CharacterSet &) const; std::pair most_compact_representation() const; + bool is_empty() const; void add_set(const CharacterSet &other); CharacterSet remove_set(const CharacterSet &other); @@ -51,7 +52,7 @@ namespace tree_sitter { std::set ranges; }; - typedef std::shared_ptr char_ptr; + typedef std::shared_ptr char_ptr; } } diff --git a/src/compiler/rules/rule.h b/src/compiler/rules/rule.h index 00014f07..7f583f74 100644 --- a/src/compiler/rules/rule.h +++ b/src/compiler/rules/rule.h @@ -8,7 +8,7 @@ namespace tree_sitter { class Visitor; class Rule; - typedef std::shared_ptr rule_ptr; + typedef std::shared_ptr rule_ptr; class Rule { public: diff --git a/src/compiler/rules/symbol.h b/src/compiler/rules/symbol.h index 7b4d1d8b..27e3358c 100644 --- a/src/compiler/rules/symbol.h +++ b/src/compiler/rules/symbol.h @@ -23,7 +23,7 @@ namespace tree_sitter { bool is_auxiliary; }; - typedef std::shared_ptr sym_ptr; + typedef std::shared_ptr sym_ptr; } } diff --git a/tree_sitter.xcodeproj/project.pbxproj b/tree_sitter.xcodeproj/project.pbxproj index 41f3d836..b6ffe7bd 100644 --- a/tree_sitter.xcodeproj/project.pbxproj +++ b/tree_sitter.xcodeproj/project.pbxproj @@ -101,6 +101,7 @@ 125120A218307FFD00C9B56A /* test_grammars.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = test_grammars.h; path = spec/fixtures/grammars/test_grammars.h; sourceTree = SOURCE_ROOT; }; 125120A3183083BD00C9B56A /* arithmetic.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = arithmetic.cpp; path = spec/fixtures/grammars/arithmetic.cpp; sourceTree = SOURCE_ROOT; }; 12661BF318A1505A00A259FB /* character_set_spec.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = character_set_spec.cpp; sourceTree = SOURCE_ROOT; }; + 127528AF18A6F9C6006B682B /* merge_transitions.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = merge_transitions.h; sourceTree = ""; }; 12AB465D188BD03E00DE79DF /* follow_sets.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = follow_sets.cpp; sourceTree = ""; }; 12AB465E188BD03E00DE79DF /* follow_sets.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = follow_sets.h; sourceTree = ""; }; 12AB4660188CB3A300DE79DF /* item_set_closure_spec.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = item_set_closure_spec.cpp; sourceTree = ""; }; @@ -228,6 +229,7 @@ 12EDCFBF18820880005A7A07 /* item_set_closure.h */, 12EDCFC118820A70005A7A07 /* item_set_transitions.cpp */, 12EDCFC218820A70005A7A07 /* item_set_transitions.h */, + 127528AF18A6F9C6006B682B /* merge_transitions.h */, 12EDCFA418820137005A7A07 /* perform.cpp */, 12EDCFA518820137005A7A07 /* perform.h */, 12EDCFA618820137005A7A07 /* rule_transitions.cpp */,