From d3d25f2683d5f175608430c4fe2bc36e999c253c Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 5 Feb 2014 18:56:04 -0800 Subject: [PATCH] Represent character sets as sets of character ranges --- character_set_spec.cpp | 51 ++++ spec/compiler/build_tables/perform_spec.cpp | 20 +- .../build_tables/rule_transitions_spec.cpp | 9 - spec/compiler/rules/pattern_spec.cpp | 12 +- spec/fixtures/parsers/arithmetic.c | 198 +++++++------- spec/fixtures/parsers/json.c | 256 +++++++++--------- .../build_tables/item_set_transitions.h | 2 +- src/compiler/generate_code/c_code.cpp | 52 ++-- src/compiler/lex_table.h | 2 +- src/compiler/rules/character.cpp | 84 ------ src/compiler/rules/character.h | 88 ------ src/compiler/rules/character_set.cpp | 128 +++++++++ src/compiler/rules/character_set.h | 64 +++++ src/compiler/rules/pattern.cpp | 47 ++-- src/compiler/rules/rules.cpp | 14 +- src/compiler/rules/rules.h | 7 +- tree_sitter.xcodeproj/project.pbxproj | 16 +- 17 files changed, 551 insertions(+), 499 deletions(-) create mode 100644 character_set_spec.cpp delete mode 100644 src/compiler/rules/character.cpp delete mode 100644 src/compiler/rules/character.h create mode 100644 src/compiler/rules/character_set.cpp create mode 100644 src/compiler/rules/character_set.h diff --git a/character_set_spec.cpp b/character_set_spec.cpp new file mode 100644 index 00000000..b01b9e13 --- /dev/null +++ b/character_set_spec.cpp @@ -0,0 +1,51 @@ +#include "spec_helper.h" +#include "rules.h" + +using namespace rules; + +START_TEST + +describe("character sets", []() { + describe("computing the complement", []() { + it("works for the set containing only the null character", []() { + CharacterSet set1({ '\0' }); + auto set2 = set1.complement(); + AssertThat(set2, Equals(CharacterSet({ + { 1, -1 }, + }, true))); + AssertThat(set2.complement(), Equals(set1)); + }); + + it("works for single character sets", []() { + CharacterSet set1({ 'b' }); + auto set2 = set1.complement(); + AssertThat(set2, Equals(CharacterSet({ + { 0, 'a' }, + { 'c', -1 }, + }))); + AssertThat(set2.complement(), Equals(set1)); + }); + }); + + describe("computing unions", []() { + it("works for disjoint sets", []() { + CharacterSet set({ {'a', 'z'} }, true); + set.union_with(CharacterSet({ {'A', 'Z'} }, true)); + AssertThat(set, Equals(CharacterSet({ {'a', 'z'}, {'A', 'Z'}, }))); + }); + + it("works for sets with adjacent ranges", []() { + CharacterSet set({ {'a', 'r'} }, true); + set.union_with(CharacterSet({ {'s', 'z'} }, true)); + AssertThat(set, Equals(CharacterSet({ {'a', 'z'} }, true))); + }); + + it("works when the result becomes a continuous range", []() { + CharacterSet set({ {'a', 'd'}, {'f', 'z'} }, true); + set.union_with(CharacterSet({ {'d', 'f'} }, true)); + AssertThat(set, Equals(CharacterSet({ {'a', 'z'} }, true))); + }); + }); +}); + +END_TEST \ No newline at end of file diff --git a/spec/compiler/build_tables/perform_spec.cpp b/spec/compiler/build_tables/perform_spec.cpp index e88c1863..2a694a8e 100644 --- a/spec/compiler/build_tables/perform_spec.cpp +++ b/spec/compiler/build_tables/perform_spec.cpp @@ -16,14 +16,6 @@ static unordered_set keys(const unordered_map &ma return result; } -static unordered_set keys(const unordered_map &map) { - unordered_set result; - for (auto pair : map) { - result.insert(pair.first); - } - return result; -} - START_TEST describe("building parse and lex tables", []() { @@ -79,16 +71,10 @@ describe("building parse and lex tables", []() { Symbol("left-paren"), }))); - AssertThat(keys(lex_state(0).actions), Equals(unordered_set({ - CharacterSet('('), - CharacterSet(CharClassDigit), - CharacterSet(CharClassWord), - }))); - AssertThat(lex_state(0).expected_inputs(), Equals(unordered_set({ - CharacterSet('('), - CharacterSet(CharClassDigit), - CharacterSet(CharClassWord), + CharacterSet({ '(' }, true), + CharacterSet({ {'0', '9'} }, true), + CharacterSet({ {'a', 'z'}, {'A', 'Z'} }, true), }))); }); diff --git a/spec/compiler/build_tables/rule_transitions_spec.cpp b/spec/compiler/build_tables/rule_transitions_spec.cpp index 54e2b26a..56606849 100644 --- a/spec/compiler/build_tables/rule_transitions_spec.cpp +++ b/spec/compiler/build_tables/rule_transitions_spec.cpp @@ -29,15 +29,6 @@ describe("rule transitions", []() { }))); }); - it("handles character classes", [&]() { - auto rule = character(CharClassDigit); - AssertThat( - rule_transitions(rule), - Equals(transition_map({ - { rule, blank() } - }))); - }); - it("handles choices", [&]() { AssertThat( rule_transitions(choice({ symbol1, symbol2 })), diff --git a/spec/compiler/rules/pattern_spec.cpp b/spec/compiler/rules/pattern_spec.cpp index f8197e97..f64ffeca 100644 --- a/spec/compiler/rules/pattern_spec.cpp +++ b/spec/compiler/rules/pattern_spec.cpp @@ -22,9 +22,9 @@ describe("parsing pattern rules", []() { AssertThat( rule.to_rule_tree(), EqualsPointer(seq({ - character(CharClassWord), + character({ {'a', 'z'}, {'A', 'Z'} }), character('-'), - character(CharClassDigit) + character({ {'0', '9'} }) }))); }); @@ -49,24 +49,24 @@ describe("parsing pattern rules", []() { }); it("parses character sets", []() { - Pattern rule("[abc]"); + Pattern rule("[aAeE]"); AssertThat( rule.to_rule_tree(), - EqualsPointer(character({ 'a', 'b', 'c' }, true))); + EqualsPointer(character({ 'a', 'A', 'e', 'E' }))); }); it("parses character ranges", []() { Pattern rule("[12a-dA-D3]"); AssertThat( rule.to_rule_tree(), - EqualsPointer(character({ '1', '2', CharacterRange({'a', 'd'}), CharacterRange({ 'A', 'D' }), '3' }, true))); + EqualsPointer(character({ {'1', '3'}, {'a', 'd'}, { 'A', 'D' }, }))); }); it("parses negated characters", []() { Pattern rule("[^a\\d]"); AssertThat( rule.to_rule_tree(), - EqualsPointer(character({ 'a', CharClassDigit }, false))); + EqualsPointer(character({ {'a'}, {'0', '9'} }, false))); }); it("parses backslashes", []() { diff --git a/spec/fixtures/parsers/arithmetic.c b/spec/fixtures/parsers/arithmetic.c index 32e4e567..7f5c2b57 100644 --- a/spec/fixtures/parsers/arithmetic.c +++ b/spec/fixtures/parsers/arithmetic.c @@ -2,28 +2,28 @@ #include enum ts_symbol { - ts_symbol_plus, - ts_symbol_number, - ts_symbol_variable, - ts_symbol_factor, - ts_symbol_times, ts_aux_token1, - ts_symbol_term, - ts_symbol_expression, + ts_symbol_plus, ts_aux_token2, + ts_symbol_variable, + ts_symbol_times, + ts_symbol_factor, + ts_symbol_term, + ts_symbol_number, + ts_symbol_expression, ts_symbol___END__, }; static const char *ts_symbol_names[] = { - "plus", - "number", - "variable", - "factor", - "times", "token1", - "term", - "expression", + "plus", "token2", + "variable", + "times", + "factor", + "term", + "number", + "expression", "__END__", }; @@ -33,7 +33,7 @@ static void ts_lex(TSParser *parser) { case 0: if ((LOOKAHEAD_CHAR() == '\0')) ADVANCE(1); - LEX_ERROR(1, EXPECT({""})); + LEX_ERROR(1, EXPECT({"\0"})); case 1: ACCEPT_TOKEN(ts_symbol___END__); case 2: @@ -41,13 +41,13 @@ static void ts_lex(TSParser *parser) { ADVANCE(3); if ((LOOKAHEAD_CHAR() == '\0')) ADVANCE(1); - LEX_ERROR(2, EXPECT({"'*'", ""})); + LEX_ERROR(2, EXPECT({"\0", "*"})); case 3: ACCEPT_TOKEN(ts_symbol_times); case 4: if ((LOOKAHEAD_CHAR() == ')')) ADVANCE(5); - LEX_ERROR(1, EXPECT({"')'"})); + LEX_ERROR(1, EXPECT({")"})); case 5: ACCEPT_TOKEN(ts_aux_token2); case 6: @@ -55,7 +55,7 @@ static void ts_lex(TSParser *parser) { ADVANCE(5); if ((LOOKAHEAD_CHAR() == '*')) ADVANCE(3); - LEX_ERROR(2, EXPECT({"')'", "'*'"})); + LEX_ERROR(1, EXPECT({")-*"})); case 7: if ((LOOKAHEAD_CHAR() == ')')) ADVANCE(5); @@ -63,7 +63,7 @@ static void ts_lex(TSParser *parser) { ADVANCE(3); if ((LOOKAHEAD_CHAR() == '+')) ADVANCE(8); - LEX_ERROR(3, EXPECT({"')'", "'*'", "'+'"})); + LEX_ERROR(1, EXPECT({")-+"})); case 8: ACCEPT_TOKEN(ts_symbol_plus); case 9: @@ -71,18 +71,18 @@ static void ts_lex(TSParser *parser) { ADVANCE(5); if ((LOOKAHEAD_CHAR() == '+')) ADVANCE(8); - LEX_ERROR(2, EXPECT({"')'", "'+'"})); + LEX_ERROR(2, EXPECT({")", "+"})); case 10: - if ((LOOKAHEAD_CHAR() == '(')) - ADVANCE(12); if (('A' <= LOOKAHEAD_CHAR() && LOOKAHEAD_CHAR() <= 'Z') || ('a' <= LOOKAHEAD_CHAR() && LOOKAHEAD_CHAR() <= 'z')) ADVANCE(13); - if ((isdigit(LOOKAHEAD_CHAR()))) + if ((LOOKAHEAD_CHAR() == '(')) + ADVANCE(12); + if (('0' <= LOOKAHEAD_CHAR() && LOOKAHEAD_CHAR() <= '9')) ADVANCE(11); - LEX_ERROR(4, EXPECT({"'A'-'Z'", "'('", "'a'-'z'", ""})); + LEX_ERROR(4, EXPECT({"(", "0-9", "A-Z", "a-z"})); case 11: - if ((isdigit(LOOKAHEAD_CHAR()))) + if (('0' <= LOOKAHEAD_CHAR() && LOOKAHEAD_CHAR() <= '9')) ADVANCE(11); ACCEPT_TOKEN(ts_symbol_number); case 12: @@ -97,7 +97,7 @@ static void ts_lex(TSParser *parser) { ADVANCE(8); if ((LOOKAHEAD_CHAR() == '\0')) ADVANCE(1); - LEX_ERROR(2, EXPECT({"'+'", ""})); + LEX_ERROR(2, EXPECT({"\0", "+"})); case 15: if ((LOOKAHEAD_CHAR() == '*')) ADVANCE(3); @@ -105,7 +105,7 @@ static void ts_lex(TSParser *parser) { ADVANCE(8); if ((LOOKAHEAD_CHAR() == '\0')) ADVANCE(1); - LEX_ERROR(3, EXPECT({"'*'", "'+'", ""})); + LEX_ERROR(2, EXPECT({"\0", "*-+"})); default: LEX_PANIC(); } @@ -118,20 +118,20 @@ static TSParseResult ts_parse(const char *input) { case 0: SET_LEX_STATE(10); switch (LOOKAHEAD_SYM()) { - case ts_symbol_factor: - SHIFT(45); - case ts_symbol_term: - SHIFT(2); case ts_aux_token1: SHIFT(42); + case ts_symbol_expression: + SHIFT(1); case ts_symbol_number: SHIFT(41); case ts_symbol_variable: SHIFT(41); - case ts_symbol_expression: - SHIFT(1); + case ts_symbol_factor: + SHIFT(45); + case ts_symbol_term: + SHIFT(2); default: - PARSE_ERROR(6, EXPECT({"expression", "variable", "number", "token1", "term", "factor"})); + PARSE_ERROR(6, EXPECT({"term", "factor", "variable", "number", "expression", "token1"})); } case 1: SET_LEX_STATE(0); @@ -188,20 +188,20 @@ static TSParseResult ts_parse(const char *input) { case 6: SET_LEX_STATE(10); switch (LOOKAHEAD_SYM()) { - case ts_symbol_factor: - SHIFT(16); - case ts_symbol_term: - SHIFT(7); case ts_aux_token1: SHIFT(13); + case ts_symbol_expression: + SHIFT(32); case ts_symbol_number: SHIFT(12); case ts_symbol_variable: SHIFT(12); - case ts_symbol_expression: - SHIFT(32); + case ts_symbol_factor: + SHIFT(16); + case ts_symbol_term: + SHIFT(7); default: - PARSE_ERROR(6, EXPECT({"expression", "variable", "number", "token1", "term", "factor"})); + PARSE_ERROR(6, EXPECT({"term", "factor", "variable", "number", "expression", "token1"})); } case 7: SET_LEX_STATE(9); @@ -250,50 +250,50 @@ static TSParseResult ts_parse(const char *input) { case 11: SET_LEX_STATE(10); switch (LOOKAHEAD_SYM()) { - case ts_symbol_factor: - SHIFT(16); - case ts_symbol_term: - SHIFT(7); case ts_aux_token1: SHIFT(13); + case ts_symbol_expression: + SHIFT(23); case ts_symbol_number: SHIFT(12); case ts_symbol_variable: SHIFT(12); - case ts_symbol_expression: - SHIFT(23); + case ts_symbol_factor: + SHIFT(16); + case ts_symbol_term: + SHIFT(7); default: - PARSE_ERROR(6, EXPECT({"expression", "variable", "number", "token1", "term", "factor"})); + PARSE_ERROR(6, EXPECT({"term", "factor", "variable", "number", "expression", "token1"})); } case 12: SET_LEX_STATE(7); switch (LOOKAHEAD_SYM()) { - case ts_aux_token2: - REDUCE(ts_symbol_factor, 1, COLLAPSE({0})); case ts_symbol_times: REDUCE(ts_symbol_factor, 1, COLLAPSE({0})); + case ts_aux_token2: + REDUCE(ts_symbol_factor, 1, COLLAPSE({0})); case ts_symbol_plus: REDUCE(ts_symbol_factor, 1, COLLAPSE({0})); default: - PARSE_ERROR(3, EXPECT({"plus", "times", "token2"})); + PARSE_ERROR(3, EXPECT({"plus", "token2", "times"})); } case 13: SET_LEX_STATE(10); switch (LOOKAHEAD_SYM()) { - case ts_symbol_factor: - SHIFT(16); - case ts_symbol_term: - SHIFT(7); case ts_aux_token1: SHIFT(13); + case ts_symbol_expression: + SHIFT(14); case ts_symbol_number: SHIFT(12); case ts_symbol_variable: SHIFT(12); - case ts_symbol_expression: - SHIFT(14); + case ts_symbol_factor: + SHIFT(16); + case ts_symbol_term: + SHIFT(7); default: - PARSE_ERROR(6, EXPECT({"expression", "variable", "number", "token1", "term", "factor"})); + PARSE_ERROR(6, EXPECT({"term", "factor", "variable", "number", "expression", "token1"})); } case 14: SET_LEX_STATE(4); @@ -306,14 +306,14 @@ static TSParseResult ts_parse(const char *input) { case 15: SET_LEX_STATE(7); switch (LOOKAHEAD_SYM()) { - case ts_aux_token2: - REDUCE(ts_symbol_factor, 3, COLLAPSE({1, 0, 1})); case ts_symbol_times: REDUCE(ts_symbol_factor, 3, COLLAPSE({1, 0, 1})); + case ts_aux_token2: + REDUCE(ts_symbol_factor, 3, COLLAPSE({1, 0, 1})); case ts_symbol_plus: REDUCE(ts_symbol_factor, 3, COLLAPSE({1, 0, 1})); default: - PARSE_ERROR(3, EXPECT({"plus", "times", "token2"})); + PARSE_ERROR(3, EXPECT({"plus", "token2", "times"})); } case 16: SET_LEX_STATE(7); @@ -354,20 +354,20 @@ static TSParseResult ts_parse(const char *input) { case 19: SET_LEX_STATE(10); switch (LOOKAHEAD_SYM()) { - case ts_symbol_factor: - SHIFT(16); - case ts_symbol_term: - SHIFT(7); case ts_aux_token1: SHIFT(13); + case ts_symbol_expression: + SHIFT(20); case ts_symbol_number: SHIFT(12); case ts_symbol_variable: SHIFT(12); - case ts_symbol_expression: - SHIFT(20); + case ts_symbol_factor: + SHIFT(16); + case ts_symbol_term: + SHIFT(7); default: - PARSE_ERROR(6, EXPECT({"expression", "variable", "number", "token1", "term", "factor"})); + PARSE_ERROR(6, EXPECT({"term", "factor", "variable", "number", "expression", "token1"})); } case 20: SET_LEX_STATE(4); @@ -450,20 +450,20 @@ static TSParseResult ts_parse(const char *input) { case 28: SET_LEX_STATE(10); switch (LOOKAHEAD_SYM()) { - case ts_symbol_factor: - SHIFT(16); - case ts_symbol_term: - SHIFT(7); case ts_aux_token1: SHIFT(13); + case ts_symbol_expression: + SHIFT(29); case ts_symbol_number: SHIFT(12); case ts_symbol_variable: SHIFT(12); - case ts_symbol_expression: - SHIFT(29); + case ts_symbol_factor: + SHIFT(16); + case ts_symbol_term: + SHIFT(7); default: - PARSE_ERROR(6, EXPECT({"expression", "variable", "number", "token1", "term", "factor"})); + PARSE_ERROR(6, EXPECT({"term", "factor", "variable", "number", "expression", "token1"})); } case 29: SET_LEX_STATE(4); @@ -542,20 +542,20 @@ static TSParseResult ts_parse(const char *input) { case 37: SET_LEX_STATE(10); switch (LOOKAHEAD_SYM()) { - case ts_symbol_factor: - SHIFT(16); - case ts_symbol_term: - SHIFT(7); case ts_aux_token1: SHIFT(13); + case ts_symbol_expression: + SHIFT(38); case ts_symbol_number: SHIFT(12); case ts_symbol_variable: SHIFT(12); - case ts_symbol_expression: - SHIFT(38); + case ts_symbol_factor: + SHIFT(16); + case ts_symbol_term: + SHIFT(7); default: - PARSE_ERROR(6, EXPECT({"expression", "variable", "number", "token1", "term", "factor"})); + PARSE_ERROR(6, EXPECT({"term", "factor", "variable", "number", "expression", "token1"})); } case 38: SET_LEX_STATE(4); @@ -596,20 +596,20 @@ static TSParseResult ts_parse(const char *input) { case 42: SET_LEX_STATE(10); switch (LOOKAHEAD_SYM()) { - case ts_symbol_factor: - SHIFT(16); - case ts_symbol_term: - SHIFT(7); case ts_aux_token1: SHIFT(13); + case ts_symbol_expression: + SHIFT(43); case ts_symbol_number: SHIFT(12); case ts_symbol_variable: SHIFT(12); - case ts_symbol_expression: - SHIFT(43); + case ts_symbol_factor: + SHIFT(16); + case ts_symbol_term: + SHIFT(7); default: - PARSE_ERROR(6, EXPECT({"expression", "variable", "number", "token1", "term", "factor"})); + PARSE_ERROR(6, EXPECT({"term", "factor", "variable", "number", "expression", "token1"})); } case 43: SET_LEX_STATE(4); @@ -634,14 +634,14 @@ static TSParseResult ts_parse(const char *input) { case 45: SET_LEX_STATE(15); switch (LOOKAHEAD_SYM()) { - case ts_symbol___END__: - REDUCE(ts_symbol_term, 1, COLLAPSE({0})); case ts_symbol_plus: REDUCE(ts_symbol_term, 1, COLLAPSE({0})); + case ts_symbol___END__: + REDUCE(ts_symbol_term, 1, COLLAPSE({0})); case ts_symbol_times: SHIFT(46); default: - PARSE_ERROR(3, EXPECT({"times", "plus", "__END__"})); + PARSE_ERROR(3, EXPECT({"times", "__END__", "plus"})); } case 46: SET_LEX_STATE(10); @@ -670,20 +670,20 @@ static TSParseResult ts_parse(const char *input) { case 48: SET_LEX_STATE(10); switch (LOOKAHEAD_SYM()) { - case ts_symbol_factor: - SHIFT(16); - case ts_symbol_term: - SHIFT(7); case ts_aux_token1: SHIFT(13); + case ts_symbol_expression: + SHIFT(49); case ts_symbol_number: SHIFT(12); case ts_symbol_variable: SHIFT(12); - case ts_symbol_expression: - SHIFT(49); + case ts_symbol_factor: + SHIFT(16); + case ts_symbol_term: + SHIFT(7); default: - PARSE_ERROR(6, EXPECT({"expression", "variable", "number", "token1", "term", "factor"})); + PARSE_ERROR(6, EXPECT({"term", "factor", "variable", "number", "expression", "token1"})); } case 49: SET_LEX_STATE(4); diff --git a/spec/fixtures/parsers/json.c b/spec/fixtures/parsers/json.c index bc358c60..ed4722ee 100644 --- a/spec/fixtures/parsers/json.c +++ b/spec/fixtures/parsers/json.c @@ -2,38 +2,38 @@ #include enum ts_symbol { - ts_symbol_string, - ts_symbol_array, - ts_aux_token3, - ts_symbol_object, - ts_aux_repeat_helper1, ts_aux_token6, ts_symbol_number, + ts_aux_repeat_helper1, ts_aux_token7, - ts_aux_token5, - ts_aux_token4, - ts_symbol___END__, - ts_aux_token2, ts_aux_repeat_helper2, + ts_aux_token5, ts_symbol_value, + ts_symbol___END__, + ts_aux_token3, + ts_aux_token2, + ts_symbol_string, + ts_symbol_object, + ts_aux_token4, + ts_symbol_array, ts_aux_token1, }; static const char *ts_symbol_names[] = { - "string", - "array", - "token3", - "object", - "repeat_helper1", "token6", "number", + "repeat_helper1", "token7", - "token5", - "token4", - "__END__", - "token2", "repeat_helper2", + "token5", "value", + "__END__", + "token3", + "token2", + "string", + "object", + "token4", + "array", "token1", }; @@ -43,7 +43,7 @@ static void ts_lex(TSParser *parser) { case 0: if ((LOOKAHEAD_CHAR() == '\0')) ADVANCE(1); - LEX_ERROR(1, EXPECT({""})); + LEX_ERROR(1, EXPECT({"\0"})); case 1: ACCEPT_TOKEN(ts_symbol___END__); case 2: @@ -55,7 +55,7 @@ static void ts_lex(TSParser *parser) { case 4: if ((LOOKAHEAD_CHAR() == ']')) ADVANCE(5); - LEX_ERROR(1, EXPECT({"']'"})); + LEX_ERROR(1, EXPECT({"]"})); case 5: ACCEPT_TOKEN(ts_aux_token4); case 6: @@ -63,11 +63,11 @@ static void ts_lex(TSParser *parser) { ADVANCE(5); if ((LOOKAHEAD_CHAR() == ',')) ADVANCE(3); - LEX_ERROR(2, EXPECT({"']'", "','"})); + LEX_ERROR(2, EXPECT({",", "]"})); case 7: if ((LOOKAHEAD_CHAR() == '}')) ADVANCE(8); - LEX_ERROR(1, EXPECT({"'}'"})); + LEX_ERROR(1, EXPECT({"}"})); case 8: ACCEPT_TOKEN(ts_aux_token7); case 9: @@ -75,31 +75,31 @@ static void ts_lex(TSParser *parser) { ADVANCE(8); if ((LOOKAHEAD_CHAR() == ',')) ADVANCE(3); - LEX_ERROR(2, EXPECT({"'}'", "','"})); + LEX_ERROR(2, EXPECT({",", "}"})); case 10: - if ((LOOKAHEAD_CHAR() == '{')) - ADVANCE(16); if ((LOOKAHEAD_CHAR() == '[')) ADVANCE(15); + if ((LOOKAHEAD_CHAR() == '{')) + ADVANCE(16); if ((LOOKAHEAD_CHAR() == '\"')) ADVANCE(12); - if ((isdigit(LOOKAHEAD_CHAR()))) + if (('0' <= LOOKAHEAD_CHAR() && LOOKAHEAD_CHAR() <= '9')) ADVANCE(11); - LEX_ERROR(4, EXPECT({"'{'", "'['", "'\"'", ""})); + LEX_ERROR(4, EXPECT({"\"", "0-9", "[", "{"})); case 11: - if ((isdigit(LOOKAHEAD_CHAR()))) + if (('0' <= LOOKAHEAD_CHAR() && LOOKAHEAD_CHAR() <= '9')) ADVANCE(11); ACCEPT_TOKEN(ts_symbol_number); case 12: if (!((LOOKAHEAD_CHAR() == '\"'))) ADVANCE(13); - LEX_ERROR(1, EXPECT({"'\"'"})); + LEX_ERROR(1, EXPECT({"<-!"})); case 13: if ((LOOKAHEAD_CHAR() == '\"')) ADVANCE(14); if (!((LOOKAHEAD_CHAR() == '\"'))) ADVANCE(13); - LEX_ERROR(1, EXPECT({"'\"'"})); + LEX_ERROR(1, EXPECT({"<-\""})); case 14: ACCEPT_TOKEN(ts_symbol_string); case 15: @@ -109,13 +109,13 @@ static void ts_lex(TSParser *parser) { case 17: if ((LOOKAHEAD_CHAR() == ':')) ADVANCE(18); - LEX_ERROR(1, EXPECT({"':'"})); + LEX_ERROR(1, EXPECT({":"})); case 18: ACCEPT_TOKEN(ts_aux_token6); case 19: if ((LOOKAHEAD_CHAR() == '\"')) ADVANCE(12); - LEX_ERROR(1, EXPECT({"'\"'"})); + LEX_ERROR(1, EXPECT({"\""})); default: LEX_PANIC(); } @@ -128,22 +128,22 @@ static TSParseResult ts_parse(const char *input) { case 0: SET_LEX_STATE(10); switch (LOOKAHEAD_SYM()) { - case ts_symbol_number: - SHIFT(53); case ts_symbol_string: SHIFT(53); + case ts_aux_token1: + SHIFT(2); + case ts_symbol_object: + SHIFT(53); + case ts_symbol_value: + SHIFT(1); + case ts_aux_token5: + SHIFT(47); + case ts_symbol_number: + SHIFT(53); case ts_symbol_array: SHIFT(53); - case ts_symbol_object: - SHIFT(53); - case ts_aux_token5: - SHIFT(47); - case ts_aux_token1: - SHIFT(2); - case ts_symbol_value: - SHIFT(1); default: - PARSE_ERROR(7, EXPECT({"value", "token1", "array", "string", "token5", "object", "number"})); + PARSE_ERROR(7, EXPECT({"array", "number", "token5", "value", "object", "token1", "string"})); } case 1: SET_LEX_STATE(0); @@ -156,42 +156,42 @@ static TSParseResult ts_parse(const char *input) { case 2: SET_LEX_STATE(10); switch (LOOKAHEAD_SYM()) { - case ts_symbol_number: - SHIFT(25); case ts_symbol_string: SHIFT(25); - case ts_aux_token5: - SHIFT(12); + case ts_aux_token1: + SHIFT(3); case ts_symbol_object: SHIFT(25); + case ts_symbol_number: + SHIFT(25); case ts_symbol_array: SHIFT(25); case ts_symbol_value: SHIFT(44); - case ts_aux_token1: - SHIFT(3); + case ts_aux_token5: + SHIFT(12); default: - PARSE_ERROR(7, EXPECT({"token1", "value", "array", "string", "object", "token5", "number"})); + PARSE_ERROR(7, EXPECT({"token5", "value", "array", "number", "object", "token1", "string"})); } case 3: SET_LEX_STATE(10); switch (LOOKAHEAD_SYM()) { - case ts_symbol_number: - SHIFT(25); case ts_symbol_string: SHIFT(25); - case ts_aux_token5: - SHIFT(12); + case ts_aux_token1: + SHIFT(3); case ts_symbol_object: SHIFT(25); + case ts_symbol_number: + SHIFT(25); case ts_symbol_array: SHIFT(25); case ts_symbol_value: SHIFT(4); - case ts_aux_token1: - SHIFT(3); + case ts_aux_token5: + SHIFT(12); default: - PARSE_ERROR(7, EXPECT({"token1", "value", "array", "string", "object", "token5", "number"})); + PARSE_ERROR(7, EXPECT({"token5", "value", "array", "number", "object", "token1", "string"})); } case 4: SET_LEX_STATE(2); @@ -226,42 +226,42 @@ static TSParseResult ts_parse(const char *input) { case 7: SET_LEX_STATE(10); switch (LOOKAHEAD_SYM()) { - case ts_symbol_number: - SHIFT(43); case ts_symbol_string: SHIFT(43); - case ts_aux_token5: - SHIFT(35); + case ts_aux_token1: + SHIFT(8); case ts_symbol_object: SHIFT(43); + case ts_symbol_number: + SHIFT(43); case ts_symbol_array: SHIFT(43); case ts_symbol_value: SHIFT(41); - case ts_aux_token1: - SHIFT(8); + case ts_aux_token5: + SHIFT(35); default: - PARSE_ERROR(7, EXPECT({"token1", "value", "array", "string", "object", "token5", "number"})); + PARSE_ERROR(7, EXPECT({"token5", "value", "array", "number", "object", "token1", "string"})); } case 8: SET_LEX_STATE(10); switch (LOOKAHEAD_SYM()) { - case ts_symbol_number: - SHIFT(25); case ts_symbol_string: SHIFT(25); - case ts_aux_token5: - SHIFT(12); + case ts_aux_token1: + SHIFT(3); case ts_symbol_object: SHIFT(25); + case ts_symbol_number: + SHIFT(25); case ts_symbol_array: SHIFT(25); case ts_symbol_value: SHIFT(9); - case ts_aux_token1: - SHIFT(3); + case ts_aux_token5: + SHIFT(12); default: - PARSE_ERROR(7, EXPECT({"token1", "value", "array", "string", "object", "token5", "number"})); + PARSE_ERROR(7, EXPECT({"token5", "value", "array", "number", "object", "token1", "string"})); } case 9: SET_LEX_STATE(2); @@ -312,34 +312,34 @@ static TSParseResult ts_parse(const char *input) { case 14: SET_LEX_STATE(10); switch (LOOKAHEAD_SYM()) { - case ts_symbol_number: - SHIFT(25); case ts_symbol_string: SHIFT(25); - case ts_aux_token5: - SHIFT(12); + case ts_aux_token1: + SHIFT(3); case ts_symbol_object: SHIFT(25); + case ts_symbol_number: + SHIFT(25); case ts_symbol_array: SHIFT(25); case ts_symbol_value: SHIFT(15); - case ts_aux_token1: - SHIFT(3); + case ts_aux_token5: + SHIFT(12); default: - PARSE_ERROR(7, EXPECT({"token1", "value", "array", "string", "object", "token5", "number"})); + PARSE_ERROR(7, EXPECT({"token5", "value", "array", "number", "object", "token1", "string"})); } case 15: SET_LEX_STATE(2); switch (LOOKAHEAD_SYM()) { - case ts_aux_token3: - SHIFT(16); case ts_aux_token2: SHIFT(18); + case ts_aux_token3: + SHIFT(16); case ts_aux_repeat_helper1: SHIFT(16); default: - PARSE_ERROR(3, EXPECT({"repeat_helper1", "token2", "token3"})); + PARSE_ERROR(3, EXPECT({"repeat_helper1", "token3", "token2"})); } case 16: SET_LEX_STATE(7); @@ -378,42 +378,42 @@ static TSParseResult ts_parse(const char *input) { case 20: SET_LEX_STATE(10); switch (LOOKAHEAD_SYM()) { - case ts_symbol_number: - SHIFT(34); case ts_symbol_string: SHIFT(34); - case ts_aux_token5: - SHIFT(26); + case ts_aux_token1: + SHIFT(21); case ts_symbol_object: SHIFT(34); + case ts_symbol_number: + SHIFT(34); case ts_symbol_array: SHIFT(34); case ts_symbol_value: SHIFT(32); - case ts_aux_token1: - SHIFT(21); + case ts_aux_token5: + SHIFT(26); default: - PARSE_ERROR(7, EXPECT({"token1", "value", "array", "string", "object", "token5", "number"})); + PARSE_ERROR(7, EXPECT({"token5", "value", "array", "number", "object", "token1", "string"})); } case 21: SET_LEX_STATE(10); switch (LOOKAHEAD_SYM()) { - case ts_symbol_number: - SHIFT(25); case ts_symbol_string: SHIFT(25); - case ts_aux_token5: - SHIFT(12); + case ts_aux_token1: + SHIFT(3); case ts_symbol_object: SHIFT(25); + case ts_symbol_number: + SHIFT(25); case ts_symbol_array: SHIFT(25); case ts_symbol_value: SHIFT(22); - case ts_aux_token1: - SHIFT(3); + case ts_aux_token5: + SHIFT(12); default: - PARSE_ERROR(7, EXPECT({"token1", "value", "array", "string", "object", "token5", "number"})); + PARSE_ERROR(7, EXPECT({"token5", "value", "array", "number", "object", "token1", "string"})); } case 22: SET_LEX_STATE(2); @@ -474,34 +474,34 @@ static TSParseResult ts_parse(const char *input) { case 28: SET_LEX_STATE(10); switch (LOOKAHEAD_SYM()) { - case ts_symbol_number: - SHIFT(25); case ts_symbol_string: SHIFT(25); - case ts_aux_token5: - SHIFT(12); + case ts_aux_token1: + SHIFT(3); case ts_symbol_object: SHIFT(25); + case ts_symbol_number: + SHIFT(25); case ts_symbol_array: SHIFT(25); case ts_symbol_value: SHIFT(29); - case ts_aux_token1: - SHIFT(3); + case ts_aux_token5: + SHIFT(12); default: - PARSE_ERROR(7, EXPECT({"token1", "value", "array", "string", "object", "token5", "number"})); + PARSE_ERROR(7, EXPECT({"token5", "value", "array", "number", "object", "token1", "string"})); } case 29: SET_LEX_STATE(2); switch (LOOKAHEAD_SYM()) { - case ts_aux_token3: - SHIFT(30); case ts_aux_token2: SHIFT(18); + case ts_aux_token3: + SHIFT(30); case ts_aux_repeat_helper1: SHIFT(30); default: - PARSE_ERROR(3, EXPECT({"repeat_helper1", "token2", "token3"})); + PARSE_ERROR(3, EXPECT({"repeat_helper1", "token3", "token2"})); } case 30: SET_LEX_STATE(7); @@ -570,34 +570,34 @@ static TSParseResult ts_parse(const char *input) { case 37: SET_LEX_STATE(10); switch (LOOKAHEAD_SYM()) { - case ts_symbol_number: - SHIFT(25); case ts_symbol_string: SHIFT(25); - case ts_aux_token5: - SHIFT(12); + case ts_aux_token1: + SHIFT(3); case ts_symbol_object: SHIFT(25); + case ts_symbol_number: + SHIFT(25); case ts_symbol_array: SHIFT(25); case ts_symbol_value: SHIFT(38); - case ts_aux_token1: - SHIFT(3); + case ts_aux_token5: + SHIFT(12); default: - PARSE_ERROR(7, EXPECT({"token1", "value", "array", "string", "object", "token5", "number"})); + PARSE_ERROR(7, EXPECT({"token5", "value", "array", "number", "object", "token1", "string"})); } case 38: SET_LEX_STATE(2); switch (LOOKAHEAD_SYM()) { - case ts_aux_token3: - SHIFT(39); case ts_aux_token2: SHIFT(18); + case ts_aux_token3: + SHIFT(39); case ts_aux_repeat_helper1: SHIFT(39); default: - PARSE_ERROR(3, EXPECT({"repeat_helper1", "token2", "token3"})); + PARSE_ERROR(3, EXPECT({"repeat_helper1", "token3", "token2"})); } case 39: SET_LEX_STATE(7); @@ -620,14 +620,14 @@ static TSParseResult ts_parse(const char *input) { case 41: SET_LEX_STATE(6); switch (LOOKAHEAD_SYM()) { - case ts_aux_token4: - REDUCE(ts_aux_repeat_helper2, 2, COLLAPSE({1, 0})); case ts_aux_token2: SHIFT(7); + case ts_aux_token4: + REDUCE(ts_aux_repeat_helper2, 2, COLLAPSE({1, 0})); case ts_aux_repeat_helper2: SHIFT(42); default: - PARSE_ERROR(3, EXPECT({"repeat_helper2", "token2", "token4"})); + PARSE_ERROR(3, EXPECT({"repeat_helper2", "token4", "token2"})); } case 42: SET_LEX_STATE(4); @@ -694,34 +694,34 @@ static TSParseResult ts_parse(const char *input) { case 49: SET_LEX_STATE(10); switch (LOOKAHEAD_SYM()) { - case ts_symbol_number: - SHIFT(25); case ts_symbol_string: SHIFT(25); - case ts_aux_token5: - SHIFT(12); + case ts_aux_token1: + SHIFT(3); case ts_symbol_object: SHIFT(25); + case ts_symbol_number: + SHIFT(25); case ts_symbol_array: SHIFT(25); case ts_symbol_value: SHIFT(50); - case ts_aux_token1: - SHIFT(3); + case ts_aux_token5: + SHIFT(12); default: - PARSE_ERROR(7, EXPECT({"token1", "value", "array", "string", "object", "token5", "number"})); + PARSE_ERROR(7, EXPECT({"token5", "value", "array", "number", "object", "token1", "string"})); } case 50: SET_LEX_STATE(2); switch (LOOKAHEAD_SYM()) { - case ts_aux_token3: - SHIFT(51); case ts_aux_token2: SHIFT(18); + case ts_aux_token3: + SHIFT(51); case ts_aux_repeat_helper1: SHIFT(51); default: - PARSE_ERROR(3, EXPECT({"repeat_helper1", "token2", "token3"})); + PARSE_ERROR(3, EXPECT({"repeat_helper1", "token3", "token2"})); } case 51: SET_LEX_STATE(7); diff --git a/src/compiler/build_tables/item_set_transitions.h b/src/compiler/build_tables/item_set_transitions.h index 479ed174..fc0a52d6 100644 --- a/src/compiler/build_tables/item_set_transitions.h +++ b/src/compiler/build_tables/item_set_transitions.h @@ -1,7 +1,7 @@ #ifndef __tree_sitter__item_set_transitions__ #define __tree_sitter__item_set_transitions__ -#include "character.h" +#include "character_set.h" #include "symbol.h" #include "transition_map.h" #include "item.h" diff --git a/src/compiler/generate_code/c_code.cpp b/src/compiler/generate_code/c_code.cpp index 33e0e79a..eb36b4f5 100644 --- a/src/compiler/generate_code/c_code.cpp +++ b/src/compiler/generate_code/c_code.cpp @@ -8,6 +8,8 @@ using std::to_string; using std::unordered_map; using std::unordered_set; using std::vector; +using std::set; +using std::pair; namespace tree_sitter { namespace generate_code { @@ -101,33 +103,30 @@ namespace tree_sitter { } } - string condition_for_character_match(const rules::CharacterRange &match) { + string condition_for_character_range(const rules::CharacterRange &range) { string lookahead("LOOKAHEAD_CHAR()"); - auto value = match.value; - switch (match.type) { - case rules::CharacterRangeTypeClass: - switch (value.character_class) { - case rules::CharClassDigit: - return string("isdigit(") + lookahead + ")"; - case rules::CharClassWord: - return string("isalnum(") + lookahead + ")"; - } - case rules::CharacterRangeTypeSpecific: - return lookahead + " == '" + character_code(value.character) + "'"; - case rules::CharacterRangeTypeRange: - return string("'") + value.range.min_character + string("' <= ") + lookahead + - " && " + lookahead + " <= '" + value.range.max_character + "'"; + if (range.min == range.max) { + return lookahead + " == '" + character_code(range.min) + "'"; + } else { + return string("'") + range.min + string("' <= ") + lookahead + + " && " + lookahead + " <= '" + range.max + "'"; } } + string condition_for_character_set(const rules::CharacterSet &set) { + vector parts; + for (auto &match : set.ranges) + parts.push_back("(" + condition_for_character_range(match) + ")"); + return join(parts, " ||\n "); + } + string condition_for_character_rule(const rules::CharacterSet &rule) { vector parts; - for (auto &match : rule.ranges) { - parts.push_back("(" + condition_for_character_match(match) + ")"); - } - string result = join(parts, " ||\n "); - if (!rule.sign) result = "!(" + result + ")"; - return result; + pair representation = rule.most_compact_representation(); + if (representation.second) + return condition_for_character_set(representation.first); + else + return "!(" + condition_for_character_set(rule.complement()) + ")"; } string collapse_flags(vector flags) { @@ -177,17 +176,16 @@ namespace tree_sitter { } string lex_error_call(const unordered_set &expected_inputs) { - unordered_set expected_matches; + rules::CharacterSet expected_set; for (auto &rule : expected_inputs) - for (auto &match : rule.ranges) - expected_matches.insert(match); + expected_set.union_with(rule); - string result = "LEX_ERROR(" + to_string(expected_matches.size()) + ", EXPECT({"; + string result = "LEX_ERROR(" + to_string(expected_set.ranges.size()) + ", EXPECT({"; bool started = false; - for (auto match : expected_matches) { + for (auto &ranges : expected_set.ranges) { if (started) result += ", "; started = true; - result += "\"" + escape_string(match.to_string()) + "\""; + result += "\"" + escape_string(ranges.to_string()) + "\""; } result += "}));"; return result; diff --git a/src/compiler/lex_table.h b/src/compiler/lex_table.h index b60594c2..ae14d38d 100644 --- a/src/compiler/lex_table.h +++ b/src/compiler/lex_table.h @@ -6,7 +6,7 @@ #include #include #include "symbol.h" -#include "character.h" +#include "character_set.h" namespace tree_sitter { typedef enum { diff --git a/src/compiler/rules/character.cpp b/src/compiler/rules/character.cpp deleted file mode 100644 index 42f7f08a..00000000 --- a/src/compiler/rules/character.cpp +++ /dev/null @@ -1,84 +0,0 @@ -#include "rules.h" - -using std::string; -using std::hash; - -namespace tree_sitter { - namespace rules { - CharacterRange::CharacterRange(char character) : type(CharacterRangeTypeSpecific) { value.character = character; } - CharacterRange::CharacterRange(CharacterClass klass) : type(CharacterRangeTypeClass) { value.character_class = klass; } - CharacterRange::CharacterRange(const std::pair bounds) : type(CharacterRangeTypeRange) { - value.range.min_character = bounds.first; - value.range.max_character = bounds.second; - } - - bool CharacterRange::operator==(const CharacterRange &right) const { - if (type != right.type) - return false; - switch (type) { - case CharacterRangeTypeClass: - return (value.character_class == right.value.character_class); - case CharacterRangeTypeSpecific: - return (value.character == right.value.character); - case CharacterRangeTypeRange: - return (value.range.min_character == right.value.range.min_character && - value.range.max_character == right.value.range.max_character); - } - } - - string CharacterRange::to_string() const { - switch (type) { - case CharacterRangeTypeClass: - switch (value.character_class) { - case CharClassDigit: - return ""; - case CharClassWord: - return ""; - } - case CharacterRangeTypeSpecific: - return (value.character == '\0') ? - "" : - string("'") + value.character + "'"; - case CharacterRangeTypeRange: - return (string("'") + - value.range.min_character + "'-'" + - value.range.max_character + "'"); - } - } - - CharacterSet::CharacterSet(char character) : ranges({ CharacterRange(character) }), sign(true) {} - CharacterSet::CharacterSet(CharacterClass char_class) : ranges({ CharacterRange(char_class) }), sign(true) {} - CharacterSet::CharacterSet(const std::unordered_set &ranges, bool sign) : ranges(ranges), sign(sign) {} - - bool CharacterSet::operator==(const Rule &rule) const { - const CharacterSet *other = dynamic_cast(&rule); - return other && this->operator==(*other); - } - - bool CharacterSet::operator==(const CharacterSet &other) const { - if (other.sign != sign) return false; - if (other.ranges != ranges) return false; - return true; - } - - size_t CharacterSet::hash_code() const { - return typeid(this).hash_code() ^ hash()(to_string()); - } - - rule_ptr CharacterSet::copy() const { - return std::make_shared(*this); - } - - string CharacterSet::to_string() const { - string prefix("#"; - } - - void CharacterSet::accept(Visitor &visitor) const { - visitor.visit(this); - } - } -} diff --git a/src/compiler/rules/character.h b/src/compiler/rules/character.h deleted file mode 100644 index b12c28e2..00000000 --- a/src/compiler/rules/character.h +++ /dev/null @@ -1,88 +0,0 @@ -#ifndef __tree_sitter__character__ -#define __tree_sitter__character__ - -#include "rule.h" -#include - -namespace tree_sitter { - namespace rules { - typedef enum { - CharClassWord, - CharClassDigit - } CharacterClass; - - typedef enum { - CharacterRangeTypeSpecific, - CharacterRangeTypeClass, - CharacterRangeTypeRange, - } CharacterRangeType; - - struct CharacterRange { - CharacterRangeType type; - union { - CharacterClass character_class; - char character; - struct { - char min_character; - char max_character; - } range; - } value; - - CharacterRange(char); - CharacterRange(const std::pair); - CharacterRange(CharacterClass); - bool operator==(const CharacterRange &) const; - std::string to_string() const; - }; - } -} - -namespace std { - template<> - struct hash { - size_t operator()(const tree_sitter::rules::CharacterRange &match) const { - auto type = match.type; - auto result = hash()(type); - switch (type) { - case tree_sitter::rules::CharacterRangeTypeClass: - result ^= hash()(match.value.character_class); - case tree_sitter::rules::CharacterRangeTypeRange: - result ^= hash()(match.value.range.min_character); - result ^= hash()(match.value.range.max_character); - case tree_sitter::rules::CharacterRangeTypeSpecific: - result ^= hash()(match.value.character); - } - return result; - } - }; -} - -namespace tree_sitter { - namespace rules { - - class CharacterSet : public Rule { - public: - CharacterSet(char character); - CharacterSet(CharacterClass character_class); - CharacterSet(char min_character, char max_character); - CharacterSet(const std::unordered_set &matches, bool sign); - - bool operator==(const Rule& other) const; - bool operator==(const CharacterSet& other) const; - size_t hash_code() const; - rule_ptr copy() const; - std::string to_string() const; - void accept(Visitor &visitor) const; - - std::unordered_set ranges; - bool sign; - }; - } -} - -namespace std { - template<> - struct hash : hash {}; -} - -#endif diff --git a/src/compiler/rules/character_set.cpp b/src/compiler/rules/character_set.cpp new file mode 100644 index 00000000..7d7fe5ff --- /dev/null +++ b/src/compiler/rules/character_set.cpp @@ -0,0 +1,128 @@ +#include "rules.h" + +using std::string; +using std::hash; +using std::set; + +namespace tree_sitter { + namespace rules { + char MAX_CHAR = -1; + + CharacterRange::CharacterRange(char value) : min(value), max(value) {} + CharacterRange::CharacterRange(char min, char max) : min(min), max(max) {} + + bool CharacterRange::operator==(const CharacterRange &other) const { + return min == other.min && max == other.max; + } + + bool CharacterRange::operator<(const CharacterRange &other) const { + if (min < other.min) return true; + if (min > other.min) return false; + if (max < other.max) return true; + return false; + } + + string escape_character(char input) { + switch (input) { + case '\0': + return "\\0"; + default: + return string() + input; + } + } + + bool CharacterRange::is_adjacent(const CharacterRange &other) const { + return + (min <= other.min && max >= (other.min - 1)) || + (min <= (other.max + 1) && max >= other.max); + } + + void CharacterRange::add_range(const CharacterRange &other) { + if (other.min < min) min = other.min; + if (other.max > max) max = other.max; + } + + string CharacterRange::to_string() const { + if (min == max) { + return escape_character(min); + } else { + if (min == 0) + return string("<-") + max; + else if (max == MAX_CHAR) + return string() + min + "->"; + else + return string() + min + "-" + max; + } + } + + CharacterSet::CharacterSet() : ranges({}) {} + CharacterSet::CharacterSet(const set &ranges) : ranges(ranges) {} + CharacterSet::CharacterSet(const set &ranges, bool sign) : + ranges(sign ? ranges : CharacterSet(ranges).complement().ranges) {} + + bool CharacterSet::operator==(const Rule &rule) const { + const CharacterSet *other = dynamic_cast(&rule); + return other && (ranges == other->ranges); + } + + size_t CharacterSet::hash_code() const { + return typeid(this).hash_code() ^ hash()(to_string()); + } + + rule_ptr CharacterSet::copy() const { + return std::make_shared(*this); + } + + string CharacterSet::to_string() const { + string result("#"; + } + + CharacterSet CharacterSet::complement() const { + set result; + char current_char = 0; + for (auto &range : ranges) { + if (range.min != 0) + result.insert(CharacterRange(current_char, range.min - 1)); + current_char = range.max + 1; + } + if (current_char != 0) + result.insert(CharacterRange(current_char, MAX_CHAR)); + return CharacterSet(result); + } + + std::pair CharacterSet::most_compact_representation() const { + auto first_range = *ranges.begin(); + if (first_range.min == 0 && first_range.max > 0) { + return { this->complement(), false }; + } else { + return { *this, true }; + } + } + + void add_range(CharacterSet *self, CharacterRange new_range) { + set new_ranges; + for (auto range : self->ranges) { + if (range.is_adjacent(new_range)) { + new_range.add_range(range); + } else { + new_ranges.insert(range); + } + } + new_ranges.insert(new_range); + self->ranges = new_ranges; + } + + void CharacterSet::union_with(const CharacterSet &other) { + for (auto &other_range : other.ranges) { + add_range(this, other_range); + } + } + + void CharacterSet::accept(Visitor &visitor) const { + visitor.visit(this); + } + } +} diff --git a/src/compiler/rules/character_set.h b/src/compiler/rules/character_set.h new file mode 100644 index 00000000..5e472f30 --- /dev/null +++ b/src/compiler/rules/character_set.h @@ -0,0 +1,64 @@ +#ifndef __tree_sitter__character_set__ +#define __tree_sitter__character_set__ + +#include "rule.h" +#include + +namespace tree_sitter { + namespace rules { + struct CharacterRange { + char min; + char max; + + CharacterRange(char); + CharacterRange(char, char); + + bool operator==(const CharacterRange &) const; + bool operator<(const CharacterRange &) const; + bool is_adjacent(const CharacterRange &) const; + + void add_range(const CharacterRange &); + + std::string to_string() const; + }; + } +} + +namespace std { + template<> + struct hash { + size_t operator()(const tree_sitter::rules::CharacterRange &range) const { + return (hash()(range.min) ^ hash()(range.max)); + } + }; +} + +namespace tree_sitter { + namespace rules { + class CharacterSet : public Rule { + public: + CharacterSet(); + CharacterSet(const std::set &ranges); + CharacterSet(const std::set &ranges, bool); + + CharacterSet complement() const; + void union_with(const CharacterSet &other); + std::pair most_compact_representation() const; + + bool operator==(const Rule& other) const; + size_t hash_code() const; + rule_ptr copy() const; + std::string to_string() const; + void accept(Visitor &visitor) const; + + std::set ranges; + }; + } +} + +namespace std { + template<> + struct hash : hash {}; +} + +#endif diff --git a/src/compiler/rules/pattern.cpp b/src/compiler/rules/pattern.cpp index 38988fb0..1a600c93 100644 --- a/src/compiler/rules/pattern.cpp +++ b/src/compiler/rules/pattern.cpp @@ -2,6 +2,7 @@ using std::string; using std::hash; +using std::set; namespace tree_sitter { namespace rules { @@ -38,18 +39,6 @@ namespace tree_sitter { return result; } - rule_ptr char_set() { - bool is_affirmative = true; - if (peek() == '^') { - next(); - is_affirmative = false; - } - std::unordered_set matches; - while (has_more_input() && (peek() != ']')) - matches.insert(single_char()); - return character(matches, is_affirmative); - } - rule_ptr atom() { rule_ptr result; switch (peek()) { @@ -63,7 +52,7 @@ namespace tree_sitter { break; case '[': next(); - result = char_set(); + result = char_set().copy(); if (peek() != ']') error("mismatched square brackets"); else @@ -73,13 +62,25 @@ namespace tree_sitter { error("mismatched parens"); break; default: - result = character({ single_char() }, true); + result = single_char().copy(); } return result; } - CharacterRange single_char() { - CharacterRange value('\0'); + CharacterSet char_set() { + bool is_affirmative = true; + if (peek() == '^') { + next(); + is_affirmative = false; + } + CharacterSet result; + while (has_more_input() && (peek() != ']')) + result.union_with(single_char()); + return is_affirmative ? result : result.complement(); + } + + CharacterSet single_char() { + CharacterSet value({ '\0' }); switch (peek()) { case '\\': next(); @@ -91,28 +92,28 @@ namespace tree_sitter { next(); if (peek() == '-') { next(); - value = CharacterRange({ first_char, peek() }); + value = CharacterSet({ {first_char, peek()} }, true); next(); } else { - value = first_char; + value = CharacterSet({ first_char }); } } return value; } - CharacterRange escaped_char(char value) { + CharacterSet escaped_char(char value) { switch (value) { case '\\': case '(': case ')': - return value; + return CharacterSet({ value }); case 'w': - return CharClassWord; + return CharacterSet({{'a', 'z'}, {'A', 'Z'}}, true); case 'd': - return CharClassDigit; + return CharacterSet({{'0', '9'}}, true); default: error("unrecognized escape sequence"); - return '\0'; + return CharacterSet(); } } diff --git a/src/compiler/rules/rules.cpp b/src/compiler/rules/rules.cpp index 57ae749f..e0f29522 100644 --- a/src/compiler/rules/rules.cpp +++ b/src/compiler/rules/rules.cpp @@ -3,6 +3,7 @@ using std::make_shared; using std::string; using std::initializer_list; +using std::set; namespace tree_sitter { namespace rules { @@ -11,15 +12,16 @@ namespace tree_sitter { } rule_ptr character(char value) { - return make_shared(value); + set ranges = { value }; + return make_shared(ranges); } - rule_ptr character(CharacterClass value) { - return make_shared(value); + rule_ptr character(const set &ranges) { + return make_shared(ranges); } - - rule_ptr character(const std::unordered_set &matches, bool is_affirmative) { - return make_shared(matches, is_affirmative); + + rule_ptr character(const set &ranges, bool sign) { + return make_shared(ranges, sign); } rule_ptr choice(const initializer_list &rules) { diff --git a/src/compiler/rules/rules.h b/src/compiler/rules/rules.h index 80d2aa75..3129c72d 100644 --- a/src/compiler/rules/rules.h +++ b/src/compiler/rules/rules.h @@ -8,7 +8,7 @@ #include "seq.h" #include "string.h" #include "pattern.h" -#include "character.h" +#include "character_set.h" #include "repeat.h" #include "visitor.h" @@ -16,9 +16,8 @@ namespace tree_sitter { namespace rules { rule_ptr blank(); rule_ptr character(char value); - rule_ptr character(CharacterClass value); - rule_ptr character(const std::unordered_set &matches); - rule_ptr character(const std::unordered_set &matches, bool); + rule_ptr character(const std::set &matches); + rule_ptr character(const std::set &matches, bool); rule_ptr choice(const std::initializer_list &rules); rule_ptr pattern(const std::string &value); diff --git a/tree_sitter.xcodeproj/project.pbxproj b/tree_sitter.xcodeproj/project.pbxproj index a9a4eb15..41f3d836 100644 --- a/tree_sitter.xcodeproj/project.pbxproj +++ b/tree_sitter.xcodeproj/project.pbxproj @@ -7,7 +7,7 @@ objects = { /* Begin PBXBuildFile section */ - 12130605182C348F00FCF928 /* character.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12130603182C348F00FCF928 /* character.cpp */; }; + 12130605182C348F00FCF928 /* character_set.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12130603182C348F00FCF928 /* character_set.cpp */; }; 1213060B182C389100FCF928 /* symbol.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12130609182C389100FCF928 /* symbol.cpp */; }; 1213060E182C398300FCF928 /* choice.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1213060C182C398300FCF928 /* choice.cpp */; }; 12130611182C3A1100FCF928 /* blank.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1213060F182C3A1100FCF928 /* blank.cpp */; }; @@ -17,6 +17,7 @@ 1225CC6418765693000D4723 /* prepare_grammar_spec.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1225CC6318765693000D4723 /* prepare_grammar_spec.cpp */; }; 1251209B1830145300C9B56A /* rule.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1251209A1830145300C9B56A /* rule.cpp */; }; 125120A4183083BD00C9B56A /* arithmetic.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 125120A3183083BD00C9B56A /* arithmetic.cpp */; }; + 12661BF418A1505A00A259FB /* character_set_spec.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12661BF318A1505A00A259FB /* character_set_spec.cpp */; }; 12AB465F188BD03E00DE79DF /* follow_sets.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12AB465D188BD03E00DE79DF /* follow_sets.cpp */; }; 12AB4661188CB3A300DE79DF /* item_set_closure_spec.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12AB4660188CB3A300DE79DF /* item_set_closure_spec.cpp */; }; 12BC470518822B27005AC502 /* parse_config.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12BC470318822A17005AC502 /* parse_config.cpp */; }; @@ -79,8 +80,8 @@ /* End PBXCopyFilesBuildPhase section */ /* Begin PBXFileReference section */ - 12130603182C348F00FCF928 /* character.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = character.cpp; sourceTree = ""; }; - 12130604182C348F00FCF928 /* character.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = character.h; sourceTree = ""; }; + 12130603182C348F00FCF928 /* character_set.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = character_set.cpp; sourceTree = ""; }; + 12130604182C348F00FCF928 /* character_set.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = character_set.h; sourceTree = ""; }; 12130607182C374800FCF928 /* rule.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = rule.h; sourceTree = ""; }; 12130609182C389100FCF928 /* symbol.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = symbol.cpp; sourceTree = ""; }; 1213060A182C389100FCF928 /* symbol.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = symbol.h; sourceTree = ""; }; @@ -99,6 +100,7 @@ 1251209A1830145300C9B56A /* rule.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = rule.cpp; sourceTree = ""; }; 125120A218307FFD00C9B56A /* test_grammars.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = test_grammars.h; path = spec/fixtures/grammars/test_grammars.h; sourceTree = SOURCE_ROOT; }; 125120A3183083BD00C9B56A /* arithmetic.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = arithmetic.cpp; path = spec/fixtures/grammars/arithmetic.cpp; sourceTree = SOURCE_ROOT; }; + 12661BF318A1505A00A259FB /* character_set_spec.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = character_set_spec.cpp; sourceTree = SOURCE_ROOT; }; 12AB465D188BD03E00DE79DF /* follow_sets.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = follow_sets.cpp; sourceTree = ""; }; 12AB465E188BD03E00DE79DF /* follow_sets.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = follow_sets.h; sourceTree = ""; }; 12AB4660188CB3A300DE79DF /* item_set_closure_spec.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = item_set_closure_spec.cpp; sourceTree = ""; }; @@ -189,8 +191,8 @@ children = ( 1213060F182C3A1100FCF928 /* blank.cpp */, 12130610182C3A1100FCF928 /* blank.h */, - 12130603182C348F00FCF928 /* character.cpp */, - 12130604182C348F00FCF928 /* character.h */, + 12130603182C348F00FCF928 /* character_set.cpp */, + 12130604182C348F00FCF928 /* character_set.h */, 1213060C182C398300FCF928 /* choice.cpp */, 1213060D182C398300FCF928 /* choice.h */, 27A340F3EEB184C040521323 /* pattern.cpp */, @@ -269,6 +271,7 @@ children = ( 121492EA181E200B008E9BDA /* rules_spec.cpp */, 12D136A0183570F5005F3369 /* pattern_spec.cpp */, + 12661BF318A1505A00A259FB /* character_set_spec.cpp */, ); name = rules; path = compiler/rules; @@ -506,6 +509,7 @@ 12FD40F7186A16020041A84E /* lex_table.cpp in Sources */, 12AB4661188CB3A300DE79DF /* item_set_closure_spec.cpp in Sources */, 12FD40E918641FB70041A84E /* rules.cpp in Sources */, + 12661BF418A1505A00A259FB /* character_set_spec.cpp in Sources */, 12EDCF981881FCD5005A7A07 /* extract_tokens.cpp in Sources */, 12E75A971891BD32001B8F10 /* json.cpp in Sources */, 12FD4061185E68470041A84E /* c_code.cpp in Sources */, @@ -522,7 +526,7 @@ 12FD40E718639B910041A84E /* visitor.cpp in Sources */, 12EDCF991881FCD9005A7A07 /* perform.cpp in Sources */, 12EDCFBC188205BF005A7A07 /* rule_transitions_spec.cpp in Sources */, - 12130605182C348F00FCF928 /* character.cpp in Sources */, + 12130605182C348F00FCF928 /* character_set.cpp in Sources */, 12EDCFB418820519005A7A07 /* compile.cpp in Sources */, 12BC470718830BC5005AC502 /* first_set_spec.cpp in Sources */, 1213060B182C389100FCF928 /* symbol.cpp in Sources */,