From 7f62e752beb698c6a0041edc3c92f680e02db37f Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 29 Jan 2014 19:18:21 -0800 Subject: [PATCH] Allow Character rules to handle arbitrary character sets --- spec/compiler/build_tables/perform_spec.cpp | 20 +-- spec/fixtures/parsers/arithmetic.c | 110 ++++++------ spec/fixtures/parsers/json.c | 176 ++++++++++---------- src/compiler/build_tables/perform.cpp | 2 +- src/compiler/char_match.cpp | 62 ------- src/compiler/char_match.h | 52 ------ src/compiler/generate_code/c_code.cpp | 43 +++-- src/compiler/lex_table.cpp | 6 +- src/compiler/lex_table.h | 8 +- src/compiler/rules/character.cpp | 64 ++++++- src/compiler/rules/character.h | 63 ++++++- src/compiler/rules/rules.cpp | 6 +- src/compiler/rules/rules.h | 6 +- src/compiler/rules/symbol.cpp | 1 + todo.md | 6 +- tree_sitter.xcodeproj/project.pbxproj | 6 - 16 files changed, 322 insertions(+), 309 deletions(-) delete mode 100644 src/compiler/char_match.cpp delete mode 100644 src/compiler/char_match.h diff --git a/spec/compiler/build_tables/perform_spec.cpp b/spec/compiler/build_tables/perform_spec.cpp index 3d6256e0..d45345de 100644 --- a/spec/compiler/build_tables/perform_spec.cpp +++ b/spec/compiler/build_tables/perform_spec.cpp @@ -16,8 +16,8 @@ static unordered_set keys(const unordered_map &ma return result; } -static unordered_set keys(const unordered_map &map) { - unordered_set result; +static unordered_set keys(const unordered_map &map) { + unordered_set result; for (auto pair : map) { result.insert(pair.first); } @@ -79,16 +79,16 @@ describe("building parse and lex tables", []() { Symbol("left-paren"), }))); - AssertThat(keys(lex_state(0).actions), Equals(unordered_set({ - CharMatchSpecific('('), - CharMatchClass(CharClassDigit), - CharMatchClass(CharClassWord), + AssertThat(keys(lex_state(0).actions), Equals(unordered_set({ + Character('('), + Character(CharClassDigit), + Character(CharClassWord), }))); - AssertThat(lex_state(0).expected_inputs(), Equals(unordered_set({ - CharMatchSpecific('('), - CharMatchClass(CharClassDigit), - CharMatchClass(CharClassWord), + AssertThat(lex_state(0).expected_inputs(), Equals(unordered_set({ + Character('('), + Character(CharClassDigit), + Character(CharClassWord), }))); }); diff --git a/spec/fixtures/parsers/arithmetic.c b/spec/fixtures/parsers/arithmetic.c index 01ae8655..ec8f267e 100644 --- a/spec/fixtures/parsers/arithmetic.c +++ b/spec/fixtures/parsers/arithmetic.c @@ -3,28 +3,28 @@ enum ts_symbol { ts_symbol_factor, - ts_aux_token1, ts_aux_token2, - ts_symbol_number, ts_symbol_times, - ts_symbol___END__, + ts_aux_token1, + ts_symbol_variable, ts_symbol_term, ts_symbol_plus, ts_symbol_expression, - ts_symbol_variable, + ts_symbol_number, + ts_symbol___END__, }; static const char *ts_symbol_names[] = { "factor", - "token1", "token2", - "number", "times", - "__END__", + "token1", + "variable", "term", "plus", "expression", - "variable", + "number", + "__END__", }; static void ts_lex(TSParser *parser) { @@ -33,7 +33,7 @@ static void ts_lex(TSParser *parser) { case 0: if (LOOKAHEAD_CHAR() == '\0') ADVANCE(1); - LEX_ERROR(1, EXPECT({"''"})); + LEX_ERROR(1, EXPECT({""})); case 1: ACCEPT_TOKEN(ts_symbol___END__); case 2: @@ -41,7 +41,7 @@ static void ts_lex(TSParser *parser) { ADVANCE(3); if (LOOKAHEAD_CHAR() == '\0') ADVANCE(1); - LEX_ERROR(2, EXPECT({"''", "'*'"})); + LEX_ERROR(2, EXPECT({"'*'", ""})); case 3: ACCEPT_TOKEN(ts_symbol_times); case 4: @@ -55,7 +55,7 @@ static void ts_lex(TSParser *parser) { ADVANCE(5); if (LOOKAHEAD_CHAR() == '*') ADVANCE(3); - LEX_ERROR(2, EXPECT({"'*'", "')'"})); + LEX_ERROR(2, EXPECT({"')'", "'*'"})); case 7: if (LOOKAHEAD_CHAR() == ')') ADVANCE(5); @@ -63,7 +63,7 @@ static void ts_lex(TSParser *parser) { ADVANCE(3); if (LOOKAHEAD_CHAR() == '+') ADVANCE(8); - LEX_ERROR(3, EXPECT({"'+'", "'*'", "')'"})); + LEX_ERROR(3, EXPECT({"')'", "'*'", "'+'"})); case 8: ACCEPT_TOKEN(ts_symbol_plus); case 9: @@ -71,15 +71,15 @@ static void ts_lex(TSParser *parser) { ADVANCE(5); if (LOOKAHEAD_CHAR() == '+') ADVANCE(8); - LEX_ERROR(2, EXPECT({"'+'", "')'"})); + LEX_ERROR(2, EXPECT({"')'", "'+'"})); case 10: - if (isalnum(LOOKAHEAD_CHAR())) - ADVANCE(13); if (LOOKAHEAD_CHAR() == '(') ADVANCE(12); + if (isalnum(LOOKAHEAD_CHAR())) + ADVANCE(13); if (isdigit(LOOKAHEAD_CHAR())) ADVANCE(11); - LEX_ERROR(3, EXPECT({"", "'('", ""})); + LEX_ERROR(3, EXPECT({"", "'('", ""})); case 11: if (isdigit(LOOKAHEAD_CHAR())) ADVANCE(11); @@ -95,7 +95,7 @@ static void ts_lex(TSParser *parser) { ADVANCE(8); if (LOOKAHEAD_CHAR() == '\0') ADVANCE(1); - LEX_ERROR(2, EXPECT({"''", "'+'"})); + LEX_ERROR(2, EXPECT({"'+'", ""})); case 15: if (LOOKAHEAD_CHAR() == '*') ADVANCE(3); @@ -103,7 +103,7 @@ static void ts_lex(TSParser *parser) { ADVANCE(8); if (LOOKAHEAD_CHAR() == '\0') ADVANCE(1); - LEX_ERROR(3, EXPECT({"''", "'+'", "'*'"})); + LEX_ERROR(3, EXPECT({"'*'", "'+'", ""})); default: LEX_PANIC(); } @@ -118,18 +118,18 @@ static TSParseResult ts_parse(const char *input) { switch (LOOKAHEAD_SYM()) { case ts_symbol_factor: SHIFT(52); - case ts_symbol_number: - SHIFT(48); case ts_symbol_variable: SHIFT(47); - case ts_symbol_term: - SHIFT(2); case ts_aux_token1: SHIFT(49); + case ts_symbol_number: + SHIFT(48); + case ts_symbol_term: + SHIFT(2); case ts_symbol_expression: SHIFT(1); default: - PARSE_ERROR(6, EXPECT({"expression", "token1", "term", "variable", "number", "factor"})); + PARSE_ERROR(6, EXPECT({"expression", "variable", "token1", "term", "number", "factor"})); } case 1: SET_LEX_STATE(0); @@ -198,14 +198,14 @@ static TSParseResult ts_parse(const char *input) { switch (LOOKAHEAD_SYM()) { case ts_symbol_factor: SHIFT(19); - case ts_symbol_expression: - SHIFT(37); + case ts_symbol_variable: + SHIFT(14); case ts_aux_token1: SHIFT(16); case ts_symbol_number: SHIFT(15); - case ts_symbol_variable: - SHIFT(14); + case ts_symbol_expression: + SHIFT(37); case ts_symbol_term: SHIFT(8); default: @@ -270,14 +270,14 @@ static TSParseResult ts_parse(const char *input) { switch (LOOKAHEAD_SYM()) { case ts_symbol_factor: SHIFT(19); - case ts_symbol_expression: - SHIFT(27); + case ts_symbol_variable: + SHIFT(14); case ts_aux_token1: SHIFT(16); case ts_symbol_number: SHIFT(15); - case ts_symbol_variable: - SHIFT(14); + case ts_symbol_expression: + SHIFT(27); case ts_symbol_term: SHIFT(8); default: @@ -312,14 +312,14 @@ static TSParseResult ts_parse(const char *input) { switch (LOOKAHEAD_SYM()) { case ts_symbol_factor: SHIFT(19); - case ts_symbol_expression: - SHIFT(17); + case ts_symbol_variable: + SHIFT(14); case ts_aux_token1: SHIFT(16); case ts_symbol_number: SHIFT(15); - case ts_symbol_variable: - SHIFT(14); + case ts_symbol_expression: + SHIFT(17); case ts_symbol_term: SHIFT(8); default: @@ -396,14 +396,14 @@ static TSParseResult ts_parse(const char *input) { switch (LOOKAHEAD_SYM()) { case ts_symbol_factor: SHIFT(19); - case ts_symbol_expression: - SHIFT(24); + case ts_symbol_variable: + SHIFT(14); case ts_aux_token1: SHIFT(16); case ts_symbol_number: SHIFT(15); - case ts_symbol_variable: - SHIFT(14); + case ts_symbol_expression: + SHIFT(24); case ts_symbol_term: SHIFT(8); default: @@ -500,14 +500,14 @@ static TSParseResult ts_parse(const char *input) { switch (LOOKAHEAD_SYM()) { case ts_symbol_factor: SHIFT(19); - case ts_symbol_expression: - SHIFT(34); + case ts_symbol_variable: + SHIFT(14); case ts_aux_token1: SHIFT(16); case ts_symbol_number: SHIFT(15); - case ts_symbol_variable: - SHIFT(14); + case ts_symbol_expression: + SHIFT(34); case ts_symbol_term: SHIFT(8); default: @@ -600,14 +600,14 @@ static TSParseResult ts_parse(const char *input) { switch (LOOKAHEAD_SYM()) { case ts_symbol_factor: SHIFT(19); - case ts_symbol_expression: - SHIFT(44); + case ts_symbol_variable: + SHIFT(14); case ts_aux_token1: SHIFT(16); case ts_symbol_number: SHIFT(15); - case ts_symbol_variable: - SHIFT(14); + case ts_symbol_expression: + SHIFT(44); case ts_symbol_term: SHIFT(8); default: @@ -666,14 +666,14 @@ static TSParseResult ts_parse(const char *input) { switch (LOOKAHEAD_SYM()) { case ts_symbol_factor: SHIFT(19); - case ts_symbol_expression: - SHIFT(50); + case ts_symbol_variable: + SHIFT(14); case ts_aux_token1: SHIFT(16); case ts_symbol_number: SHIFT(15); - case ts_symbol_variable: - SHIFT(14); + case ts_symbol_expression: + SHIFT(50); case ts_symbol_term: SHIFT(8); default: @@ -750,14 +750,14 @@ static TSParseResult ts_parse(const char *input) { switch (LOOKAHEAD_SYM()) { case ts_symbol_factor: SHIFT(19); - case ts_symbol_expression: - SHIFT(57); + case ts_symbol_variable: + SHIFT(14); case ts_aux_token1: SHIFT(16); case ts_symbol_number: SHIFT(15); - case ts_symbol_variable: - SHIFT(14); + case ts_symbol_expression: + SHIFT(57); case ts_symbol_term: SHIFT(8); default: diff --git a/spec/fixtures/parsers/json.c b/spec/fixtures/parsers/json.c index 6b583e9b..f22b7830 100644 --- a/spec/fixtures/parsers/json.c +++ b/spec/fixtures/parsers/json.c @@ -2,38 +2,38 @@ #include enum ts_symbol { - ts_symbol_number, - ts_symbol_string, - ts_aux_repeat_helper1, - ts_aux_token7, ts_symbol_array, - ts_aux_token4, - ts_aux_token5, - ts_aux_token3, - ts_aux_token2, - ts_aux_token1, - ts_aux_repeat_helper2, ts_aux_token6, + ts_aux_repeat_helper2, + ts_aux_token5, + ts_symbol_string, ts_symbol_value, ts_symbol_object, + ts_aux_token4, + ts_aux_token7, + ts_symbol_number, + ts_aux_token2, + ts_aux_token3, + ts_aux_token1, + ts_aux_repeat_helper1, ts_symbol___END__, }; static const char *ts_symbol_names[] = { - "number", - "string", - "repeat_helper1", - "token7", "array", - "token4", - "token5", - "token3", - "token2", - "token1", - "repeat_helper2", "token6", + "repeat_helper2", + "token5", + "string", "value", "object", + "token4", + "token7", + "number", + "token2", + "token3", + "token1", + "repeat_helper1", "__END__", }; @@ -43,7 +43,7 @@ static void ts_lex(TSParser *parser) { case 0: if (LOOKAHEAD_CHAR() == '\0') ADVANCE(1); - LEX_ERROR(1, EXPECT({"''"})); + LEX_ERROR(1, EXPECT({""})); case 1: ACCEPT_TOKEN(ts_symbol___END__); case 2: @@ -63,7 +63,7 @@ static void ts_lex(TSParser *parser) { ADVANCE(5); if (LOOKAHEAD_CHAR() == ',') ADVANCE(3); - LEX_ERROR(2, EXPECT({"','", "']'"})); + LEX_ERROR(2, EXPECT({"']'", "','"})); case 7: if (LOOKAHEAD_CHAR() == '}') ADVANCE(8); @@ -75,17 +75,17 @@ static void ts_lex(TSParser *parser) { ADVANCE(8); if (LOOKAHEAD_CHAR() == ',') ADVANCE(3); - LEX_ERROR(2, EXPECT({"','", "'}'"})); + LEX_ERROR(2, EXPECT({"'}'", "','"})); case 10: - if (LOOKAHEAD_CHAR() == '{') - ADVANCE(16); if (LOOKAHEAD_CHAR() == '[') ADVANCE(15); if (LOOKAHEAD_CHAR() == '\"') ADVANCE(12); + if (LOOKAHEAD_CHAR() == '{') + ADVANCE(16); if (isdigit(LOOKAHEAD_CHAR())) ADVANCE(11); - LEX_ERROR(4, EXPECT({"", "'\"'", "'['", "'{'"})); + LEX_ERROR(4, EXPECT({"'['", "'\"'", "'{'", ""})); case 11: if (isdigit(LOOKAHEAD_CHAR())) ADVANCE(11); @@ -99,7 +99,7 @@ static void ts_lex(TSParser *parser) { ADVANCE(14); if (isalnum(LOOKAHEAD_CHAR())) ADVANCE(13); - LEX_ERROR(2, EXPECT({"", "'\"'"})); + LEX_ERROR(2, EXPECT({"'\"'", ""})); case 14: ACCEPT_TOKEN(ts_symbol_string); case 15: @@ -130,20 +130,20 @@ static TSParseResult ts_parse(const char *input) { switch (LOOKAHEAD_SYM()) { case ts_symbol_number: SHIFT(81); - case ts_aux_token5: - SHIFT(70); case ts_symbol_array: SHIFT(79); case ts_symbol_object: SHIFT(78); - case ts_aux_token1: - SHIFT(2); + case ts_aux_token5: + SHIFT(70); case ts_symbol_string: SHIFT(80); + case ts_aux_token1: + SHIFT(2); case ts_symbol_value: SHIFT(1); default: - PARSE_ERROR(7, EXPECT({"value", "string", "token1", "object", "array", "token5", "number"})); + PARSE_ERROR(7, EXPECT({"value", "token1", "string", "token5", "object", "array", "number"})); } case 1: SET_LEX_STATE(0); @@ -158,40 +158,40 @@ static TSParseResult ts_parse(const char *input) { switch (LOOKAHEAD_SYM()) { case ts_symbol_number: SHIFT(36); - case ts_aux_token5: - SHIFT(16); case ts_symbol_array: SHIFT(34); case ts_symbol_object: SHIFT(33); - case ts_symbol_string: - SHIFT(35); + case ts_aux_token5: + SHIFT(16); case ts_symbol_value: SHIFT(65); + case ts_symbol_string: + SHIFT(35); case ts_aux_token1: SHIFT(3); default: - PARSE_ERROR(7, EXPECT({"token1", "value", "string", "object", "array", "token5", "number"})); + PARSE_ERROR(7, EXPECT({"token1", "string", "value", "token5", "object", "array", "number"})); } case 3: SET_LEX_STATE(10); switch (LOOKAHEAD_SYM()) { case ts_symbol_number: SHIFT(36); - case ts_aux_token5: - SHIFT(16); case ts_symbol_array: SHIFT(34); case ts_symbol_object: SHIFT(33); - case ts_symbol_string: - SHIFT(35); + case ts_aux_token5: + SHIFT(16); case ts_symbol_value: SHIFT(4); + case ts_symbol_string: + SHIFT(35); case ts_aux_token1: SHIFT(3); default: - PARSE_ERROR(7, EXPECT({"token1", "value", "string", "object", "array", "token5", "number"})); + PARSE_ERROR(7, EXPECT({"token1", "string", "value", "token5", "object", "array", "number"})); } case 4: SET_LEX_STATE(2); @@ -246,40 +246,40 @@ static TSParseResult ts_parse(const char *input) { switch (LOOKAHEAD_SYM()) { case ts_symbol_number: SHIFT(64); - case ts_aux_token5: - SHIFT(51); case ts_symbol_array: SHIFT(62); case ts_symbol_object: SHIFT(61); - case ts_symbol_string: - SHIFT(63); case ts_symbol_value: SHIFT(59); + case ts_aux_token5: + SHIFT(51); + case ts_symbol_string: + SHIFT(63); case ts_aux_token1: SHIFT(10); default: - PARSE_ERROR(7, EXPECT({"token1", "value", "string", "object", "array", "token5", "number"})); + PARSE_ERROR(7, EXPECT({"token1", "string", "token5", "value", "object", "array", "number"})); } case 10: SET_LEX_STATE(10); switch (LOOKAHEAD_SYM()) { case ts_symbol_number: SHIFT(36); - case ts_aux_token5: - SHIFT(16); case ts_symbol_array: SHIFT(34); case ts_symbol_object: SHIFT(33); - case ts_symbol_string: - SHIFT(35); + case ts_aux_token5: + SHIFT(16); case ts_symbol_value: SHIFT(11); + case ts_symbol_string: + SHIFT(35); case ts_aux_token1: SHIFT(3); default: - PARSE_ERROR(7, EXPECT({"token1", "value", "string", "object", "array", "token5", "number"})); + PARSE_ERROR(7, EXPECT({"token1", "string", "value", "token5", "object", "array", "number"})); } case 11: SET_LEX_STATE(2); @@ -350,20 +350,20 @@ static TSParseResult ts_parse(const char *input) { switch (LOOKAHEAD_SYM()) { case ts_symbol_number: SHIFT(36); - case ts_aux_token5: - SHIFT(16); case ts_symbol_array: SHIFT(34); case ts_symbol_object: SHIFT(33); - case ts_symbol_string: - SHIFT(35); case ts_symbol_value: SHIFT(19); + case ts_aux_token5: + SHIFT(16); + case ts_symbol_string: + SHIFT(35); case ts_aux_token1: SHIFT(3); default: - PARSE_ERROR(7, EXPECT({"token1", "value", "string", "object", "array", "token5", "number"})); + PARSE_ERROR(7, EXPECT({"token1", "string", "token5", "value", "object", "array", "number"})); } case 19: SET_LEX_STATE(2); @@ -434,40 +434,40 @@ static TSParseResult ts_parse(const char *input) { switch (LOOKAHEAD_SYM()) { case ts_symbol_number: SHIFT(50); - case ts_aux_token5: - SHIFT(37); case ts_symbol_array: SHIFT(48); case ts_symbol_object: SHIFT(47); - case ts_symbol_string: - SHIFT(49); case ts_symbol_value: SHIFT(45); + case ts_aux_token5: + SHIFT(37); + case ts_symbol_string: + SHIFT(49); case ts_aux_token1: SHIFT(27); default: - PARSE_ERROR(7, EXPECT({"token1", "value", "string", "object", "array", "token5", "number"})); + PARSE_ERROR(7, EXPECT({"token1", "string", "token5", "value", "object", "array", "number"})); } case 27: SET_LEX_STATE(10); switch (LOOKAHEAD_SYM()) { case ts_symbol_number: SHIFT(36); - case ts_aux_token5: - SHIFT(16); case ts_symbol_array: SHIFT(34); case ts_symbol_object: SHIFT(33); - case ts_symbol_string: - SHIFT(35); + case ts_aux_token5: + SHIFT(16); case ts_symbol_value: SHIFT(28); + case ts_symbol_string: + SHIFT(35); case ts_aux_token1: SHIFT(3); default: - PARSE_ERROR(7, EXPECT({"token1", "value", "string", "object", "array", "token5", "number"})); + PARSE_ERROR(7, EXPECT({"token1", "string", "value", "token5", "object", "array", "number"})); } case 28: SET_LEX_STATE(2); @@ -578,20 +578,20 @@ static TSParseResult ts_parse(const char *input) { switch (LOOKAHEAD_SYM()) { case ts_symbol_number: SHIFT(36); - case ts_aux_token5: - SHIFT(16); case ts_symbol_array: SHIFT(34); case ts_symbol_object: SHIFT(33); - case ts_symbol_string: - SHIFT(35); case ts_symbol_value: SHIFT(40); + case ts_aux_token5: + SHIFT(16); + case ts_symbol_string: + SHIFT(35); case ts_aux_token1: SHIFT(3); default: - PARSE_ERROR(7, EXPECT({"token1", "value", "string", "object", "array", "token5", "number"})); + PARSE_ERROR(7, EXPECT({"token1", "string", "token5", "value", "object", "array", "number"})); } case 40: SET_LEX_STATE(2); @@ -644,14 +644,14 @@ static TSParseResult ts_parse(const char *input) { case 45: SET_LEX_STATE(9); switch (LOOKAHEAD_SYM()) { - case ts_aux_token2: - SHIFT(24); case ts_aux_token7: REDUCE(ts_aux_repeat_helper1, 4, COLLAPSE({1, 0, 1, 0})); + case ts_aux_token2: + SHIFT(24); case ts_aux_repeat_helper1: SHIFT(46); default: - PARSE_ERROR(3, EXPECT({"repeat_helper1", "token7", "token2"})); + PARSE_ERROR(3, EXPECT({"repeat_helper1", "token2", "token7"})); } case 46: SET_LEX_STATE(7); @@ -722,20 +722,20 @@ static TSParseResult ts_parse(const char *input) { switch (LOOKAHEAD_SYM()) { case ts_symbol_number: SHIFT(36); - case ts_aux_token5: - SHIFT(16); case ts_symbol_array: SHIFT(34); case ts_symbol_object: SHIFT(33); - case ts_symbol_string: - SHIFT(35); case ts_symbol_value: SHIFT(54); + case ts_aux_token5: + SHIFT(16); + case ts_symbol_string: + SHIFT(35); case ts_aux_token1: SHIFT(3); default: - PARSE_ERROR(7, EXPECT({"token1", "value", "string", "object", "array", "token5", "number"})); + PARSE_ERROR(7, EXPECT({"token1", "string", "token5", "value", "object", "array", "number"})); } case 54: SET_LEX_STATE(2); @@ -788,14 +788,14 @@ static TSParseResult ts_parse(const char *input) { case 59: SET_LEX_STATE(6); switch (LOOKAHEAD_SYM()) { - case ts_aux_token4: - REDUCE(ts_aux_repeat_helper2, 2, COLLAPSE({1, 0})); case ts_aux_token2: SHIFT(9); + case ts_aux_token4: + REDUCE(ts_aux_repeat_helper2, 2, COLLAPSE({1, 0})); case ts_aux_repeat_helper2: SHIFT(60); default: - PARSE_ERROR(3, EXPECT({"repeat_helper2", "token2", "token4"})); + PARSE_ERROR(3, EXPECT({"repeat_helper2", "token4", "token2"})); } case 60: SET_LEX_STATE(4); @@ -910,20 +910,20 @@ static TSParseResult ts_parse(const char *input) { switch (LOOKAHEAD_SYM()) { case ts_symbol_number: SHIFT(36); - case ts_aux_token5: - SHIFT(16); case ts_symbol_array: SHIFT(34); case ts_symbol_object: SHIFT(33); - case ts_symbol_string: - SHIFT(35); case ts_symbol_value: SHIFT(73); + case ts_aux_token5: + SHIFT(16); + case ts_symbol_string: + SHIFT(35); case ts_aux_token1: SHIFT(3); default: - PARSE_ERROR(7, EXPECT({"token1", "value", "string", "object", "array", "token5", "number"})); + PARSE_ERROR(7, EXPECT({"token1", "string", "token5", "value", "object", "array", "number"})); } case 73: SET_LEX_STATE(2); diff --git a/src/compiler/build_tables/perform.cpp b/src/compiler/build_tables/perform.cpp index ca323e33..61e29ffc 100644 --- a/src/compiler/build_tables/perform.cpp +++ b/src/compiler/build_tables/perform.cpp @@ -49,7 +49,7 @@ namespace tree_sitter { rules::Character rule = *transition.first; LexItemSet item_set = *transition.second; size_t new_state_index = add_lex_state(item_set); - lex_table.add_action(state_index, rule.value, LexAction::Advance(new_state_index)); + lex_table.add_action(state_index, rule, LexAction::Advance(new_state_index)); } } diff --git a/src/compiler/char_match.cpp b/src/compiler/char_match.cpp deleted file mode 100644 index fe9b5f47..00000000 --- a/src/compiler/char_match.cpp +++ /dev/null @@ -1,62 +0,0 @@ -#include "char_match.h" - -using std::string; - -namespace tree_sitter { - CharMatch CharMatchSpecific(char value) { - CharMatch result = { .type = CharMatchTypeSpecific }; - result.value.character = value; - return result; - } - - CharMatch CharMatchClass(CharClass value) { - CharMatch result = { .type = CharMatchTypeClass }; - result.value.character = value; - return result; - } - - CharMatch CharMatchRange(char min, char max) { - CharMatch result = { .type = CharMatchTypeRange }; - result.value.range.min_character = min; - result.value.range.max_character = max; - return result; - } - - string CharMatchToString(CharMatch match) { - switch (match.type) { - case CharMatchTypeClass: - switch (match.value.character_class) { - case CharClassDigit: - return ""; - case CharClassWord: - return ""; - } - case CharMatchTypeSpecific: - return string("'") + string(&match.value.character) + "'"; - case CharMatchTypeRange: - return ( - string("'") + - string(&match.value.range.min_character) + "'-'" + - string(&match.value.range.max_character) + "'"); - } - } - - bool operator==(const CharMatch &left, const CharMatch &right) { - if (left.type != right.type) - return false; - switch (left.type) { - case CharMatchTypeClass: - return (left.value.character_class == right.value.character_class); - case CharMatchTypeSpecific: - return (left.value.character == right.value.character); - case CharMatchTypeRange: - return ( - left.value.range.min_character == right.value.range.min_character && - left.value.range.max_character == right.value.range.max_character); - } - } - - std::ostream& operator<<(std::ostream& stream, const CharMatch &match) { - return stream << CharMatchToString(match); - } -} \ No newline at end of file diff --git a/src/compiler/char_match.h b/src/compiler/char_match.h deleted file mode 100644 index 4780d4c6..00000000 --- a/src/compiler/char_match.h +++ /dev/null @@ -1,52 +0,0 @@ -#ifndef __TreeSitter__char_match__ -#define __TreeSitter__char_match__ - -#include -#include - -namespace tree_sitter { - typedef enum { - CharMatchTypeSpecific, - CharMatchTypeClass, - CharMatchTypeRange, - } CharMatchType; - - typedef enum { - CharClassWord, - CharClassDigit - } CharClass; - - struct CharMatch { - CharMatchType type; - union { - CharClass character_class; - char character; - struct { - char min_character; - char max_character; - } range; - } value; - }; - - CharMatch CharMatchSpecific(char); - CharMatch CharMatchClass(CharClass); - CharMatch CharMatchRange(char, char); - std::string CharMatchToString(CharMatch); - - bool operator==(const CharMatch &, const CharMatch &); - std::ostream& operator<<(std::ostream& stream, const CharMatch &rule); -} - -namespace std { - template<> - struct hash { - size_t operator()(const tree_sitter::CharMatch &match) const { - return ( - hash()(match.type) ^ - hash()(match.value.range.min_character) ^ - hash()(match.value.range.max_character)); - } - }; -} - -#endif diff --git a/src/compiler/generate_code/c_code.cpp b/src/compiler/generate_code/c_code.cpp index b19dc33a..b1fce081 100644 --- a/src/compiler/generate_code/c_code.cpp +++ b/src/compiler/generate_code/c_code.cpp @@ -101,23 +101,33 @@ namespace tree_sitter { } } - string condition_for_char_match(const CharMatch &char_match) { + string condition_for_character_match(const rules::CharacterMatch &match) { auto value = "LOOKAHEAD_CHAR()"; - switch (char_match.type) { - case CharMatchTypeClass: - switch (char_match.value.character_class) { - case CharClassDigit: + switch (match.type) { + case rules::CharacterMatchTypeClass: + switch (match.value.character_class) { + case rules::CharClassDigit: return string("isdigit(") + value + ")"; - case CharClassWord: + case rules::CharClassWord: return string("isalnum(") + value + ")"; } - case CharMatchTypeSpecific: - return string(value) + " == '" + character_code(char_match.value.character) + "'"; + case rules::CharacterMatchTypeSpecific: + return string(value) + " == '" + character_code(match.value.character) + "'"; default: return ""; } } + string condition_for_character_rule(const rules::Character &rule) { + vector parts; + for (auto &match : rule.matches) { + parts.push_back(condition_for_character_match(match)); + } + string result = join(parts, " || "); + if (!rule.sign) result = "!(" + result + ")"; + return result; + } + string collapse_flags(vector flags) { string result; bool started = false; @@ -164,19 +174,24 @@ namespace tree_sitter { return input; } - string lex_error_call(const unordered_set &expected_inputs) { - string result = "LEX_ERROR(" + to_string(expected_inputs.size()) + ", EXPECT({"; + string lex_error_call(const unordered_set &expected_inputs) { + unordered_set expected_matches; + for (auto &rule : expected_inputs) + for (auto &match : rule.matches) + expected_matches.insert(match); + + string result = "LEX_ERROR(" + to_string(expected_matches.size()) + ", EXPECT({"; bool started = false; - for (auto match : expected_inputs) { + for (auto match : expected_matches) { if (started) result += ", "; started = true; - result += "\"" + escape_string(CharMatchToString(match)) + "\""; + result += "\"" + escape_string(match.to_string()) + "\""; } result += "}));"; return result; } - string code_for_lex_actions(const unordered_set &actions, const unordered_set &expected_inputs) { + string code_for_lex_actions(const unordered_set &actions, const unordered_set &expected_inputs) { auto action = actions.begin(); if (action == actions.end()) { return lex_error_call(expected_inputs); @@ -206,7 +221,7 @@ namespace tree_sitter { string result = ""; auto expected_inputs = parse_state.expected_inputs(); for (auto pair : parse_state.actions) - result += _if(condition_for_char_match(pair.first), code_for_lex_actions(pair.second, expected_inputs)); + result += _if(condition_for_character_rule(pair.first), code_for_lex_actions(pair.second, expected_inputs)); result += code_for_lex_actions(parse_state.default_actions, expected_inputs); return result; } diff --git a/src/compiler/lex_table.cpp b/src/compiler/lex_table.cpp index 5a211c6c..c7e7c8e8 100644 --- a/src/compiler/lex_table.cpp +++ b/src/compiler/lex_table.cpp @@ -45,8 +45,8 @@ namespace tree_sitter { } // State - unordered_set LexState::expected_inputs() const { - unordered_set result; + unordered_set LexState::expected_inputs() const { + unordered_set result; for (auto pair : actions) result.insert(pair.first); return result; @@ -58,7 +58,7 @@ namespace tree_sitter { return states.size() - 1; } - void LexTable::add_action(size_t state_index, CharMatch match, LexAction action) { + void LexTable::add_action(size_t state_index, rules::Character match, LexAction action) { states[state_index].actions[match].insert(action); } diff --git a/src/compiler/lex_table.h b/src/compiler/lex_table.h index 55fe48a2..7350d23d 100644 --- a/src/compiler/lex_table.h +++ b/src/compiler/lex_table.h @@ -5,8 +5,8 @@ #include #include #include -#include "char_match.h" #include "symbol.h" +#include "character.h" namespace tree_sitter { typedef enum { @@ -45,15 +45,15 @@ namespace std { namespace tree_sitter { class LexState { public: - std::unordered_map> actions; + std::unordered_map> actions; std::unordered_set default_actions; - std::unordered_set expected_inputs() const; + std::unordered_set expected_inputs() const; }; class LexTable { public: size_t add_state(); - void add_action(size_t state_index, CharMatch match, LexAction action); + void add_action(size_t state_index, rules::Character rule, LexAction action); void add_default_action(size_t state_index, LexAction action); std::vector states; diff --git a/src/compiler/rules/character.cpp b/src/compiler/rules/character.cpp index f9435ea0..728623d8 100644 --- a/src/compiler/rules/character.cpp +++ b/src/compiler/rules/character.cpp @@ -5,17 +5,64 @@ using std::hash; namespace tree_sitter { namespace rules { - Character::Character(char value) : value(CharMatchSpecific(value)) {}; - Character::Character(CharClass value) : value(CharMatchClass(value)) {}; - Character::Character(char min, char max) : value(CharMatchRange(min, max)) {}; + CharacterMatch::CharacterMatch(char character) : type(CharacterMatchTypeSpecific) { value.character = character; } + CharacterMatch::CharacterMatch(CharacterClass klass) : type(CharacterMatchTypeClass) { value.character_class = klass; } + CharacterMatch::CharacterMatch(std::pair bounds) : type(CharacterMatchTypeRange) { + value.range.min_character = bounds.first; + value.range.max_character = bounds.second; + } + + Character::Character(char character) : matches({ CharacterMatch(character) }), sign(true) {} + Character::Character(CharacterClass char_class) : matches({ CharacterMatch(char_class) }), sign(true) {} + Character::Character(const std::vector &matches, bool sign) : matches(matches), sign(sign) {} + + bool CharacterMatch::operator==(const CharacterMatch &right) const { + if (type != right.type) + return false; + switch (type) { + case CharacterMatchTypeClass: + return (value.character_class == right.value.character_class); + case CharacterMatchTypeSpecific: + return (value.character == right.value.character); + case CharacterMatchTypeRange: + return (value.range.min_character == right.value.range.min_character && + value.range.max_character == right.value.range.max_character); + } + } + + string CharacterMatch::to_string() const { + switch (type) { + case CharacterMatchTypeClass: + switch (value.character_class) { + case CharClassDigit: + return ""; + case CharClassWord: + return ""; + } + case CharacterMatchTypeSpecific: + return (value.character == '\0') ? + "" : + string("'") + value.character + "'"; + case CharacterMatchTypeRange: + return (string("'") + + value.range.min_character + "'-'" + + value.range.max_character + "'"); + } + } + bool Character::operator==(const Rule &rule) const { const Character *other = dynamic_cast(&rule); - return other && (other->value == value); + if (!other) return false; + auto size = matches.size(); + if (other->matches.size() != size) return false; + for (int i = 0; i < size; i++) + if (!(matches[i] == other->matches[i])) return false; + return true; } - + size_t Character::hash_code() const { - return typeid(this).hash_code() ^ hash()(CharMatchToString(value)); + return typeid(this).hash_code() ^ hash()(to_string()); } rule_ptr Character::copy() const { @@ -23,7 +70,10 @@ namespace tree_sitter { } string Character::to_string() const { - return string("#"; + string prefix("#"; } void Character::accept(Visitor &visitor) const { diff --git a/src/compiler/rules/character.h b/src/compiler/rules/character.h index 9df5583c..d2c5a54d 100644 --- a/src/compiler/rules/character.h +++ b/src/compiler/rules/character.h @@ -2,15 +2,46 @@ #define __tree_sitter__char__ #include "rule.h" -#include "char_match.h" +#include +#include namespace tree_sitter { namespace rules { + typedef enum { + CharClassWord, + CharClassDigit + } CharacterClass; + + typedef enum { + CharacterMatchTypeSpecific, + CharacterMatchTypeClass, + CharacterMatchTypeRange, + } CharacterMatchType; + + struct CharacterMatch { + CharacterMatchType type; + union { + CharacterClass character_class; + char character; + struct { + char min_character; + char max_character; + } range; + } value; + + CharacterMatch(char); + CharacterMatch(std::pair); + CharacterMatch(CharacterClass); + bool operator==(const CharacterMatch &) const; + std::string to_string() const; + }; + class Character : public Rule { public: Character(char character); - Character(CharClass character_class); + Character(CharacterClass character_class); Character(char min_character, char max_character); + Character(const std::vector &matches, bool sign); bool operator==(const Rule& other) const; size_t hash_code() const; @@ -18,9 +49,35 @@ namespace tree_sitter { std::string to_string() const; void accept(Visitor &visitor) const; - const CharMatch value; + std::vector matches; + bool sign; }; } } +namespace std { + template<> + struct hash { + size_t operator()(const tree_sitter::rules::CharacterMatch &match) const { + auto type = match.type; + auto result = hash()(type); + switch (type) { + case tree_sitter::rules::CharacterMatchTypeClass: + result ^= hash()(match.value.character_class); + case tree_sitter::rules::CharacterMatchTypeRange: + result ^= hash()(match.value.range.min_character); + result ^= hash()(match.value.range.max_character); + case tree_sitter::rules::CharacterMatchTypeSpecific: + result ^= hash()(match.value.character); + } + return result; + } + }; +} + +namespace std { + template<> + struct hash : hash {}; +} + #endif diff --git a/src/compiler/rules/rules.cpp b/src/compiler/rules/rules.cpp index de94e54d..2e74670e 100644 --- a/src/compiler/rules/rules.cpp +++ b/src/compiler/rules/rules.cpp @@ -14,9 +14,13 @@ namespace tree_sitter { return make_shared(value); } - rule_ptr character(CharClass value) { + rule_ptr character(CharacterClass value) { return make_shared(value); } + + rule_ptr character(const std::vector &matches, bool is_affirmative) { + return make_shared(matches, is_affirmative); + } rule_ptr choice(const initializer_list &rules) { rule_ptr result; diff --git a/src/compiler/rules/rules.h b/src/compiler/rules/rules.h index 1d8a86bb..2b7cace2 100644 --- a/src/compiler/rules/rules.h +++ b/src/compiler/rules/rules.h @@ -16,8 +16,10 @@ namespace tree_sitter { namespace rules { rule_ptr blank(); rule_ptr character(char value); - rule_ptr character(char min, char max); - rule_ptr character(CharClass value); + rule_ptr character(CharacterClass value); + rule_ptr character(const std::vector &matches); + rule_ptr character(const std::vector &matches, bool); + rule_ptr choice(const std::initializer_list &rules); rule_ptr pattern(const std::string &value); rule_ptr repeat(const rule_ptr content); diff --git a/src/compiler/rules/symbol.cpp b/src/compiler/rules/symbol.cpp index 5ec8811d..7ffd14be 100644 --- a/src/compiler/rules/symbol.cpp +++ b/src/compiler/rules/symbol.cpp @@ -1,4 +1,5 @@ #include "rules.h" +#include using std::string; using std::hash; diff --git a/todo.md b/todo.md index 03267e0d..5f731c3c 100644 --- a/todo.md +++ b/todo.md @@ -14,4 +14,8 @@ TODO ## node.js wrapper - add simple selector engine for trees -## incremental parsing \ No newline at end of file +## incremental parsing + +## chores +- figure out why Symbol and Character can't have const member variables + (unordered_map seems to require mutability of key objects) diff --git a/tree_sitter.xcodeproj/project.pbxproj b/tree_sitter.xcodeproj/project.pbxproj index 4f728673..a9a4eb15 100644 --- a/tree_sitter.xcodeproj/project.pbxproj +++ b/tree_sitter.xcodeproj/project.pbxproj @@ -53,7 +53,6 @@ 12FD40DF1860064C0041A84E /* tree.c in Sources */ = {isa = PBXBuildFile; fileRef = 12FD40DE1860064C0041A84E /* tree.c */; }; 12FD40E718639B910041A84E /* visitor.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12FD40E618639B910041A84E /* visitor.cpp */; }; 12FD40E918641FB70041A84E /* rules.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12FD40E818641FB70041A84E /* rules.cpp */; }; - 12FD40F3186641C00041A84E /* char_match.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12FD40F1186641C00041A84E /* char_match.cpp */; }; 12FD40F7186A16020041A84E /* lex_table.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12FD40F5186A16020041A84E /* lex_table.cpp */; }; 27A343CA69E17E0F9EBEDF1C /* pattern.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 27A340F3EEB184C040521323 /* pattern.cpp */; }; /* End PBXBuildFile section */ @@ -162,8 +161,6 @@ 12FD40E41862B3530041A84E /* visitor.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = visitor.h; sourceTree = ""; }; 12FD40E618639B910041A84E /* visitor.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = visitor.cpp; sourceTree = ""; }; 12FD40E818641FB70041A84E /* rules.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = rules.cpp; sourceTree = ""; }; - 12FD40F1186641C00041A84E /* char_match.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = char_match.cpp; sourceTree = ""; }; - 12FD40F2186641C00041A84E /* char_match.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = char_match.h; sourceTree = ""; }; 12FD40F5186A16020041A84E /* lex_table.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = lex_table.cpp; sourceTree = ""; }; 27A340F3EEB184C040521323 /* pattern.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = pattern.cpp; sourceTree = ""; }; 27A3438C4FA59A3882E8493B /* pattern.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = pattern.h; sourceTree = ""; }; @@ -358,8 +355,6 @@ isa = PBXGroup; children = ( 12130618182C84B700FCF928 /* build_tables */, - 12FD40F1186641C00041A84E /* char_match.cpp */, - 12FD40F2186641C00041A84E /* char_match.h */, 12EDCFAC18820181005A7A07 /* compile.cpp */, 12EDCFAD18820181005A7A07 /* compile.h */, 12FD4067185E8AF40041A84E /* generate_code */, @@ -523,7 +518,6 @@ 12D136A4183678A2005F3369 /* repeat.cpp in Sources */, 1225CC6418765693000D4723 /* prepare_grammar_spec.cpp in Sources */, 12EDCF9A1881FCD9005A7A07 /* search_for_symbols.cpp in Sources */, - 12FD40F3186641C00041A84E /* char_match.cpp in Sources */, 12EDCFB21882039A005A7A07 /* perform.cpp in Sources */, 12FD40E718639B910041A84E /* visitor.cpp in Sources */, 12EDCF991881FCD9005A7A07 /* perform.cpp in Sources */,