diff --git a/examples/grammars/json.cc b/examples/grammars/json.cc index 4e2992db..8c6f9284 100644 --- a/examples/grammars/json.cc +++ b/examples/grammars/json.cc @@ -19,7 +19,7 @@ namespace tree_sitter_examples { str(":"), sym("value") })))) }, { "array", in_brackets(comma_sep(err(sym("value")))) }, - { "string", pattern("\"([^\"]|\\\\\")+\"") }, + { "string", pattern("\"([^\"]|\\\\\")*\"") }, { "number", pattern("\\d+(\\.\\d+)?") }, { "null", keyword("null") }, { "true", keyword("true") }, diff --git a/examples/parsers/arithmetic.c b/examples/parsers/arithmetic.c index d73d4478..6008f2b4 100644 --- a/examples/parsers/arithmetic.c +++ b/examples/parsers/arithmetic.c @@ -34,13 +34,13 @@ SYMBOL_NAMES = { [ts_builtin_sym_end] = "end", [ts_sym_number] = "number", [ts_sym_variable] = "variable", - [ts_aux_sym_token0] = "'+'", - [ts_aux_sym_token1] = "'-'", - [ts_aux_sym_token2] = "'*'", - [ts_aux_sym_token3] = "'/'", - [ts_aux_sym_token4] = "'^'", - [ts_aux_sym_token5] = "'('", - [ts_aux_sym_token6] = "')'", + [ts_aux_sym_token0] = "", + [ts_aux_sym_token1] = "", + [ts_aux_sym_token2] = "", + [ts_aux_sym_token3] = "", + [ts_aux_sym_token4] = "", + [ts_aux_sym_token5] = "", + [ts_aux_sym_token6] = "", }; UBIQUITOUS_SYMBOLS = { diff --git a/examples/parsers/golang.c b/examples/parsers/golang.c index 2c2137bf..aa2503f2 100644 --- a/examples/parsers/golang.c +++ b/examples/parsers/golang.c @@ -109,34 +109,34 @@ SYMBOL_NAMES = { [ts_aux_sym__func_signature_repeat2] = "_func_signature_repeat2", [ts_aux_sym__func_signature_repeat3] = "_func_signature_repeat3", [ts_aux_sym__func_signature_repeat4] = "_func_signature_repeat4", - [ts_aux_sym_token0] = "'package'", - [ts_aux_sym_token1] = "'import'", - [ts_aux_sym_token2] = "'('", - [ts_aux_sym_token3] = "')'", - [ts_aux_sym_token4] = "'type'", - [ts_aux_sym_token5] = "'var'", - [ts_aux_sym_token6] = "'='", - [ts_aux_sym_token7] = "'func'", - [ts_aux_sym_token8] = "'{'", - [ts_aux_sym_token9] = "'}'", - [ts_aux_sym_token10] = "'*'", - [ts_aux_sym_token11] = "'map'", - [ts_aux_sym_token12] = "'['", - [ts_aux_sym_token13] = "']'", - [ts_aux_sym_token14] = "'struct'", - [ts_aux_sym_token15] = "'interface'", - [ts_aux_sym_token16] = "'/'", - [ts_aux_sym_token17] = "'+'", - [ts_aux_sym_token18] = "'-'", - [ts_aux_sym_token19] = "'||'", - [ts_aux_sym_token20] = "'&&'", - [ts_aux_sym_token21] = "'=='", - [ts_aux_sym_token22] = "'<='", - [ts_aux_sym_token23] = "'<'", - [ts_aux_sym_token24] = "'>='", - [ts_aux_sym_token25] = "'>'", - [ts_aux_sym_token26] = "'!'", - [ts_aux_sym_token27] = "','", + [ts_aux_sym_token0] = "", + [ts_aux_sym_token1] = "", + [ts_aux_sym_token2] = "", + [ts_aux_sym_token3] = "", + [ts_aux_sym_token4] = "", + [ts_aux_sym_token5] = "", + [ts_aux_sym_token6] = "", + [ts_aux_sym_token7] = "", + [ts_aux_sym_token8] = "", + [ts_aux_sym_token9] = "", + [ts_aux_sym_token10] = "", + [ts_aux_sym_token11] = "", + [ts_aux_sym_token12] = "", + [ts_aux_sym_token13] = "", + [ts_aux_sym_token14] = "", + [ts_aux_sym_token15] = "", + [ts_aux_sym_token16] = "", + [ts_aux_sym_token17] = "", + [ts_aux_sym_token18] = "", + [ts_aux_sym_token19] = "", + [ts_aux_sym_token20] = "", + [ts_aux_sym_token21] = "", + [ts_aux_sym_token22] = "", + [ts_aux_sym_token23] = "", + [ts_aux_sym_token24] = "", + [ts_aux_sym_token25] = "", + [ts_aux_sym_token26] = "", + [ts_aux_sym_token27] = "", }; UBIQUITOUS_SYMBOLS = { diff --git a/examples/parsers/javascript.c b/examples/parsers/javascript.c index b6199822..dd1ef335 100644 --- a/examples/parsers/javascript.c +++ b/examples/parsers/javascript.c @@ -143,52 +143,52 @@ SYMBOL_NAMES = { [ts_aux_sym_formal_parameters_repeat0] = "formal_parameters_repeat0", [ts_aux_sym_object_repeat0] = "object_repeat0", [ts_aux_sym_array_repeat0] = "array_repeat0", - [ts_aux_sym_token0] = "'{'", - [ts_aux_sym_token1] = "'}'", - [ts_aux_sym_token2] = "'for'", - [ts_aux_sym_token3] = "'('", - [ts_aux_sym_token4] = "')'", - [ts_aux_sym_token5] = "'if'", - [ts_aux_sym_token6] = "'else'", - [ts_aux_sym_token7] = "'while'", - [ts_aux_sym_token8] = "'try'", - [ts_aux_sym_token9] = "'catch'", - [ts_aux_sym_token10] = "'switch'", - [ts_aux_sym_token11] = "'case'", - [ts_aux_sym_token12] = "'default'", - [ts_aux_sym_token13] = "':'", - [ts_aux_sym_token14] = "'break'", - [ts_aux_sym_token15] = "'var'", - [ts_aux_sym_token16] = "','", - [ts_aux_sym_token17] = "'return'", - [ts_aux_sym_token18] = "'delete'", - [ts_aux_sym_token19] = "'++'", - [ts_aux_sym_token20] = "'--'", - [ts_aux_sym_token21] = "'+'", - [ts_aux_sym_token22] = "'-'", - [ts_aux_sym_token23] = "'*'", - [ts_aux_sym_token24] = "'/'", - [ts_aux_sym_token25] = "'&'", - [ts_aux_sym_token26] = "'|'", - [ts_aux_sym_token27] = "'^'", - [ts_aux_sym_token28] = "'||'", - [ts_aux_sym_token29] = "'&&'", - [ts_aux_sym_token30] = "'==='", - [ts_aux_sym_token31] = "'=='", - [ts_aux_sym_token32] = "'!=='", - [ts_aux_sym_token33] = "'!='", - [ts_aux_sym_token34] = "'<='", - [ts_aux_sym_token35] = "'<'", - [ts_aux_sym_token36] = "'>='", - [ts_aux_sym_token37] = "'>'", - [ts_aux_sym_token38] = "'!'", - [ts_aux_sym_token39] = "'?'", - [ts_aux_sym_token40] = "'='", - [ts_aux_sym_token41] = "'function'", - [ts_aux_sym_token42] = "'new'", - [ts_aux_sym_token43] = "'.'", - [ts_aux_sym_token44] = "'['", - [ts_aux_sym_token45] = "']'", + [ts_aux_sym_token0] = "", + [ts_aux_sym_token1] = "", + [ts_aux_sym_token2] = "", + [ts_aux_sym_token3] = "", + [ts_aux_sym_token4] = "", + [ts_aux_sym_token5] = "", + [ts_aux_sym_token6] = "", + [ts_aux_sym_token7] = "", + [ts_aux_sym_token8] = "", + [ts_aux_sym_token9] = "", + [ts_aux_sym_token10] = "", + [ts_aux_sym_token11] = "", + [ts_aux_sym_token12] = "", + [ts_aux_sym_token13] = "", + [ts_aux_sym_token14] = "", + [ts_aux_sym_token15] = "", + [ts_aux_sym_token16] = "", + [ts_aux_sym_token17] = "", + [ts_aux_sym_token18] = "", + [ts_aux_sym_token19] = "", + [ts_aux_sym_token20] = "", + [ts_aux_sym_token21] = "", + [ts_aux_sym_token22] = "", + [ts_aux_sym_token23] = "", + [ts_aux_sym_token24] = "", + [ts_aux_sym_token25] = "", + [ts_aux_sym_token26] = "", + [ts_aux_sym_token27] = "", + [ts_aux_sym_token28] = "", + [ts_aux_sym_token29] = "", + [ts_aux_sym_token30] = "", + [ts_aux_sym_token31] = "", + [ts_aux_sym_token32] = "", + [ts_aux_sym_token33] = "", + [ts_aux_sym_token34] = "", + [ts_aux_sym_token35] = "", + [ts_aux_sym_token36] = "", + [ts_aux_sym_token37] = "", + [ts_aux_sym_token38] = "", + [ts_aux_sym_token39] = "", + [ts_aux_sym_token40] = "", + [ts_aux_sym_token41] = "", + [ts_aux_sym_token42] = "", + [ts_aux_sym_token43] = "", + [ts_aux_sym_token44] = "", + [ts_aux_sym_token45] = "", }; UBIQUITOUS_SYMBOLS = { diff --git a/examples/parsers/json.c b/examples/parsers/json.c index 0060609d..6f7fb83a 100644 --- a/examples/parsers/json.c +++ b/examples/parsers/json.c @@ -35,12 +35,12 @@ SYMBOL_NAMES = { [ts_sym_false] = "false", [ts_aux_sym_object_repeat0] = "object_repeat0", [ts_aux_sym_array_repeat0] = "array_repeat0", - [ts_aux_sym_token0] = "'{'", - [ts_aux_sym_token1] = "':'", - [ts_aux_sym_token2] = "','", - [ts_aux_sym_token3] = "'}'", - [ts_aux_sym_token4] = "'['", - [ts_aux_sym_token5] = "']'", + [ts_aux_sym_token0] = "", + [ts_aux_sym_token1] = "", + [ts_aux_sym_token2] = "", + [ts_aux_sym_token3] = "", + [ts_aux_sym_token4] = "", + [ts_aux_sym_token5] = "", }; UBIQUITOUS_SYMBOLS = { @@ -69,90 +69,87 @@ LEX_FN() { if (lookahead == '\"') ADVANCE(2); if ('0' <= lookahead && lookahead <= '9') - ADVANCE(7); + ADVANCE(6); if (lookahead == '[') - ADVANCE(10); + ADVANCE(9); if (lookahead == 'f') - ADVANCE(11); + ADVANCE(10); if (lookahead == 'n') - ADVANCE(16); + ADVANCE(15); if (lookahead == 't') - ADVANCE(20); + ADVANCE(19); if (lookahead == '{') - ADVANCE(24); + ADVANCE(23); LEX_ERROR(); case 2: if (!((lookahead == '\"') || (lookahead == '\\'))) + ADVANCE(2); + if (lookahead == '\"') ADVANCE(3); if (lookahead == '\\') - ADVANCE(5); + ADVANCE(4); LEX_ERROR(); case 3: + ACCEPT_TOKEN(ts_sym_string); + case 4: if (!((lookahead == '\"') || (lookahead == '\\'))) - ADVANCE(3); + ADVANCE(2); if (lookahead == '\"') - ADVANCE(4); - if (lookahead == '\\') ADVANCE(5); + if (lookahead == '\\') + ADVANCE(4); LEX_ERROR(); - case 4: - ACCEPT_TOKEN(ts_sym_string); case 5: if (!((lookahead == '\"') || (lookahead == '\\'))) - ADVANCE(3); + ADVANCE(2); if (lookahead == '\"') - ADVANCE(6); + ADVANCE(3); if (lookahead == '\\') - ADVANCE(5); - LEX_ERROR(); - case 6: - if (!((lookahead == '\"') || - (lookahead == '\\'))) - ADVANCE(3); - if (lookahead == '\"') ADVANCE(4); - if (lookahead == '\\') - ADVANCE(5); ACCEPT_TOKEN(ts_sym_string); - case 7: + case 6: if (lookahead == '.') - ADVANCE(8); - if ('0' <= lookahead && lookahead <= '9') ADVANCE(7); + if ('0' <= lookahead && lookahead <= '9') + ADVANCE(6); ACCEPT_TOKEN(ts_sym_number); + case 7: + if ('0' <= lookahead && lookahead <= '9') + ADVANCE(8); + LEX_ERROR(); case 8: if ('0' <= lookahead && lookahead <= '9') - ADVANCE(9); - LEX_ERROR(); - case 9: - if ('0' <= lookahead && lookahead <= '9') - ADVANCE(9); + ADVANCE(8); ACCEPT_TOKEN(ts_sym_number); - case 10: + case 9: ACCEPT_TOKEN(ts_aux_sym_token4); - case 11: + case 10: if (lookahead == 'a') + ADVANCE(11); + LEX_ERROR(); + case 11: + if (lookahead == 'l') ADVANCE(12); LEX_ERROR(); case 12: - if (lookahead == 'l') + if (lookahead == 's') ADVANCE(13); LEX_ERROR(); case 13: - if (lookahead == 's') + if (lookahead == 'e') ADVANCE(14); LEX_ERROR(); case 14: - if (lookahead == 'e') - ADVANCE(15); - LEX_ERROR(); - case 15: ACCEPT_TOKEN(ts_sym_false); - case 16: + case 15: if (lookahead == 'u') + ADVANCE(16); + LEX_ERROR(); + case 16: + if (lookahead == 'l') ADVANCE(17); LEX_ERROR(); case 17: @@ -160,65 +157,71 @@ LEX_FN() { ADVANCE(18); LEX_ERROR(); case 18: - if (lookahead == 'l') - ADVANCE(19); - LEX_ERROR(); - case 19: ACCEPT_TOKEN(ts_sym_null); - case 20: + case 19: if (lookahead == 'r') + ADVANCE(20); + LEX_ERROR(); + case 20: + if (lookahead == 'u') ADVANCE(21); LEX_ERROR(); case 21: - if (lookahead == 'u') + if (lookahead == 'e') ADVANCE(22); LEX_ERROR(); case 22: - if (lookahead == 'e') - ADVANCE(23); - LEX_ERROR(); - case 23: ACCEPT_TOKEN(ts_sym_true); - case 24: + case 23: ACCEPT_TOKEN(ts_aux_sym_token0); - case 25: + case 24: START_TOKEN(); if (lookahead == '\0') - ADVANCE(26); + ADVANCE(25); if ((lookahead == '\t') || (lookahead == '\n') || (lookahead == '\r') || (lookahead == ' ')) - ADVANCE(25); + ADVANCE(24); LEX_ERROR(); - case 26: + case 25: ACCEPT_TOKEN(ts_builtin_sym_end); - case 27: + case 26: START_TOKEN(); if (('\t' <= lookahead && lookahead <= '\n') || (lookahead == '\r') || (lookahead == ' ')) - ADVANCE(27); + ADVANCE(26); if (lookahead == '\"') ADVANCE(2); if (lookahead == '}') - ADVANCE(28); + ADVANCE(27); LEX_ERROR(); - case 28: + case 27: ACCEPT_TOKEN(ts_aux_sym_token3); - case 29: + case 28: START_TOKEN(); if (('\t' <= lookahead && lookahead <= '\n') || (lookahead == '\r') || (lookahead == ' ')) - ADVANCE(29); + ADVANCE(28); if (lookahead == ',') + ADVANCE(29); + if (lookahead == '}') + ADVANCE(27); + LEX_ERROR(); + case 29: + ACCEPT_TOKEN(ts_aux_sym_token2); + case 30: + START_TOKEN(); + if ((lookahead == '\t') || + (lookahead == '\n') || + (lookahead == '\r') || + (lookahead == ' ')) ADVANCE(30); if (lookahead == '}') - ADVANCE(28); + ADVANCE(27); LEX_ERROR(); - case 30: - ACCEPT_TOKEN(ts_aux_sym_token2); case 31: START_TOKEN(); if ((lookahead == '\t') || @@ -226,8 +229,8 @@ LEX_FN() { (lookahead == '\r') || (lookahead == ' ')) ADVANCE(31); - if (lookahead == '}') - ADVANCE(28); + if (lookahead == '\"') + ADVANCE(2); LEX_ERROR(); case 32: START_TOKEN(); @@ -236,128 +239,118 @@ LEX_FN() { (lookahead == '\r') || (lookahead == ' ')) ADVANCE(32); - if (lookahead == '\"') - ADVANCE(2); + if (lookahead == ':') + ADVANCE(33); LEX_ERROR(); case 33: - START_TOKEN(); - if ((lookahead == '\t') || - (lookahead == '\n') || - (lookahead == '\r') || - (lookahead == ' ')) - ADVANCE(33); - if (lookahead == ':') - ADVANCE(34); - LEX_ERROR(); - case 34: ACCEPT_TOKEN(ts_aux_sym_token1); - case 35: + case 34: START_TOKEN(); if (('\t' <= lookahead && lookahead <= '\n') || (lookahead == '\r') || (lookahead == ' ')) - ADVANCE(35); + ADVANCE(34); if (lookahead == '\"') ADVANCE(2); if ('0' <= lookahead && lookahead <= '9') - ADVANCE(7); + ADVANCE(6); if (lookahead == '[') - ADVANCE(10); + ADVANCE(9); if (lookahead == ']') - ADVANCE(36); + ADVANCE(35); if (lookahead == 'f') - ADVANCE(11); + ADVANCE(10); if (lookahead == 'n') - ADVANCE(16); + ADVANCE(15); if (lookahead == 't') - ADVANCE(20); + ADVANCE(19); if (lookahead == '{') - ADVANCE(24); + ADVANCE(23); LEX_ERROR(); - case 36: + case 35: ACCEPT_TOKEN(ts_aux_sym_token5); - case 37: + case 36: START_TOKEN(); if (('\t' <= lookahead && lookahead <= '\n') || + (lookahead == '\r') || + (lookahead == ' ')) + ADVANCE(36); + if (lookahead == ',') + ADVANCE(29); + if (lookahead == ']') + ADVANCE(35); + LEX_ERROR(); + case 37: + START_TOKEN(); + if ((lookahead == '\t') || + (lookahead == '\n') || (lookahead == '\r') || (lookahead == ' ')) ADVANCE(37); - if (lookahead == ',') - ADVANCE(30); if (lookahead == ']') - ADVANCE(36); + ADVANCE(35); LEX_ERROR(); case 38: - START_TOKEN(); - if ((lookahead == '\t') || - (lookahead == '\n') || - (lookahead == '\r') || - (lookahead == ' ')) - ADVANCE(38); - if (lookahead == ']') - ADVANCE(36); - LEX_ERROR(); - case 39: START_TOKEN(); if (lookahead == '\0') - ADVANCE(26); + ADVANCE(25); if (('\t' <= lookahead && lookahead <= '\n') || (lookahead == '\r') || (lookahead == ' ')) - ADVANCE(39); + ADVANCE(38); if (lookahead == '\"') ADVANCE(2); if (lookahead == ',') - ADVANCE(30); + ADVANCE(29); if ('0' <= lookahead && lookahead <= '9') - ADVANCE(7); + ADVANCE(6); if (lookahead == ':') - ADVANCE(34); + ADVANCE(33); if (lookahead == '[') - ADVANCE(10); + ADVANCE(9); if (lookahead == ']') - ADVANCE(36); + ADVANCE(35); if (lookahead == 'f') - ADVANCE(11); + ADVANCE(10); if (lookahead == 'n') - ADVANCE(16); + ADVANCE(15); if (lookahead == 't') - ADVANCE(20); + ADVANCE(19); if (lookahead == '{') - ADVANCE(24); + ADVANCE(23); if (lookahead == '}') - ADVANCE(28); + ADVANCE(27); LEX_ERROR(); case ts_lex_state_error: START_TOKEN(); if (lookahead == '\0') - ADVANCE(26); + ADVANCE(25); if (('\t' <= lookahead && lookahead <= '\n') || (lookahead == '\r') || (lookahead == ' ')) - ADVANCE(39); + ADVANCE(38); if (lookahead == '\"') ADVANCE(2); if (lookahead == ',') - ADVANCE(30); + ADVANCE(29); if ('0' <= lookahead && lookahead <= '9') - ADVANCE(7); + ADVANCE(6); if (lookahead == ':') - ADVANCE(34); + ADVANCE(33); if (lookahead == '[') - ADVANCE(10); + ADVANCE(9); if (lookahead == ']') - ADVANCE(36); + ADVANCE(35); if (lookahead == 'f') - ADVANCE(11); + ADVANCE(10); if (lookahead == 'n') - ADVANCE(16); + ADVANCE(15); if (lookahead == 't') - ADVANCE(20); + ADVANCE(19); if (lookahead == '{') - ADVANCE(24); + ADVANCE(23); if (lookahead == '}') - ADVANCE(28); + ADVANCE(27); LEX_ERROR(); default: LEX_PANIC(); @@ -366,65 +359,65 @@ LEX_FN() { LEX_STATES = { [0] = 1, - [1] = 25, - [2] = 25, - [3] = 27, - [4] = 29, - [5] = 31, - [6] = 25, - [7] = 32, - [8] = 29, - [9] = 31, - [10] = 33, + [1] = 24, + [2] = 24, + [3] = 26, + [4] = 28, + [5] = 30, + [6] = 24, + [7] = 31, + [8] = 28, + [9] = 30, + [10] = 32, [11] = 1, - [12] = 29, - [13] = 31, - [14] = 29, - [15] = 27, - [16] = 29, - [17] = 31, - [18] = 29, - [19] = 33, + [12] = 28, + [13] = 30, + [14] = 28, + [15] = 26, + [16] = 28, + [17] = 30, + [18] = 28, + [19] = 32, [20] = 1, - [21] = 29, - [22] = 31, - [23] = 29, - [24] = 35, - [25] = 37, - [26] = 38, - [27] = 29, + [21] = 28, + [22] = 30, + [23] = 28, + [24] = 34, + [25] = 36, + [26] = 37, + [27] = 28, [28] = 1, - [29] = 37, - [30] = 38, - [31] = 37, - [32] = 27, - [33] = 29, - [34] = 31, - [35] = 37, - [36] = 33, + [29] = 36, + [30] = 37, + [31] = 36, + [32] = 26, + [33] = 28, + [34] = 30, + [35] = 36, + [36] = 32, [37] = 1, - [38] = 29, - [39] = 31, - [40] = 37, - [41] = 37, - [42] = 35, - [43] = 37, - [44] = 38, - [45] = 37, - [46] = 37, - [47] = 29, - [48] = 29, - [49] = 33, + [38] = 28, + [39] = 30, + [40] = 36, + [41] = 36, + [42] = 34, + [43] = 36, + [44] = 37, + [45] = 36, + [46] = 36, + [47] = 28, + [48] = 28, + [49] = 32, [50] = 1, - [51] = 29, - [52] = 31, - [53] = 25, - [54] = 25, - [55] = 35, - [56] = 37, - [57] = 38, - [58] = 25, - [59] = 25, + [51] = 28, + [52] = 30, + [53] = 24, + [54] = 24, + [55] = 34, + [56] = 36, + [57] = 37, + [58] = 24, + [59] = 24, }; #pragma GCC diagnostic push diff --git a/include/tree_sitter/compiler.h b/include/tree_sitter/compiler.h index daf8c397..eeb73660 100644 --- a/include/tree_sitter/compiler.h +++ b/include/tree_sitter/compiler.h @@ -55,6 +55,7 @@ namespace tree_sitter { class GrammarError { public: GrammarError(GrammarErrorType type, std::string message); + bool operator==(const GrammarError &other) const; GrammarErrorType type; std::string message; }; diff --git a/spec/compiler/build_tables/item_set_transitions_spec.cc b/spec/compiler/build_tables/item_set_transitions_spec.cc index 0b494280..84c6c1bd 100644 --- a/spec/compiler/build_tables/item_set_transitions_spec.cc +++ b/spec/compiler/build_tables/item_set_transitions_spec.cc @@ -13,8 +13,8 @@ describe("lexical item set transitions", []() { describe("when two items in the set have transitions on the same character", [&]() { it("merges the transitions by computing the union of the two item sets", [&]() { LexItemSet set1({ - LexItem(Symbol(1), pattern("[a-f]")), - LexItem(Symbol(2), pattern("[e-x]")) }); + LexItem(Symbol(1), character({ {'a', 'f'} })), + LexItem(Symbol(2), character({ {'e', 'x'} })) }); AssertThat(char_transitions(set1, grammar), Equals(map({ { CharacterSet({ {'a', 'd'} }), LexItemSet({ diff --git a/spec/compiler/build_tables/rule_transitions_spec.cc b/spec/compiler/build_tables/rule_transitions_spec.cc index 2bd6fba8..eab16713 100644 --- a/spec/compiler/build_tables/rule_transitions_spec.cc +++ b/spec/compiler/build_tables/rule_transitions_spec.cc @@ -97,23 +97,6 @@ describe("rule transitions", []() { }))); }); - it("handles strings", [&]() { - AssertThat( - char_transitions(str("bad")), - Equals(rule_map({ - { CharacterSet({ 'b' }), seq({ character({ 'a' }), character({ 'd' }) }) } - }))); - }); - - it("handles patterns", [&]() { - AssertThat( - char_transitions(pattern("a|b")), - Equals(rule_map({ - { CharacterSet({ 'a' }), blank() }, - { CharacterSet({ 'b' }), blank() } - }))); - }); - it("handles choices between overlapping character sets", [&]() { AssertThat( char_transitions(choice({ @@ -164,7 +147,7 @@ describe("rule transitions", []() { }); it("handles repeats", [&]() { - rule_ptr rule = repeat(str("ab")); + rule_ptr rule = repeat(seq({ character({ 'a' }), character({ 'b' }) })); AssertThat( char_transitions(rule), Equals(rule_map({ @@ -176,7 +159,7 @@ describe("rule transitions", []() { }) }}))); - rule = repeat(str("a")); + rule = repeat(character({ 'a' })); AssertThat( char_transitions(rule), Equals(rule_map({ diff --git a/spec/compiler/helpers/rule_helpers.cc b/spec/compiler/helpers/rule_helpers.cc index 2655c9bb..60404862 100644 --- a/spec/compiler/helpers/rule_helpers.cc +++ b/spec/compiler/helpers/rule_helpers.cc @@ -5,6 +5,7 @@ namespace tree_sitter { using std::make_shared; using std::set; + using std::map; namespace rules { rule_ptr character(const set &ranges) { @@ -33,5 +34,9 @@ namespace tree_sitter { rule_ptr i_aux_token(size_t index) { return make_shared(index, SymbolOption(SymbolOptionAuxiliary|SymbolOptionToken)); } + + rule_ptr metadata(rule_ptr rule, map values) { + return make_shared(rule, values); + } } } diff --git a/spec/compiler/helpers/rule_helpers.h b/spec/compiler/helpers/rule_helpers.h index 3b18229a..66bbbf58 100644 --- a/spec/compiler/helpers/rule_helpers.h +++ b/spec/compiler/helpers/rule_helpers.h @@ -3,9 +3,11 @@ #include "tree_sitter/compiler.h" #include "compiler/rules/character_set.h" +#include "compiler/rules/metadata.h" namespace tree_sitter { namespace rules { + rule_ptr metadata(rule_ptr, std::map); rule_ptr character(const std::set &ranges); rule_ptr character(const std::set &ranges, bool sign); rule_ptr i_sym(size_t index); diff --git a/spec/compiler/prepare_grammar/expand_tokens_spec.cc b/spec/compiler/prepare_grammar/expand_tokens_spec.cc new file mode 100644 index 00000000..f6010ce1 --- /dev/null +++ b/spec/compiler/prepare_grammar/expand_tokens_spec.cc @@ -0,0 +1,63 @@ +#include "compiler_spec_helper.h" +#include "compiler/prepared_grammar.h" +#include "compiler/prepare_grammar/expand_tokens.h" + +START_TEST + +using namespace rules; +using prepare_grammar::expand_tokens; + +describe("expanding token rules", []() { + it("replaces regex patterns with their expansion", [&]() { + PreparedGrammar grammar({ + { "rule_A", seq({ + i_sym(10), + pattern("x*"), + i_sym(11) }) }, + }, {}); + + auto result = expand_tokens(grammar); + + AssertThat(result.second, Equals((const GrammarError *)nullptr)); + AssertThat(result.first, Equals(PreparedGrammar({ + { "rule_A", seq({ + i_sym(10), + repeat(character({ 'x' })), + i_sym(11) }) }, + }, {}))); + }); + + it("replaces string rules with a sequence of characters", [&]() { + PreparedGrammar grammar({ + { "rule_A", seq({ + i_sym(10), + str("xyz"), + i_sym(11) }) }, + }, {}); + + auto result = expand_tokens(grammar); + + AssertThat(result.second, Equals((const GrammarError *)nullptr)); + AssertThat(result.first, Equals(PreparedGrammar({ + { "rule_A", seq({ + i_sym(10), + seq({ character({ 'x' }), character({ 'y' }), character({ 'z' }) }), + i_sym(11) }) }, + }, {}))); + }); + + it("returns an error when the grammar contains an invalid regex", [&]() { + PreparedGrammar grammar({ + { "rule_A", seq({ + pattern("("), + str("xyz"), + pattern("[") }) }, + }, {}); + + auto result = expand_tokens(grammar); + + AssertThat(result.second, EqualsPointer(new GrammarError(GrammarErrorTypeRegex, "unmatched open paren"))); + }); +}); + +END_TEST \ No newline at end of file diff --git a/spec/compiler/prepare_grammar/parse_regex_spec.cc b/spec/compiler/prepare_grammar/parse_regex_spec.cc new file mode 100644 index 00000000..1111e43b --- /dev/null +++ b/spec/compiler/prepare_grammar/parse_regex_spec.cc @@ -0,0 +1,217 @@ +#include "compiler_spec_helper.h" +#include "compiler/prepare_grammar/parse_regex.h" + +START_TEST + +using namespace rules; +using prepare_grammar::parse_regex; + +describe("parsing regex patterns", []() { + vector> valid_inputs = { + { + "character sets", + "[aAeE]", + character({ 'a', 'A', 'e', 'E' }) + }, + + { + "'.' characters as wildcards", + ".", + CharacterSet({'\n'}).complement().copy() + }, + + { + "character classes", + "\\w-\\d", + seq({ + character({ {'a', 'z'}, {'A', 'Z'}, {'0', '9'} }), + character({ '-' }), + character({ {'0', '9'} }) }) + }, + + { + "choices", + "ab|cd|ef", + choice({ + seq({ + character({ 'a' }), + character({ 'b' }), + }), + seq({ + character({ 'c' }), + character({ 'd' }) + }), + seq({ + character({ 'e' }), + character({ 'f' }) + }) + }) + }, + + { + "simple sequences", + "abc", + seq({ + character({ 'a' }), + character({ 'b' }), + character({ 'c' }) }) + }, + + { + "character ranges", + "[12a-dA-D3]", + character({ {'1', '3'}, {'a', 'd'}, { 'A', 'D' }, }) + }, + + { + "negated characters", + "[^a\\d]", + character({ {'a'}, {'0', '9'} }, false) + }, + + { + "backslashes", + "\\\\", + character({ '\\' }) + }, + + { + "character groups in sequences", + "x([^x]|\\\\x)*x", + seq({ + character({ 'x' }), + repeat(choice({ + character({ 'x' }, false), + seq({ character({ '\\' }), character({ 'x' }) }) + })), + character({ 'x' }) + }) + }, + + { + "choices in sequences", + "(a|b)cd", + seq({ + choice({ + character({ 'a' }), + character({ 'b' }), + }), + character({ 'c' }), + character({ 'd' }) + }) + }, + + { + "escaped parentheses", + "a\\(b", + seq({ + character({ 'a' }), + character({ '(' }), + character({ 'b' }) + }) + }, + + { + "escaped periods", + "a\\.", + seq({ + character({ 'a' }), + character({ '.' }) + }) + }, + + { + "plus repeats", + "(ab)+(cd)+", + seq({ + seq({ + seq({ character({ 'a' }), character({ 'b' }) }), + repeat(seq({ character({ 'a' }), character({ 'b' }) })), + }), + seq({ + seq({ character({ 'c' }), character({ 'd' }) }), + repeat(seq({ character({ 'c' }), character({ 'd' }) })), + }), + }) + }, + + { + "asterix repeats", + "(ab)*(cd)*", + seq({ + repeat(seq({ character({ 'a' }), character({ 'b' }) })), + repeat(seq({ character({ 'c' }), character({ 'd' }) })), + }) + }, + + { + "optional rules", + "a(bc)?", + seq({ + character({ 'a' }), + choice({ + seq({ character({ 'b' }), character({ 'c' }) }), + blank() + }) + }) + } + }; + + vector> invalid_inputs = { + { + "mismatched open parens", + "(a", + "unmatched open paren", + }, + { + "mismatched nested open parens", + "((a) (b)", + "unmatched open paren", + }, + { + "mismatched close parens", + "a)", + "unmatched close paren", + }, + { + "mismatched nested close parens", + "((a) b))", + "unmatched close paren", + }, + { + "mismatched brackets for character classes", + "[a", + "unmatched open square bracket", + }, + { + "mismatched brackets for character classes", + "a]", + "unmatched close square bracket", + }, + }; + + for (auto &triple : valid_inputs) { + string description = get<0>(triple); + string regex = get<1>(triple); + rule_ptr rule = get<2>(triple); + + it(("parses " + description).c_str(), [&]() { + auto result = parse_regex(regex); + AssertThat(result.first, EqualsPointer(rule)); + }); + } + + for (auto &triple : invalid_inputs) { + string description = get<0>(triple); + string regex = get<1>(triple); + const char *expected_message = get<2>(triple); + + it(("handles invalid regexes with " + description).c_str(), [&]() { + auto result = parse_regex(regex); + AssertThat(result.second, !Equals((const GrammarError *)nullptr)); + AssertThat(result.second->message, Contains(expected_message)); + }); + } +}); + +END_TEST \ No newline at end of file diff --git a/spec/compiler/rules/pattern_spec.cc b/spec/compiler/rules/pattern_spec.cc deleted file mode 100644 index 5947269b..00000000 --- a/spec/compiler/rules/pattern_spec.cc +++ /dev/null @@ -1,177 +0,0 @@ -#include "compiler_spec_helper.h" -#include "compiler/rules/pattern.h" -#include "compiler/rules/character_set.h" - -using namespace rules; - -START_TEST - -describe("parsing regex pattern rules", []() { - it("parses simple strings", [&]() { - Pattern rule("abc"); - AssertThat( - rule.to_rule_tree(), - EqualsPointer(seq({ - character({ 'a' }), - character({ 'b' }), - character({ 'c' }) - }))); - }); - - it("parses wildcard '.' characters", [&]() { - Pattern rule("."); - AssertThat( - rule.to_rule_tree(), - EqualsPointer(CharacterSet({'\n'}).complement().copy())); - }); - - it("parses character classes", []() { - Pattern rule("\\w-\\d"); - AssertThat( - rule.to_rule_tree(), - EqualsPointer(seq({ - character({ {'a', 'z'}, {'A', 'Z'}, {'0', '9'} }), - character({ '-' }), - character({ {'0', '9'} }) - }))); - }); - - it("parses choices", []() { - Pattern rule("ab|cd|ef"); - AssertThat( - rule.to_rule_tree(), - EqualsPointer(choice({ - seq({ - character({ 'a' }), - character({ 'b' }), - }), - seq({ - character({ 'c' }), - character({ 'd' }) - }), - seq({ - character({ 'e' }), - character({ 'f' }) - }) - }))); - }); - - it("parses character sets", []() { - Pattern rule("[aAeE]"); - AssertThat( - rule.to_rule_tree(), - EqualsPointer(character({ 'a', 'A', 'e', 'E' }))); - }); - - it("parses character ranges", []() { - Pattern rule("[12a-dA-D3]"); - AssertThat( - rule.to_rule_tree(), - EqualsPointer(character({ {'1', '3'}, {'a', 'd'}, { 'A', 'D' }, }))); - }); - - it("parses negated characters", []() { - Pattern rule("[^a\\d]"); - AssertThat( - rule.to_rule_tree(), - EqualsPointer(character({ {'a'}, {'0', '9'} }, false))); - }); - - it("parses backslashes", []() { - Pattern rule("\\\\"); - AssertThat( - rule.to_rule_tree(), - EqualsPointer(character({ '\\' }))); - }); - - it("parses character groups in sequences", []() { - Pattern rule("\"([^\"]|\\\\\")*\""); - AssertThat( - rule.to_rule_tree(), - EqualsPointer(seq({ - character({ '"' }), - repeat(choice({ - character({ '"' }, false), - seq({ character({ '\\' }), character({ '"' }) }) - })), - character({ '"' }) - }))); - }); - - it("parses choices in sequences", []() { - Pattern rule("(a|b)cd"); - AssertThat( - rule.to_rule_tree(), - EqualsPointer(seq({ - choice({ - character({ 'a' }), - character({ 'b' }), - }), - character({ 'c' }), - character({ 'd' }) - }))); - }); - - it("parses special characters when they are escaped", []() { - Pattern rule("a\\(b"); - AssertThat( - rule.to_rule_tree(), - EqualsPointer(seq({ - character({ 'a' }), - character({ '(' }), - character({ 'b' }) - }))); - - Pattern rule2("a\\."); - AssertThat( - rule2.to_rule_tree(), - EqualsPointer(seq({ - character({ 'a' }), - character({ '.' }), - }))); - - }); - - it("parses repeating rules", []() { - Pattern rule("(ab)+(cd)+"); - AssertThat( - rule.to_rule_tree(), - EqualsPointer( - seq({ - seq({ - seq({ character({ 'a' }), character({ 'b' }) }), - repeat(seq({ character({ 'a' }), character({ 'b' }) })), - }), - seq({ - seq({ character({ 'c' }), character({ 'd' }) }), - repeat(seq({ character({ 'c' }), character({ 'd' }) })), - }), - }) - )); - - Pattern rule2("(ab)*(cd)*"); - AssertThat( - rule2.to_rule_tree(), - EqualsPointer( - seq({ - repeat(seq({ character({ 'a' }), character({ 'b' }) })), - repeat(seq({ character({ 'c' }), character({ 'd' }) })), - }) - )); - }); - - it("parses optional rules", []() { - Pattern rule("a(bc)?"); - AssertThat( - rule.to_rule_tree(), - EqualsPointer(seq({ - character({ 'a' }), - choice({ - seq({ character({ 'b' }), character({ 'c' }) }), - blank() - }) - }))); - }); -}); - -END_TEST diff --git a/src/compiler/build_tables/rule_transitions.cc b/src/compiler/build_tables/rule_transitions.cc index d2dc999c..d29da169 100644 --- a/src/compiler/build_tables/rule_transitions.cc +++ b/src/compiler/build_tables/rule_transitions.cc @@ -94,20 +94,6 @@ namespace tree_sitter { }); return result; } - - map apply_to(const rules::String *rule) { - rule_ptr result = make_shared(); - for (char val : rule->value) - result = rules::Seq::Build({ - result, - CharacterSet({ val }).copy() - }); - return this->apply(result); - } - - map apply_to(const rules::Pattern *rule) { - return this->apply(rule->to_rule_tree()); - } }; map char_transitions(const rule_ptr &rule) { diff --git a/src/compiler/generate_code/c_code.cc b/src/compiler/generate_code/c_code.cc index 8a3d8649..1a76c2f8 100644 --- a/src/compiler/generate_code/c_code.cc +++ b/src/compiler/generate_code/c_code.cc @@ -113,8 +113,7 @@ namespace tree_sitter { } else if (symbol.is_token() && symbol.is_auxiliary()) { return token_description(grammar_for_symbol(symbol).rule(symbol)); } else { - string name = grammar_for_symbol(symbol).rule_name(symbol); - return name; + return grammar_for_symbol(symbol).rule_name(symbol); } } diff --git a/src/compiler/grammar.cc b/src/compiler/grammar.cc index a39eabd9..480ba258 100644 --- a/src/compiler/grammar.cc +++ b/src/compiler/grammar.cc @@ -48,6 +48,10 @@ namespace tree_sitter { GrammarError::GrammarError(GrammarErrorType type, std::string message) : type(type), message(message) {} + + bool GrammarError::operator==(const GrammarError &other) const { + return type == other.type && message == other.message; + } ostream& operator<<(ostream &stream, const GrammarError *error) { if (error) diff --git a/src/compiler/prepare_grammar/expand_tokens.cc b/src/compiler/prepare_grammar/expand_tokens.cc new file mode 100644 index 00000000..afc14ccc --- /dev/null +++ b/src/compiler/prepare_grammar/expand_tokens.cc @@ -0,0 +1,68 @@ +#include "compiler/prepare_grammar/expand_tokens.h" +#include +#include +#include +#include "compiler/prepared_grammar.h" +#include "compiler/rules/visitor.h" +#include "compiler/rules/pattern.h" +#include "compiler/rules/string.h" +#include "compiler/rules/blank.h" +#include "compiler/rules/seq.h" +#include "compiler/rules/character_set.h" +#include "compiler/prepare_grammar/parse_regex.h" + +namespace tree_sitter { + using std::string; + using std::vector; + using std::pair; + using std::make_shared; + using rules::rule_ptr; + using rules::String; + using rules::Pattern; + + namespace prepare_grammar { + class ExpandTokens : public rules::IdentityRuleFn { + using rules::IdentityRuleFn::apply_to; + + rule_ptr apply_to(const String *rule) { + vector elements; + for (char val : rule->value) + elements.push_back(rules::CharacterSet({ val }).copy()); + return rules::Seq::Build(elements); + } + + rule_ptr apply_to(const Pattern *rule) { + auto pair = parse_regex(rule->value); + if (!error) + error = pair.second; + return pair.first; + } + + public: + const GrammarError *error; + ExpandTokens() : error(nullptr) {} + }; + + pair + expand_tokens(const PreparedGrammar &grammar) { + vector> rules, aux_rules; + ExpandTokens expander; + + for (auto &pair : grammar.rules) { + auto rule = expander.apply(pair.second); + if (expander.error) + return { PreparedGrammar(), expander.error }; + rules.push_back({ pair.first, rule }); + } + + for (auto &pair : grammar.aux_rules) { + auto rule = expander.apply(pair.second); + if (expander.error) + return { PreparedGrammar(), expander.error }; + aux_rules.push_back({ pair.first, rule }); + } + + return { PreparedGrammar(rules, aux_rules, grammar.options), nullptr }; + } + } +} diff --git a/src/compiler/prepare_grammar/expand_tokens.h b/src/compiler/prepare_grammar/expand_tokens.h new file mode 100644 index 00000000..8658152a --- /dev/null +++ b/src/compiler/prepare_grammar/expand_tokens.h @@ -0,0 +1,16 @@ +#ifndef COMPILER_PREPARE_GRAMMAR_EXPAND_TOKENS_H_ +#define COMPILER_PREPARE_GRAMMAR_EXPAND_TOKENS_H_ + +#include "tree_sitter/compiler.h" + +namespace tree_sitter { + class PreparedGrammar; + + namespace prepare_grammar { + std::pair + expand_tokens(const PreparedGrammar &); + } +} + +#endif // COMPILER_PREPARE_GRAMMAR_EXPAND_TOKENS_H_ + diff --git a/src/compiler/prepare_grammar/parse_regex.cc b/src/compiler/prepare_grammar/parse_regex.cc new file mode 100644 index 00000000..e2767860 --- /dev/null +++ b/src/compiler/prepare_grammar/parse_regex.cc @@ -0,0 +1,210 @@ +#include "compiler/prepare_grammar/parse_regex.h" +#include +#include +#include "compiler/rules/choice.h" +#include "compiler/rules/seq.h" +#include "compiler/rules/repeat.h" +#include "compiler/rules/character_set.h" +#include "compiler/rules/blank.h" +#include "compiler/util/string_helpers.h" + +namespace tree_sitter { + using std::string; + using std::vector; + using std::pair; + using std::make_shared; + using rules::rule_ptr; + using rules::CharacterSet; + using rules::Seq; + using rules::Blank; + using rules::Choice; + using rules::Repeat; + using rules::CharacterRange; + using rules::blank; + + namespace prepare_grammar { + class PatternParser { + public: + explicit PatternParser(const string &input) : + input(input), + length(input.length()), + position(0) {} + + pair rule(bool nested) { + vector choices = {}; + do { + if (!choices.empty()) { + if (peek() == '|') + next(); + else + break; + } + auto pair = term(nested); + if (pair.second) + return { blank(), pair.second }; + choices.push_back(pair.first); + } while (has_more_input()); + auto rule = (choices.size() > 1) ? make_shared(choices) : choices.front(); + return { rule, nullptr }; + } + + private: + pair term(bool nested) { + rule_ptr result = blank(); + do { + if (peek() == '|') + break; + if (nested && peek() == ')') + break; + auto pair = factor(); + if (pair.second) + return { blank(), pair.second }; + result = Seq::Build({ result, pair.first }); + } while (has_more_input()); + return { result, nullptr }; + } + + pair factor() { + auto pair = atom(); + if (pair.second) + return { blank(), pair.second }; + rule_ptr result = pair.first; + if (has_more_input()) { + switch (peek()) { + case '*': + next(); + result = make_shared(result); + break; + case '+': + next(); + result = make_shared(result, make_shared(result)); + break; + case '?': + next(); + result = Choice::Build({ result, make_shared() }); + break; + } + } + return { result, nullptr }; + } + + pair atom() { + switch (peek()) { + case '(': { + next(); + auto pair = rule(true); + if (pair.second) + return { blank(), pair.second }; + if (peek() != ')') + return error("unmatched open paren"); + next(); + return { pair.first, nullptr }; + } + case '[': { + next(); + auto pair = char_set(); + if (pair.second) + return { blank(), pair.second }; + if (peek() != ']') + return error("unmatched open square bracket"); + next(); + return { pair.first.copy(), nullptr }; + } + case ')': { + return error("unmatched close paren"); + } + case ']': { + return error("unmatched close square bracket"); + } + case '.': { + next(); + return { CharacterSet({ '\n' }).complement().copy(), nullptr }; + } + default: { + auto pair = single_char(); + if (pair.second) + return { blank(), pair.second }; + return { pair.first.copy(), nullptr }; + } + } + } + + pair char_set() { + bool is_affirmative = true; + if (peek() == '^') { + next(); + is_affirmative = false; + } + CharacterSet result; + while (has_more_input() && (peek() != ']')) { + auto pair = single_char(); + if (pair.second) + return { CharacterSet(), pair.second }; + result.add_set(pair.first); + } + if (!is_affirmative) + result = result.complement(); + return { result, nullptr }; + } + + pair single_char() { + CharacterSet value; + switch (peek()) { + case '\\': + next(); + value = escaped_char(peek()); + next(); + break; + default: + char first_char = peek(); + next(); + if (peek() == '-') { + next(); + value = CharacterSet({ CharacterRange(first_char, peek()) }); + next(); + } else { + value = CharacterSet({ first_char }); + } + } + return { value, nullptr }; + } + + CharacterSet escaped_char(char value) { + switch (value) { + case 'a': + return CharacterSet({ {'a', 'z'}, {'A', 'Z'} }); + case 'w': + return CharacterSet({ {'a', 'z'}, {'A', 'Z'}, {'0', '9'}}); + case 'd': + return CharacterSet({ {'0', '9'} }); + default: + return CharacterSet({ value }); + } + } + + void next() { + position++; + } + + char peek() { + return input[position]; + } + + bool has_more_input() { + return position < length; + } + + pair error(string msg) { + return { blank(), new GrammarError(GrammarErrorTypeRegex, msg) }; + } + + const string input; + const size_t length; + size_t position; + }; + + pair parse_regex(const std::string &input) { + return PatternParser(input).rule(false); + } + } +} diff --git a/src/compiler/prepare_grammar/parse_regex.h b/src/compiler/prepare_grammar/parse_regex.h new file mode 100644 index 00000000..903edd83 --- /dev/null +++ b/src/compiler/prepare_grammar/parse_regex.h @@ -0,0 +1,16 @@ +#ifndef COMPILER_PREPARE_GRAMMAR_PARSE_REGEX_H_ +#define COMPILER_PREPARE_GRAMMAR_PARSE_REGEX_H_ + +#include "tree_sitter/compiler.h" +#include +#include + +namespace tree_sitter { + namespace prepare_grammar { + std::pair + parse_regex(const std::string &); + } +} + + +#endif // COMPILER_PREPARE_GRAMMAR_PARSE_REGEX_H_ \ No newline at end of file diff --git a/src/compiler/prepare_grammar/prepare_grammar.cc b/src/compiler/prepare_grammar/prepare_grammar.cc index c66af384..f41900b1 100644 --- a/src/compiler/prepare_grammar/prepare_grammar.cc +++ b/src/compiler/prepare_grammar/prepare_grammar.cc @@ -2,8 +2,11 @@ #include "compiler/prepared_grammar.h" #include "compiler/prepare_grammar/extract_tokens.h" #include "compiler/prepare_grammar/expand_repeats.h" +#include "compiler/prepare_grammar/expand_tokens.h" #include "compiler/prepare_grammar/intern_symbols.h" +#include "stream_methods.h" + namespace tree_sitter { using std::tuple; using std::make_tuple; @@ -16,12 +19,17 @@ namespace tree_sitter { const GrammarError *error = result.second; if (error) - return make_tuple(PreparedGrammar({}, {}), PreparedGrammar({}, {}), error); + return make_tuple(PreparedGrammar(), PreparedGrammar(), error); auto grammars = extract_tokens(grammar); const PreparedGrammar &rule_grammar = expand_repeats(grammars.first); - const PreparedGrammar &lex_grammar = grammars.second; - + auto expand_tokens_result = expand_tokens(grammars.second); + const PreparedGrammar &lex_grammar = expand_tokens_result.first; + error = expand_tokens_result.second; + + if (error) + return make_tuple(PreparedGrammar(), PreparedGrammar(), error); + return make_tuple(rule_grammar, lex_grammar, nullptr); } } diff --git a/src/compiler/prepared_grammar.cc b/src/compiler/prepared_grammar.cc index 1f2d36fc..8f68940a 100644 --- a/src/compiler/prepared_grammar.cc +++ b/src/compiler/prepared_grammar.cc @@ -10,6 +10,8 @@ namespace tree_sitter { using std::ostream; using rules::rule_ptr; using rules::Symbol; + + PreparedGrammar::PreparedGrammar() : Grammar({}), aux_rules({}), options({}) {} PreparedGrammar::PreparedGrammar(const std::vector> &rules, const std::vector> &aux_rules) : diff --git a/src/compiler/prepared_grammar.h b/src/compiler/prepared_grammar.h index 158835a2..7378f38b 100644 --- a/src/compiler/prepared_grammar.h +++ b/src/compiler/prepared_grammar.h @@ -14,6 +14,7 @@ namespace tree_sitter { class PreparedGrammar : public Grammar { public: + PreparedGrammar(); PreparedGrammar(const std::vector> &rules, const std::vector> &aux_rules); PreparedGrammar(const std::vector> &rules, diff --git a/src/compiler/rules/metadata.h b/src/compiler/rules/metadata.h index 97c7b761..c8fd7d59 100644 --- a/src/compiler/rules/metadata.h +++ b/src/compiler/rules/metadata.h @@ -11,6 +11,7 @@ namespace tree_sitter { START_TOKEN, PRECEDENCE, IS_TOKEN, + DESCRIPTION, } MetadataKey; class Metadata : public Rule { diff --git a/src/compiler/rules/pattern.cc b/src/compiler/rules/pattern.cc index 45c71996..f3d839c1 100644 --- a/src/compiler/rules/pattern.cc +++ b/src/compiler/rules/pattern.cc @@ -1,173 +1,12 @@ #include "compiler/rules/pattern.h" -#include #include -#include #include "compiler/rules/visitor.h" -#include "compiler/rules/choice.h" -#include "compiler/rules/seq.h" -#include "compiler/rules/repeat.h" -#include "compiler/rules/character_set.h" -#include "compiler/rules/blank.h" #include "compiler/util/string_helpers.h" namespace tree_sitter { namespace rules { using std::string; using std::hash; - using std::make_shared; - using std::set; - using std::vector; - - class PatternParser { - public: - explicit PatternParser(const string &input) : - input(input), - length(input.length()), - position(0) {} - - rule_ptr rule() { - vector choices = { term() }; - while (has_more_input() && peek() == '|') { - next(); - choices.push_back(term()); - } - return (choices.size() > 1) ? Choice::Build(choices) : choices.front(); - } - - private: - rule_ptr term() { - rule_ptr result = factor(); - while (has_more_input() && (peek() != '|') && (peek() != ')')) - result = Seq::Build({ result, factor() }); - return result; - } - - rule_ptr factor() { - rule_ptr result = atom(); - if (has_more_input()) { - switch (peek()) { - case '*': - next(); - result = make_shared(result); - break; - case '+': - next(); - result = make_shared(result, make_shared(result)); - break; - case '?': - next(); - result = Choice::Build({ result, make_shared() }); - break; - } - } - return result; - } - - rule_ptr atom() { - rule_ptr result; - switch (peek()) { - case '(': - next(); - result = rule(); - if (has_error()) return result; - if (peek() != ')') { - error = "mismatched parens"; - return result; - } - next(); - break; - case '[': - next(); - result = char_set().copy(); - if (has_error()) return result; - if (peek() != ']') { - error = "mismatched square brackets"; - return result; - } - next(); - break; - case ')': - error = "mismatched parens"; - break; - case '.': - result = CharacterSet({ '\n' }).complement().copy(); - next(); - break; - default: - result = single_char().copy(); - } - return result; - } - - CharacterSet char_set() { - bool is_affirmative = true; - if (peek() == '^') { - next(); - is_affirmative = false; - } - CharacterSet result; - while (has_more_input() && (peek() != ']')) - result.add_set(single_char()); - return is_affirmative ? result : result.complement(); - } - - CharacterSet single_char() { - CharacterSet value; - switch (peek()) { - case '\\': - next(); - value = escaped_char(peek()); - if (has_error()) return value; - next(); - break; - default: - char first_char = peek(); - next(); - if (peek() == '-') { - next(); - value = CharacterSet({ CharacterRange(first_char, peek()) }); - next(); - } else { - value = CharacterSet({ first_char }); - } - } - return value; - } - - CharacterSet escaped_char(char value) { - switch (value) { - case 'a': - return CharacterSet({ {'a', 'z'}, {'A', 'Z'} }); - case 'w': - return CharacterSet({ {'a', 'z'}, {'A', 'Z'}, {'0', '9'}}); - case 'd': - return CharacterSet({ {'0', '9'} }); - default: - return CharacterSet({ value }); - } - } - - void next() { - position++; - } - - char peek() { - return input[position]; - } - - bool has_more_input() { - return position < length; - } - - bool has_error() { - return error != ""; - } - - string error; - const string input; - const size_t length; - size_t position; - }; Pattern::Pattern(const string &string) : value(string) {} @@ -191,9 +30,5 @@ namespace tree_sitter { void Pattern::accept(Visitor *visitor) const { visitor->visit(this); } - - rule_ptr Pattern::to_rule_tree() const { - return PatternParser(value).rule(); - } } } diff --git a/src/compiler/rules/pattern.h b/src/compiler/rules/pattern.h index de452a76..962adc29 100644 --- a/src/compiler/rules/pattern.h +++ b/src/compiler/rules/pattern.h @@ -17,7 +17,6 @@ namespace tree_sitter { void accept(Visitor *visitor) const; const std::string value; - rule_ptr to_rule_tree() const; }; } }