From 60e2d00b4dc628885675bfce7ea2dcef6335d7f8 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 30 Jan 2014 13:04:31 -0800 Subject: [PATCH] Parse simple character sets in pattern rules --- spec/compiler/rules/pattern_spec.cpp | 35 ++++++ spec/fixtures/grammars/json.cpp | 2 +- spec/fixtures/parsers/arithmetic.c | 156 +++++++++++++-------------- spec/fixtures/parsers/json.c | 68 ++++++------ spec/runtime/json_spec.cpp | 2 +- src/compiler/rules/pattern.cpp | 56 +++++++--- 6 files changed, 192 insertions(+), 127 deletions(-) diff --git a/spec/compiler/rules/pattern_spec.cpp b/spec/compiler/rules/pattern_spec.cpp index dba04ece..c1641628 100644 --- a/spec/compiler/rules/pattern_spec.cpp +++ b/spec/compiler/rules/pattern_spec.cpp @@ -48,6 +48,41 @@ describe("parsing pattern rules", []() { }))); }); + it("parses character sets", []() { + Pattern rule("[abc]"); + AssertThat( + rule.to_rule_tree(), + EqualsPointer(character({ 'a', 'b', 'c' }, true))); + }); + + it("parses negated characters", []() { + Pattern rule("[^a\\d]"); + AssertThat( + rule.to_rule_tree(), + EqualsPointer(character({ 'a', CharClassDigit }, false))); + }); + + it("parses backslashes", []() { + Pattern rule("\\\\"); + AssertThat( + rule.to_rule_tree(), + EqualsPointer(character('\\'))); + }); + + it("parses character groups in sequences", []() { + Pattern rule("\"([^\"]|\\\\\")+\""); + AssertThat( + rule.to_rule_tree(), + EqualsPointer(seq({ + character('"'), + repeat(choice({ + character({ '"' }, false), + seq({ character('\\'), character('"') }) + })), + character('"') + }))); + }); + it("parses choices in sequences", []() { Pattern rule("(a|b)cd"); AssertThat( diff --git a/spec/fixtures/grammars/json.cpp b/spec/fixtures/grammars/json.cpp index e5c25e06..e2d6e9d1 100644 --- a/spec/fixtures/grammars/json.cpp +++ b/spec/fixtures/grammars/json.cpp @@ -35,7 +35,7 @@ namespace test_grammars { str("]"), }) }, { "string", seq({ str("\""), - pattern("\\w+"), + repeat(pattern("[^\"]")), str("\"") }) }, { "number", pattern("\\d+") } }); diff --git a/spec/fixtures/parsers/arithmetic.c b/spec/fixtures/parsers/arithmetic.c index ec8f267e..ef4723e0 100644 --- a/spec/fixtures/parsers/arithmetic.c +++ b/spec/fixtures/parsers/arithmetic.c @@ -3,27 +3,27 @@ enum ts_symbol { ts_symbol_factor, - ts_aux_token2, - ts_symbol_times, ts_aux_token1, - ts_symbol_variable, - ts_symbol_term, - ts_symbol_plus, - ts_symbol_expression, + ts_aux_token2, ts_symbol_number, + ts_symbol_variable, + ts_symbol_plus, + ts_symbol_times, + ts_symbol_term, + ts_symbol_expression, ts_symbol___END__, }; static const char *ts_symbol_names[] = { "factor", - "token2", - "times", "token1", - "variable", - "term", - "plus", - "expression", + "token2", "number", + "variable", + "plus", + "times", + "term", + "expression", "__END__", }; @@ -73,10 +73,10 @@ static void ts_lex(TSParser *parser) { ADVANCE(8); LEX_ERROR(2, EXPECT({"')'", "'+'"})); case 10: - if (LOOKAHEAD_CHAR() == '(') - ADVANCE(12); if (isalnum(LOOKAHEAD_CHAR())) ADVANCE(13); + if (LOOKAHEAD_CHAR() == '(') + ADVANCE(12); if (isdigit(LOOKAHEAD_CHAR())) ADVANCE(11); LEX_ERROR(3, EXPECT({"", "'('", ""})); @@ -118,18 +118,18 @@ static TSParseResult ts_parse(const char *input) { switch (LOOKAHEAD_SYM()) { case ts_symbol_factor: SHIFT(52); - case ts_symbol_variable: - SHIFT(47); case ts_aux_token1: SHIFT(49); case ts_symbol_number: SHIFT(48); + case ts_symbol_variable: + SHIFT(47); case ts_symbol_term: SHIFT(2); case ts_symbol_expression: SHIFT(1); default: - PARSE_ERROR(6, EXPECT({"expression", "variable", "token1", "term", "number", "factor"})); + PARSE_ERROR(6, EXPECT({"expression", "term", "variable", "number", "token1", "factor"})); } case 1: SET_LEX_STATE(0); @@ -152,10 +152,10 @@ static TSParseResult ts_parse(const char *input) { case 3: SET_LEX_STATE(10); switch (LOOKAHEAD_SYM()) { - case ts_aux_token1: - SHIFT(7); case ts_symbol_factor: SHIFT(39); + case ts_aux_token1: + SHIFT(7); case ts_symbol_number: SHIFT(6); case ts_symbol_variable: @@ -163,7 +163,7 @@ static TSParseResult ts_parse(const char *input) { case ts_symbol_term: SHIFT(4); default: - PARSE_ERROR(5, EXPECT({"term", "variable", "number", "factor", "token1"})); + PARSE_ERROR(5, EXPECT({"term", "variable", "number", "token1", "factor"})); } case 4: SET_LEX_STATE(0); @@ -198,18 +198,18 @@ static TSParseResult ts_parse(const char *input) { switch (LOOKAHEAD_SYM()) { case ts_symbol_factor: SHIFT(19); - case ts_symbol_variable: - SHIFT(14); + case ts_symbol_expression: + SHIFT(37); case ts_aux_token1: SHIFT(16); case ts_symbol_number: SHIFT(15); - case ts_symbol_expression: - SHIFT(37); + case ts_symbol_variable: + SHIFT(14); case ts_symbol_term: SHIFT(8); default: - PARSE_ERROR(6, EXPECT({"term", "variable", "token1", "expression", "number", "factor"})); + PARSE_ERROR(6, EXPECT({"term", "variable", "number", "expression", "token1", "factor"})); } case 8: SET_LEX_STATE(9); @@ -224,10 +224,10 @@ static TSParseResult ts_parse(const char *input) { case 9: SET_LEX_STATE(10); switch (LOOKAHEAD_SYM()) { - case ts_aux_token1: - SHIFT(13); case ts_symbol_factor: SHIFT(29); + case ts_aux_token1: + SHIFT(13); case ts_symbol_number: SHIFT(12); case ts_symbol_variable: @@ -235,7 +235,7 @@ static TSParseResult ts_parse(const char *input) { case ts_symbol_term: SHIFT(10); default: - PARSE_ERROR(5, EXPECT({"term", "variable", "number", "factor", "token1"})); + PARSE_ERROR(5, EXPECT({"term", "variable", "number", "token1", "factor"})); } case 10: SET_LEX_STATE(4); @@ -270,18 +270,18 @@ static TSParseResult ts_parse(const char *input) { switch (LOOKAHEAD_SYM()) { case ts_symbol_factor: SHIFT(19); - case ts_symbol_variable: - SHIFT(14); + case ts_symbol_expression: + SHIFT(27); case ts_aux_token1: SHIFT(16); case ts_symbol_number: SHIFT(15); - case ts_symbol_expression: - SHIFT(27); + case ts_symbol_variable: + SHIFT(14); case ts_symbol_term: SHIFT(8); default: - PARSE_ERROR(6, EXPECT({"term", "variable", "token1", "expression", "number", "factor"})); + PARSE_ERROR(6, EXPECT({"term", "variable", "number", "expression", "token1", "factor"})); } case 14: SET_LEX_STATE(7); @@ -312,18 +312,18 @@ static TSParseResult ts_parse(const char *input) { switch (LOOKAHEAD_SYM()) { case ts_symbol_factor: SHIFT(19); - case ts_symbol_variable: - SHIFT(14); + case ts_symbol_expression: + SHIFT(17); case ts_aux_token1: SHIFT(16); case ts_symbol_number: SHIFT(15); - case ts_symbol_expression: - SHIFT(17); + case ts_symbol_variable: + SHIFT(14); case ts_symbol_term: SHIFT(8); default: - PARSE_ERROR(6, EXPECT({"term", "variable", "token1", "expression", "number", "factor"})); + PARSE_ERROR(6, EXPECT({"term", "variable", "number", "expression", "token1", "factor"})); } case 17: SET_LEX_STATE(4); @@ -348,28 +348,28 @@ static TSParseResult ts_parse(const char *input) { case 19: SET_LEX_STATE(7); switch (LOOKAHEAD_SYM()) { - case ts_symbol_plus: - REDUCE(ts_symbol_term, 1, COLLAPSE({0})); case ts_aux_token2: REDUCE(ts_symbol_term, 1, COLLAPSE({0})); + case ts_symbol_plus: + REDUCE(ts_symbol_term, 1, COLLAPSE({0})); case ts_symbol_times: SHIFT(20); default: - PARSE_ERROR(3, EXPECT({"times", "token2", "plus"})); + PARSE_ERROR(3, EXPECT({"times", "plus", "token2"})); } case 20: SET_LEX_STATE(10); switch (LOOKAHEAD_SYM()) { - case ts_aux_token1: - SHIFT(23); case ts_symbol_factor: SHIFT(26); + case ts_aux_token1: + SHIFT(23); case ts_symbol_number: SHIFT(22); case ts_symbol_variable: SHIFT(21); default: - PARSE_ERROR(4, EXPECT({"variable", "number", "factor", "token1"})); + PARSE_ERROR(4, EXPECT({"variable", "number", "token1", "factor"})); } case 21: SET_LEX_STATE(9); @@ -396,18 +396,18 @@ static TSParseResult ts_parse(const char *input) { switch (LOOKAHEAD_SYM()) { case ts_symbol_factor: SHIFT(19); - case ts_symbol_variable: - SHIFT(14); + case ts_symbol_expression: + SHIFT(24); case ts_aux_token1: SHIFT(16); case ts_symbol_number: SHIFT(15); - case ts_symbol_expression: - SHIFT(24); + case ts_symbol_variable: + SHIFT(14); case ts_symbol_term: SHIFT(8); default: - PARSE_ERROR(6, EXPECT({"term", "variable", "token1", "expression", "number", "factor"})); + PARSE_ERROR(6, EXPECT({"term", "variable", "number", "expression", "token1", "factor"})); } case 24: SET_LEX_STATE(4); @@ -468,16 +468,16 @@ static TSParseResult ts_parse(const char *input) { case 30: SET_LEX_STATE(10); switch (LOOKAHEAD_SYM()) { - case ts_aux_token1: - SHIFT(33); case ts_symbol_factor: SHIFT(36); + case ts_aux_token1: + SHIFT(33); case ts_symbol_number: SHIFT(32); case ts_symbol_variable: SHIFT(31); default: - PARSE_ERROR(4, EXPECT({"variable", "number", "factor", "token1"})); + PARSE_ERROR(4, EXPECT({"variable", "number", "token1", "factor"})); } case 31: SET_LEX_STATE(4); @@ -500,18 +500,18 @@ static TSParseResult ts_parse(const char *input) { switch (LOOKAHEAD_SYM()) { case ts_symbol_factor: SHIFT(19); - case ts_symbol_variable: - SHIFT(14); + case ts_symbol_expression: + SHIFT(34); case ts_aux_token1: SHIFT(16); case ts_symbol_number: SHIFT(15); - case ts_symbol_expression: - SHIFT(34); + case ts_symbol_variable: + SHIFT(14); case ts_symbol_term: SHIFT(8); default: - PARSE_ERROR(6, EXPECT({"term", "variable", "token1", "expression", "number", "factor"})); + PARSE_ERROR(6, EXPECT({"term", "variable", "number", "expression", "token1", "factor"})); } case 34: SET_LEX_STATE(4); @@ -568,16 +568,16 @@ static TSParseResult ts_parse(const char *input) { case 40: SET_LEX_STATE(10); switch (LOOKAHEAD_SYM()) { - case ts_aux_token1: - SHIFT(43); case ts_symbol_factor: SHIFT(46); + case ts_aux_token1: + SHIFT(43); case ts_symbol_number: SHIFT(42); case ts_symbol_variable: SHIFT(41); default: - PARSE_ERROR(4, EXPECT({"variable", "number", "factor", "token1"})); + PARSE_ERROR(4, EXPECT({"variable", "number", "token1", "factor"})); } case 41: SET_LEX_STATE(0); @@ -600,18 +600,18 @@ static TSParseResult ts_parse(const char *input) { switch (LOOKAHEAD_SYM()) { case ts_symbol_factor: SHIFT(19); - case ts_symbol_variable: - SHIFT(14); + case ts_symbol_expression: + SHIFT(44); case ts_aux_token1: SHIFT(16); case ts_symbol_number: SHIFT(15); - case ts_symbol_expression: - SHIFT(44); + case ts_symbol_variable: + SHIFT(14); case ts_symbol_term: SHIFT(8); default: - PARSE_ERROR(6, EXPECT({"term", "variable", "token1", "expression", "number", "factor"})); + PARSE_ERROR(6, EXPECT({"term", "variable", "number", "expression", "token1", "factor"})); } case 44: SET_LEX_STATE(4); @@ -666,18 +666,18 @@ static TSParseResult ts_parse(const char *input) { switch (LOOKAHEAD_SYM()) { case ts_symbol_factor: SHIFT(19); - case ts_symbol_variable: - SHIFT(14); + case ts_symbol_expression: + SHIFT(50); case ts_aux_token1: SHIFT(16); case ts_symbol_number: SHIFT(15); - case ts_symbol_expression: - SHIFT(50); + case ts_symbol_variable: + SHIFT(14); case ts_symbol_term: SHIFT(8); default: - PARSE_ERROR(6, EXPECT({"term", "variable", "token1", "expression", "number", "factor"})); + PARSE_ERROR(6, EXPECT({"term", "variable", "number", "expression", "token1", "factor"})); } case 50: SET_LEX_STATE(4); @@ -714,16 +714,16 @@ static TSParseResult ts_parse(const char *input) { case 53: SET_LEX_STATE(10); switch (LOOKAHEAD_SYM()) { - case ts_aux_token1: - SHIFT(56); case ts_symbol_factor: SHIFT(59); + case ts_aux_token1: + SHIFT(56); case ts_symbol_number: SHIFT(55); case ts_symbol_variable: SHIFT(54); default: - PARSE_ERROR(4, EXPECT({"variable", "number", "factor", "token1"})); + PARSE_ERROR(4, EXPECT({"variable", "number", "token1", "factor"})); } case 54: SET_LEX_STATE(14); @@ -750,18 +750,18 @@ static TSParseResult ts_parse(const char *input) { switch (LOOKAHEAD_SYM()) { case ts_symbol_factor: SHIFT(19); - case ts_symbol_variable: - SHIFT(14); + case ts_symbol_expression: + SHIFT(57); case ts_aux_token1: SHIFT(16); case ts_symbol_number: SHIFT(15); - case ts_symbol_expression: - SHIFT(57); + case ts_symbol_variable: + SHIFT(14); case ts_symbol_term: SHIFT(8); default: - PARSE_ERROR(6, EXPECT({"term", "variable", "token1", "expression", "number", "factor"})); + PARSE_ERROR(6, EXPECT({"term", "variable", "number", "expression", "token1", "factor"})); } case 57: SET_LEX_STATE(4); diff --git a/spec/fixtures/parsers/json.c b/spec/fixtures/parsers/json.c index f22b7830..ce14f66a 100644 --- a/spec/fixtures/parsers/json.c +++ b/spec/fixtures/parsers/json.c @@ -2,39 +2,39 @@ #include enum ts_symbol { - ts_symbol_array, - ts_aux_token6, - ts_aux_repeat_helper2, - ts_aux_token5, ts_symbol_string, - ts_symbol_value, - ts_symbol_object, - ts_aux_token4, - ts_aux_token7, - ts_symbol_number, - ts_aux_token2, - ts_aux_token3, - ts_aux_token1, ts_aux_repeat_helper1, + ts_aux_token5, + ts_aux_repeat_helper2, + ts_symbol_object, + ts_aux_token6, + ts_aux_token7, + ts_aux_token4, + ts_aux_token1, + ts_symbol_array, ts_symbol___END__, + ts_symbol_value, + ts_symbol_number, + ts_aux_token3, + ts_aux_token2, }; static const char *ts_symbol_names[] = { - "array", - "token6", - "repeat_helper2", - "token5", "string", - "value", - "object", - "token4", - "token7", - "number", - "token2", - "token3", - "token1", "repeat_helper1", + "token5", + "repeat_helper2", + "object", + "token6", + "token7", + "token4", + "token1", + "array", "__END__", + "value", + "number", + "token3", + "token2", }; static void ts_lex(TSParser *parser) { @@ -77,29 +77,29 @@ static void ts_lex(TSParser *parser) { ADVANCE(3); LEX_ERROR(2, EXPECT({"'}'", "','"})); case 10: + if (LOOKAHEAD_CHAR() == '{') + ADVANCE(16); if (LOOKAHEAD_CHAR() == '[') ADVANCE(15); if (LOOKAHEAD_CHAR() == '\"') ADVANCE(12); - if (LOOKAHEAD_CHAR() == '{') - ADVANCE(16); if (isdigit(LOOKAHEAD_CHAR())) ADVANCE(11); - LEX_ERROR(4, EXPECT({"'['", "'\"'", "'{'", ""})); + LEX_ERROR(4, EXPECT({"'{'", "'['", "'\"'", ""})); case 11: if (isdigit(LOOKAHEAD_CHAR())) ADVANCE(11); ACCEPT_TOKEN(ts_symbol_number); case 12: - if (isalnum(LOOKAHEAD_CHAR())) + if (!(LOOKAHEAD_CHAR() == '\"')) ADVANCE(13); - LEX_ERROR(1, EXPECT({""})); + LEX_ERROR(1, EXPECT({"'\"'"})); case 13: if (LOOKAHEAD_CHAR() == '\"') ADVANCE(14); - if (isalnum(LOOKAHEAD_CHAR())) + if (!(LOOKAHEAD_CHAR() == '\"')) ADVANCE(13); - LEX_ERROR(2, EXPECT({"'\"'", ""})); + LEX_ERROR(1, EXPECT({"'\"'"})); case 14: ACCEPT_TOKEN(ts_symbol_string); case 15: @@ -788,14 +788,14 @@ static TSParseResult ts_parse(const char *input) { case 59: SET_LEX_STATE(6); switch (LOOKAHEAD_SYM()) { - case ts_aux_token2: - SHIFT(9); case ts_aux_token4: REDUCE(ts_aux_repeat_helper2, 2, COLLAPSE({1, 0})); + case ts_aux_token2: + SHIFT(9); case ts_aux_repeat_helper2: SHIFT(60); default: - PARSE_ERROR(3, EXPECT({"repeat_helper2", "token4", "token2"})); + PARSE_ERROR(3, EXPECT({"repeat_helper2", "token2", "token4"})); } case 60: SET_LEX_STATE(4); diff --git a/spec/runtime/json_spec.cpp b/spec/runtime/json_spec.cpp index 92628713..a415784d 100644 --- a/spec/runtime/json_spec.cpp +++ b/spec/runtime/json_spec.cpp @@ -14,7 +14,7 @@ describe("json", []() { }); it("parses strings", [&]() { - TSDocumentSetText(document, "\"string\""); + TSDocumentSetText(document, "\"this is a string\""); AssertThat(string(TSDocumentToString(document)), Equals("(value (string))")); }); diff --git a/src/compiler/rules/pattern.cpp b/src/compiler/rules/pattern.cpp index e1efa287..be2d3bba 100644 --- a/src/compiler/rules/pattern.cpp +++ b/src/compiler/rules/pattern.cpp @@ -38,6 +38,18 @@ namespace tree_sitter { return result; } + rule_ptr char_set() { + bool is_affirmative = true; + if (peek() == '^') { + next(); + is_affirmative = false; + } + std::vector matches; + while (has_more_input() && (peek() != ']')) + matches.push_back(single_char()); + return character(matches, is_affirmative); + } + rule_ptr atom() { rule_ptr result; switch (peek()) { @@ -49,34 +61,52 @@ namespace tree_sitter { else next(); break; + case '[': + next(); + result = char_set(); + if (peek() != ']') + error("mismatched square brackets"); + else + next(); + break; case ')': error("mismatched parens"); break; - case '\\': - next(); - result = escaped_char(peek()); - next(); - break; default: - result = character(peek()); - next(); - break; + result = character({ single_char() }, true); } return result; } - rule_ptr escaped_char(char value) { + CharacterMatch single_char() { + CharacterMatch value('\0'); + switch (peek()) { + case '\\': + next(); + value = escaped_char(peek()); + next(); + break; + default: + value = peek(); + next(); + return value; + } + return value; + } + + CharacterMatch escaped_char(char value) { switch (value) { + case '\\': case '(': case ')': - return character(value); + return value; case 'w': - return character(CharClassWord); + return CharClassWord; case 'd': - return character(CharClassDigit); + return CharClassDigit; default: error("unrecognized escape sequence"); - return rule_ptr(); + return '\0'; } }