Parse simple character sets in pattern rules

This commit is contained in:
Max Brunsfeld 2014-01-30 13:04:31 -08:00
parent 28e10dc722
commit 60e2d00b4d
6 changed files with 192 additions and 127 deletions

View file

@ -48,6 +48,41 @@ describe("parsing pattern rules", []() {
})));
});
// A bracketed set such as "[abc]" is expected to parse into a single
// character rule listing every member, with the second argument `true`
// (an affirmative, i.e. non-negated, set).
it("parses character sets", []() {
Pattern rule("[abc]");
AssertThat(
rule.to_rule_tree(),
EqualsPointer(character({ 'a', 'b', 'c' }, true)));
});
// A leading '^' inside the brackets negates the set: the expected rule carries
// `false` as its second argument. The escape `\d` inside the set is expected
// to appear as the CharClassDigit member rather than a literal 'd'.
it("parses negated characters", []() {
Pattern rule("[^a\\d]");
AssertThat(
rule.to_rule_tree(),
EqualsPointer(character({ 'a', CharClassDigit }, false)));
});
// The pattern text here is two backslash characters (`\\` once C++ string
// escaping is removed); it is expected to parse into one literal-backslash
// character rule.
it("parses backslashes", []() {
Pattern rule("\\\\");
AssertThat(
rule.to_rule_tree(),
EqualsPointer(character('\\')));
});
// After C++ string escaping is removed the pattern text is:
//     "([^"]|\\")+"
// i.e. a quote, one or more of (any non-quote character | backslash followed
// by a quote), then a closing quote. The expected tree mirrors that shape:
// seq(quote, repeat(choice(negated set, seq(backslash, quote))), quote).
it("parses character groups in sequences", []() {
Pattern rule("\"([^\"]|\\\\\")+\"");
AssertThat(
rule.to_rule_tree(),
EqualsPointer(seq({
character('"'),
repeat(choice({
character({ '"' }, false),
seq({ character('\\'), character('"') })
})),
character('"')
})));
});
it("parses choices in sequences", []() {
Pattern rule("(a|b)cd");
AssertThat(

View file

@ -35,7 +35,7 @@ namespace test_grammars {
str("]"), }) },
{ "string", seq({
str("\""),
pattern("\\w+"),
repeat(pattern("[^\"]")),
str("\"") }) },
{ "number", pattern("\\d+") }
});

View file

@ -3,27 +3,27 @@
enum ts_symbol {
ts_symbol_factor,
ts_aux_token2,
ts_symbol_times,
ts_aux_token1,
ts_symbol_variable,
ts_symbol_term,
ts_symbol_plus,
ts_symbol_expression,
ts_aux_token2,
ts_symbol_number,
ts_symbol_variable,
ts_symbol_plus,
ts_symbol_times,
ts_symbol_term,
ts_symbol_expression,
ts_symbol___END__,
};
static const char *ts_symbol_names[] = {
"factor",
"token2",
"times",
"token1",
"variable",
"term",
"plus",
"expression",
"token2",
"number",
"variable",
"plus",
"times",
"term",
"expression",
"__END__",
};
@ -73,10 +73,10 @@ static void ts_lex(TSParser *parser) {
ADVANCE(8);
LEX_ERROR(2, EXPECT({"')'", "'+'"}));
case 10:
if (LOOKAHEAD_CHAR() == '(')
ADVANCE(12);
if (isalnum(LOOKAHEAD_CHAR()))
ADVANCE(13);
if (LOOKAHEAD_CHAR() == '(')
ADVANCE(12);
if (isdigit(LOOKAHEAD_CHAR()))
ADVANCE(11);
LEX_ERROR(3, EXPECT({"<word>", "'('", "<digit>"}));
@ -118,18 +118,18 @@ static TSParseResult ts_parse(const char *input) {
switch (LOOKAHEAD_SYM()) {
case ts_symbol_factor:
SHIFT(52);
case ts_symbol_variable:
SHIFT(47);
case ts_aux_token1:
SHIFT(49);
case ts_symbol_number:
SHIFT(48);
case ts_symbol_variable:
SHIFT(47);
case ts_symbol_term:
SHIFT(2);
case ts_symbol_expression:
SHIFT(1);
default:
PARSE_ERROR(6, EXPECT({"expression", "variable", "token1", "term", "number", "factor"}));
PARSE_ERROR(6, EXPECT({"expression", "term", "variable", "number", "token1", "factor"}));
}
case 1:
SET_LEX_STATE(0);
@ -152,10 +152,10 @@ static TSParseResult ts_parse(const char *input) {
case 3:
SET_LEX_STATE(10);
switch (LOOKAHEAD_SYM()) {
case ts_aux_token1:
SHIFT(7);
case ts_symbol_factor:
SHIFT(39);
case ts_aux_token1:
SHIFT(7);
case ts_symbol_number:
SHIFT(6);
case ts_symbol_variable:
@ -163,7 +163,7 @@ static TSParseResult ts_parse(const char *input) {
case ts_symbol_term:
SHIFT(4);
default:
PARSE_ERROR(5, EXPECT({"term", "variable", "number", "factor", "token1"}));
PARSE_ERROR(5, EXPECT({"term", "variable", "number", "token1", "factor"}));
}
case 4:
SET_LEX_STATE(0);
@ -198,18 +198,18 @@ static TSParseResult ts_parse(const char *input) {
switch (LOOKAHEAD_SYM()) {
case ts_symbol_factor:
SHIFT(19);
case ts_symbol_variable:
SHIFT(14);
case ts_symbol_expression:
SHIFT(37);
case ts_aux_token1:
SHIFT(16);
case ts_symbol_number:
SHIFT(15);
case ts_symbol_expression:
SHIFT(37);
case ts_symbol_variable:
SHIFT(14);
case ts_symbol_term:
SHIFT(8);
default:
PARSE_ERROR(6, EXPECT({"term", "variable", "token1", "expression", "number", "factor"}));
PARSE_ERROR(6, EXPECT({"term", "variable", "number", "expression", "token1", "factor"}));
}
case 8:
SET_LEX_STATE(9);
@ -224,10 +224,10 @@ static TSParseResult ts_parse(const char *input) {
case 9:
SET_LEX_STATE(10);
switch (LOOKAHEAD_SYM()) {
case ts_aux_token1:
SHIFT(13);
case ts_symbol_factor:
SHIFT(29);
case ts_aux_token1:
SHIFT(13);
case ts_symbol_number:
SHIFT(12);
case ts_symbol_variable:
@ -235,7 +235,7 @@ static TSParseResult ts_parse(const char *input) {
case ts_symbol_term:
SHIFT(10);
default:
PARSE_ERROR(5, EXPECT({"term", "variable", "number", "factor", "token1"}));
PARSE_ERROR(5, EXPECT({"term", "variable", "number", "token1", "factor"}));
}
case 10:
SET_LEX_STATE(4);
@ -270,18 +270,18 @@ static TSParseResult ts_parse(const char *input) {
switch (LOOKAHEAD_SYM()) {
case ts_symbol_factor:
SHIFT(19);
case ts_symbol_variable:
SHIFT(14);
case ts_symbol_expression:
SHIFT(27);
case ts_aux_token1:
SHIFT(16);
case ts_symbol_number:
SHIFT(15);
case ts_symbol_expression:
SHIFT(27);
case ts_symbol_variable:
SHIFT(14);
case ts_symbol_term:
SHIFT(8);
default:
PARSE_ERROR(6, EXPECT({"term", "variable", "token1", "expression", "number", "factor"}));
PARSE_ERROR(6, EXPECT({"term", "variable", "number", "expression", "token1", "factor"}));
}
case 14:
SET_LEX_STATE(7);
@ -312,18 +312,18 @@ static TSParseResult ts_parse(const char *input) {
switch (LOOKAHEAD_SYM()) {
case ts_symbol_factor:
SHIFT(19);
case ts_symbol_variable:
SHIFT(14);
case ts_symbol_expression:
SHIFT(17);
case ts_aux_token1:
SHIFT(16);
case ts_symbol_number:
SHIFT(15);
case ts_symbol_expression:
SHIFT(17);
case ts_symbol_variable:
SHIFT(14);
case ts_symbol_term:
SHIFT(8);
default:
PARSE_ERROR(6, EXPECT({"term", "variable", "token1", "expression", "number", "factor"}));
PARSE_ERROR(6, EXPECT({"term", "variable", "number", "expression", "token1", "factor"}));
}
case 17:
SET_LEX_STATE(4);
@ -348,28 +348,28 @@ static TSParseResult ts_parse(const char *input) {
case 19:
SET_LEX_STATE(7);
switch (LOOKAHEAD_SYM()) {
case ts_symbol_plus:
REDUCE(ts_symbol_term, 1, COLLAPSE({0}));
case ts_aux_token2:
REDUCE(ts_symbol_term, 1, COLLAPSE({0}));
case ts_symbol_plus:
REDUCE(ts_symbol_term, 1, COLLAPSE({0}));
case ts_symbol_times:
SHIFT(20);
default:
PARSE_ERROR(3, EXPECT({"times", "token2", "plus"}));
PARSE_ERROR(3, EXPECT({"times", "plus", "token2"}));
}
case 20:
SET_LEX_STATE(10);
switch (LOOKAHEAD_SYM()) {
case ts_aux_token1:
SHIFT(23);
case ts_symbol_factor:
SHIFT(26);
case ts_aux_token1:
SHIFT(23);
case ts_symbol_number:
SHIFT(22);
case ts_symbol_variable:
SHIFT(21);
default:
PARSE_ERROR(4, EXPECT({"variable", "number", "factor", "token1"}));
PARSE_ERROR(4, EXPECT({"variable", "number", "token1", "factor"}));
}
case 21:
SET_LEX_STATE(9);
@ -396,18 +396,18 @@ static TSParseResult ts_parse(const char *input) {
switch (LOOKAHEAD_SYM()) {
case ts_symbol_factor:
SHIFT(19);
case ts_symbol_variable:
SHIFT(14);
case ts_symbol_expression:
SHIFT(24);
case ts_aux_token1:
SHIFT(16);
case ts_symbol_number:
SHIFT(15);
case ts_symbol_expression:
SHIFT(24);
case ts_symbol_variable:
SHIFT(14);
case ts_symbol_term:
SHIFT(8);
default:
PARSE_ERROR(6, EXPECT({"term", "variable", "token1", "expression", "number", "factor"}));
PARSE_ERROR(6, EXPECT({"term", "variable", "number", "expression", "token1", "factor"}));
}
case 24:
SET_LEX_STATE(4);
@ -468,16 +468,16 @@ static TSParseResult ts_parse(const char *input) {
case 30:
SET_LEX_STATE(10);
switch (LOOKAHEAD_SYM()) {
case ts_aux_token1:
SHIFT(33);
case ts_symbol_factor:
SHIFT(36);
case ts_aux_token1:
SHIFT(33);
case ts_symbol_number:
SHIFT(32);
case ts_symbol_variable:
SHIFT(31);
default:
PARSE_ERROR(4, EXPECT({"variable", "number", "factor", "token1"}));
PARSE_ERROR(4, EXPECT({"variable", "number", "token1", "factor"}));
}
case 31:
SET_LEX_STATE(4);
@ -500,18 +500,18 @@ static TSParseResult ts_parse(const char *input) {
switch (LOOKAHEAD_SYM()) {
case ts_symbol_factor:
SHIFT(19);
case ts_symbol_variable:
SHIFT(14);
case ts_symbol_expression:
SHIFT(34);
case ts_aux_token1:
SHIFT(16);
case ts_symbol_number:
SHIFT(15);
case ts_symbol_expression:
SHIFT(34);
case ts_symbol_variable:
SHIFT(14);
case ts_symbol_term:
SHIFT(8);
default:
PARSE_ERROR(6, EXPECT({"term", "variable", "token1", "expression", "number", "factor"}));
PARSE_ERROR(6, EXPECT({"term", "variable", "number", "expression", "token1", "factor"}));
}
case 34:
SET_LEX_STATE(4);
@ -568,16 +568,16 @@ static TSParseResult ts_parse(const char *input) {
case 40:
SET_LEX_STATE(10);
switch (LOOKAHEAD_SYM()) {
case ts_aux_token1:
SHIFT(43);
case ts_symbol_factor:
SHIFT(46);
case ts_aux_token1:
SHIFT(43);
case ts_symbol_number:
SHIFT(42);
case ts_symbol_variable:
SHIFT(41);
default:
PARSE_ERROR(4, EXPECT({"variable", "number", "factor", "token1"}));
PARSE_ERROR(4, EXPECT({"variable", "number", "token1", "factor"}));
}
case 41:
SET_LEX_STATE(0);
@ -600,18 +600,18 @@ static TSParseResult ts_parse(const char *input) {
switch (LOOKAHEAD_SYM()) {
case ts_symbol_factor:
SHIFT(19);
case ts_symbol_variable:
SHIFT(14);
case ts_symbol_expression:
SHIFT(44);
case ts_aux_token1:
SHIFT(16);
case ts_symbol_number:
SHIFT(15);
case ts_symbol_expression:
SHIFT(44);
case ts_symbol_variable:
SHIFT(14);
case ts_symbol_term:
SHIFT(8);
default:
PARSE_ERROR(6, EXPECT({"term", "variable", "token1", "expression", "number", "factor"}));
PARSE_ERROR(6, EXPECT({"term", "variable", "number", "expression", "token1", "factor"}));
}
case 44:
SET_LEX_STATE(4);
@ -666,18 +666,18 @@ static TSParseResult ts_parse(const char *input) {
switch (LOOKAHEAD_SYM()) {
case ts_symbol_factor:
SHIFT(19);
case ts_symbol_variable:
SHIFT(14);
case ts_symbol_expression:
SHIFT(50);
case ts_aux_token1:
SHIFT(16);
case ts_symbol_number:
SHIFT(15);
case ts_symbol_expression:
SHIFT(50);
case ts_symbol_variable:
SHIFT(14);
case ts_symbol_term:
SHIFT(8);
default:
PARSE_ERROR(6, EXPECT({"term", "variable", "token1", "expression", "number", "factor"}));
PARSE_ERROR(6, EXPECT({"term", "variable", "number", "expression", "token1", "factor"}));
}
case 50:
SET_LEX_STATE(4);
@ -714,16 +714,16 @@ static TSParseResult ts_parse(const char *input) {
case 53:
SET_LEX_STATE(10);
switch (LOOKAHEAD_SYM()) {
case ts_aux_token1:
SHIFT(56);
case ts_symbol_factor:
SHIFT(59);
case ts_aux_token1:
SHIFT(56);
case ts_symbol_number:
SHIFT(55);
case ts_symbol_variable:
SHIFT(54);
default:
PARSE_ERROR(4, EXPECT({"variable", "number", "factor", "token1"}));
PARSE_ERROR(4, EXPECT({"variable", "number", "token1", "factor"}));
}
case 54:
SET_LEX_STATE(14);
@ -750,18 +750,18 @@ static TSParseResult ts_parse(const char *input) {
switch (LOOKAHEAD_SYM()) {
case ts_symbol_factor:
SHIFT(19);
case ts_symbol_variable:
SHIFT(14);
case ts_symbol_expression:
SHIFT(57);
case ts_aux_token1:
SHIFT(16);
case ts_symbol_number:
SHIFT(15);
case ts_symbol_expression:
SHIFT(57);
case ts_symbol_variable:
SHIFT(14);
case ts_symbol_term:
SHIFT(8);
default:
PARSE_ERROR(6, EXPECT({"term", "variable", "token1", "expression", "number", "factor"}));
PARSE_ERROR(6, EXPECT({"term", "variable", "number", "expression", "token1", "factor"}));
}
case 57:
SET_LEX_STATE(4);

View file

@ -2,39 +2,39 @@
#include <ctype.h>
enum ts_symbol {
ts_symbol_array,
ts_aux_token6,
ts_aux_repeat_helper2,
ts_aux_token5,
ts_symbol_string,
ts_symbol_value,
ts_symbol_object,
ts_aux_token4,
ts_aux_token7,
ts_symbol_number,
ts_aux_token2,
ts_aux_token3,
ts_aux_token1,
ts_aux_repeat_helper1,
ts_aux_token5,
ts_aux_repeat_helper2,
ts_symbol_object,
ts_aux_token6,
ts_aux_token7,
ts_aux_token4,
ts_aux_token1,
ts_symbol_array,
ts_symbol___END__,
ts_symbol_value,
ts_symbol_number,
ts_aux_token3,
ts_aux_token2,
};
static const char *ts_symbol_names[] = {
"array",
"token6",
"repeat_helper2",
"token5",
"string",
"value",
"object",
"token4",
"token7",
"number",
"token2",
"token3",
"token1",
"repeat_helper1",
"token5",
"repeat_helper2",
"object",
"token6",
"token7",
"token4",
"token1",
"array",
"__END__",
"value",
"number",
"token3",
"token2",
};
static void ts_lex(TSParser *parser) {
@ -77,29 +77,29 @@ static void ts_lex(TSParser *parser) {
ADVANCE(3);
LEX_ERROR(2, EXPECT({"'}'", "','"}));
case 10:
if (LOOKAHEAD_CHAR() == '{')
ADVANCE(16);
if (LOOKAHEAD_CHAR() == '[')
ADVANCE(15);
if (LOOKAHEAD_CHAR() == '\"')
ADVANCE(12);
if (LOOKAHEAD_CHAR() == '{')
ADVANCE(16);
if (isdigit(LOOKAHEAD_CHAR()))
ADVANCE(11);
LEX_ERROR(4, EXPECT({"'['", "'\"'", "'{'", "<digit>"}));
LEX_ERROR(4, EXPECT({"'{'", "'['", "'\"'", "<digit>"}));
case 11:
if (isdigit(LOOKAHEAD_CHAR()))
ADVANCE(11);
ACCEPT_TOKEN(ts_symbol_number);
case 12:
if (isalnum(LOOKAHEAD_CHAR()))
if (!(LOOKAHEAD_CHAR() == '\"'))
ADVANCE(13);
LEX_ERROR(1, EXPECT({"<word>"}));
LEX_ERROR(1, EXPECT({"'\"'"}));
case 13:
if (LOOKAHEAD_CHAR() == '\"')
ADVANCE(14);
if (isalnum(LOOKAHEAD_CHAR()))
if (!(LOOKAHEAD_CHAR() == '\"'))
ADVANCE(13);
LEX_ERROR(2, EXPECT({"'\"'", "<word>"}));
LEX_ERROR(1, EXPECT({"'\"'"}));
case 14:
ACCEPT_TOKEN(ts_symbol_string);
case 15:
@ -788,14 +788,14 @@ static TSParseResult ts_parse(const char *input) {
case 59:
SET_LEX_STATE(6);
switch (LOOKAHEAD_SYM()) {
case ts_aux_token2:
SHIFT(9);
case ts_aux_token4:
REDUCE(ts_aux_repeat_helper2, 2, COLLAPSE({1, 0}));
case ts_aux_token2:
SHIFT(9);
case ts_aux_repeat_helper2:
SHIFT(60);
default:
PARSE_ERROR(3, EXPECT({"repeat_helper2", "token4", "token2"}));
PARSE_ERROR(3, EXPECT({"repeat_helper2", "token2", "token4"}));
}
case 60:
SET_LEX_STATE(4);

View file

@ -14,7 +14,7 @@ describe("json", []() {
});
it("parses strings", [&]() {
TSDocumentSetText(document, "\"string\"");
TSDocumentSetText(document, "\"this is a string\"");
AssertThat(string(TSDocumentToString(document)), Equals("(value (string))"));
});

View file

@ -38,6 +38,18 @@ namespace tree_sitter {
return result;
}
// Parses the contents of a bracketed character set, starting just after the
// opening '[' (atom() consumes it before calling this). An initial '^' marks
// the set as negated. Members are read one at a time with single_char() until
// either the input runs out or a ']' is seen; the ']' itself is NOT consumed
// here, so the caller can check for it.
rule_ptr char_set() {
    const bool negated = (peek() == '^');
    if (negated)
        next();

    std::vector<CharacterMatch> members;
    for (;;) {
        // Stop before the closing bracket, or when nothing is left to read.
        if (!has_more_input() || peek() == ']')
            break;
        members.push_back(single_char());
    }

    // The second argument says whether the set is affirmative (not negated).
    return character(members, !negated);
}
rule_ptr atom() {
rule_ptr result;
switch (peek()) {
@ -49,34 +61,52 @@ namespace tree_sitter {
else
next();
break;
case '[':
next();
result = char_set();
if (peek() != ']')
error("mismatched square brackets");
else
next();
break;
case ')':
error("mismatched parens");
break;
case '\\':
next();
result = escaped_char(peek());
next();
break;
default:
result = character(peek());
next();
break;
result = character({ single_char() }, true);
}
return result;
}
rule_ptr escaped_char(char value) {
// Reads one logical character from the input and returns it as a
// CharacterMatch. A backslash starts an escape: the backslash is skipped, the
// following character is translated by escaped_char(), and that character is
// skipped too. Any other character is taken literally and consumed.
CharacterMatch single_char() {
    CharacterMatch match('\0');

    if (peek() == '\\') {
        next();                          // skip the backslash
        match = escaped_char(peek());    // translate the escaped character
        next();                          // skip the escaped character
        return match;
    }

    match = peek();
    next();
    return match;
}
CharacterMatch escaped_char(char value) {
switch (value) {
case '\\':
case '(':
case ')':
return character(value);
return value;
case 'w':
return character(CharClassWord);
return CharClassWord;
case 'd':
return character(CharClassDigit);
return CharClassDigit;
default:
error("unrecognized escape sequence");
return rule_ptr();
return '\0';
}
}