Represent character sets as sets of character ranges

This commit is contained in:
Max Brunsfeld 2014-02-05 18:56:04 -08:00
parent 8cce11a52a
commit d3d25f2683
17 changed files with 551 additions and 499 deletions

51
character_set_spec.cpp Normal file
View file

@ -0,0 +1,51 @@
#include "spec_helper.h"
#include "rules.h"
using namespace rules;
// Specs for CharacterSet: computing complements and merging sets in place with union_with.
START_TEST
describe("character sets", []() {
// Each complement case also checks the round trip: complementing twice yields the original set.
describe("computing the complement", []() {
it("works for the set containing only the null character", []() {
CharacterSet set1({ '\0' });
auto set2 = set1.complement();
// Expected complement is one range starting at 1, just above '\0'.
// The upper bound is written as -1 — presumably the maximum character value; TODO confirm.
AssertThat(set2, Equals(CharacterSet({
{ 1, -1 },
}, true)));
AssertThat(set2.complement(), Equals(set1));
});
it("works for single character sets", []() {
CharacterSet set1({ 'b' });
auto set2 = set1.complement();
// Removing 'b' splits the full range into everything up to 'a' and everything from 'c'.
// NOTE(review): this expectation omits the second constructor argument that the
// null-character case passes as `true`; confirm the default makes the two forms equivalent.
AssertThat(set2, Equals(CharacterSet({
{ 0, 'a' },
{ 'c', -1 },
})));
AssertThat(set2.complement(), Equals(set1));
});
});
// union_with mutates the receiver, so each case asserts on `set` after the call.
describe("computing unions", []() {
it("works for disjoint sets", []() {
CharacterSet set({ {'a', 'z'} }, true);
set.union_with(CharacterSet({ {'A', 'Z'} }, true));
// The two ranges do not touch, so both are expected to remain separate.
// NOTE(review): the expected set is built without the trailing `true` used elsewhere here.
AssertThat(set, Equals(CharacterSet({ {'a', 'z'}, {'A', 'Z'}, })));
});
it("works for sets with adjacent ranges", []() {
CharacterSet set({ {'a', 'r'} }, true);
set.union_with(CharacterSet({ {'s', 'z'} }, true));
// 'r' and 's' are consecutive, so the two ranges are expected to merge into one.
AssertThat(set, Equals(CharacterSet({ {'a', 'z'} }, true)));
});
it("works when the result becomes a continuous range", []() {
CharacterSet set({ {'a', 'd'}, {'f', 'z'} }, true);
// The added range overlaps both existing ranges (at 'd' and 'f') and fills the gap at 'e'.
set.union_with(CharacterSet({ {'d', 'f'} }, true));
AssertThat(set, Equals(CharacterSet({ {'a', 'z'} }, true)));
});
});
});
END_TEST

View file

@ -16,14 +16,6 @@ static unordered_set<Symbol> keys(const unordered_map<Symbol, parse_actions> &ma
return result;
}
// Returns the set of keys of `map`; the associated lex actions are ignored.
static unordered_set<CharacterSet> keys(const unordered_map<CharacterSet, lex_actions> &map) {
    unordered_set<CharacterSet> result;
    // Bind by const reference: iterating by value copied every (CharacterSet, lex_actions)
    // pair just to read its key.
    for (const auto &pair : map) {
        result.insert(pair.first);
    }
    return result;
}
START_TEST
describe("building parse and lex tables", []() {
@ -79,16 +71,10 @@ describe("building parse and lex tables", []() {
Symbol("left-paren"),
})));
AssertThat(keys(lex_state(0).actions), Equals(unordered_set<CharacterSet>({
CharacterSet('('),
CharacterSet(CharClassDigit),
CharacterSet(CharClassWord),
})));
AssertThat(lex_state(0).expected_inputs(), Equals(unordered_set<CharacterSet>({
CharacterSet('('),
CharacterSet(CharClassDigit),
CharacterSet(CharClassWord),
CharacterSet({ '(' }, true),
CharacterSet({ {'0', '9'} }, true),
CharacterSet({ {'a', 'z'}, {'A', 'Z'} }, true),
})));
});

View file

@ -29,15 +29,6 @@ describe("rule transitions", []() {
})));
});
it("handles character classes", [&]() {
auto rule = character(CharClassDigit);
AssertThat(
rule_transitions(rule),
Equals(transition_map<Rule, Rule>({
{ rule, blank() }
})));
});
it("handles choices", [&]() {
AssertThat(
rule_transitions(choice({ symbol1, symbol2 })),

View file

@ -22,9 +22,9 @@ describe("parsing pattern rules", []() {
AssertThat(
rule.to_rule_tree(),
EqualsPointer(seq({
character(CharClassWord),
character({ {'a', 'z'}, {'A', 'Z'} }),
character('-'),
character(CharClassDigit)
character({ {'0', '9'} })
})));
});
@ -49,24 +49,24 @@ describe("parsing pattern rules", []() {
});
it("parses character sets", []() {
Pattern rule("[abc]");
Pattern rule("[aAeE]");
AssertThat(
rule.to_rule_tree(),
EqualsPointer(character({ 'a', 'b', 'c' }, true)));
EqualsPointer(character({ 'a', 'A', 'e', 'E' })));
});
it("parses character ranges", []() {
Pattern rule("[12a-dA-D3]");
AssertThat(
rule.to_rule_tree(),
EqualsPointer(character({ '1', '2', CharacterRange({'a', 'd'}), CharacterRange({ 'A', 'D' }), '3' }, true)));
EqualsPointer(character({ {'1', '3'}, {'a', 'd'}, { 'A', 'D' }, })));
});
it("parses negated characters", []() {
Pattern rule("[^a\\d]");
AssertThat(
rule.to_rule_tree(),
EqualsPointer(character({ 'a', CharClassDigit }, false)));
EqualsPointer(character({ {'a'}, {'0', '9'} }, false)));
});
it("parses backslashes", []() {

View file

@ -2,28 +2,28 @@
#include <ctype.h>
enum ts_symbol {
ts_symbol_plus,
ts_symbol_number,
ts_symbol_variable,
ts_symbol_factor,
ts_symbol_times,
ts_aux_token1,
ts_symbol_term,
ts_symbol_expression,
ts_symbol_plus,
ts_aux_token2,
ts_symbol_variable,
ts_symbol_times,
ts_symbol_factor,
ts_symbol_term,
ts_symbol_number,
ts_symbol_expression,
ts_symbol___END__,
};
static const char *ts_symbol_names[] = {
"plus",
"number",
"variable",
"factor",
"times",
"token1",
"term",
"expression",
"plus",
"token2",
"variable",
"times",
"factor",
"term",
"number",
"expression",
"__END__",
};
@ -33,7 +33,7 @@ static void ts_lex(TSParser *parser) {
case 0:
if ((LOOKAHEAD_CHAR() == '\0'))
ADVANCE(1);
LEX_ERROR(1, EXPECT({"<EOF>"}));
LEX_ERROR(1, EXPECT({"\0"}));
case 1:
ACCEPT_TOKEN(ts_symbol___END__);
case 2:
@ -41,13 +41,13 @@ static void ts_lex(TSParser *parser) {
ADVANCE(3);
if ((LOOKAHEAD_CHAR() == '\0'))
ADVANCE(1);
LEX_ERROR(2, EXPECT({"'*'", "<EOF>"}));
LEX_ERROR(2, EXPECT({"\0", "*"}));
case 3:
ACCEPT_TOKEN(ts_symbol_times);
case 4:
if ((LOOKAHEAD_CHAR() == ')'))
ADVANCE(5);
LEX_ERROR(1, EXPECT({"')'"}));
LEX_ERROR(1, EXPECT({")"}));
case 5:
ACCEPT_TOKEN(ts_aux_token2);
case 6:
@ -55,7 +55,7 @@ static void ts_lex(TSParser *parser) {
ADVANCE(5);
if ((LOOKAHEAD_CHAR() == '*'))
ADVANCE(3);
LEX_ERROR(2, EXPECT({"')'", "'*'"}));
LEX_ERROR(1, EXPECT({")-*"}));
case 7:
if ((LOOKAHEAD_CHAR() == ')'))
ADVANCE(5);
@ -63,7 +63,7 @@ static void ts_lex(TSParser *parser) {
ADVANCE(3);
if ((LOOKAHEAD_CHAR() == '+'))
ADVANCE(8);
LEX_ERROR(3, EXPECT({"')'", "'*'", "'+'"}));
LEX_ERROR(1, EXPECT({")-+"}));
case 8:
ACCEPT_TOKEN(ts_symbol_plus);
case 9:
@ -71,18 +71,18 @@ static void ts_lex(TSParser *parser) {
ADVANCE(5);
if ((LOOKAHEAD_CHAR() == '+'))
ADVANCE(8);
LEX_ERROR(2, EXPECT({"')'", "'+'"}));
LEX_ERROR(2, EXPECT({")", "+"}));
case 10:
if ((LOOKAHEAD_CHAR() == '('))
ADVANCE(12);
if (('A' <= LOOKAHEAD_CHAR() && LOOKAHEAD_CHAR() <= 'Z') ||
('a' <= LOOKAHEAD_CHAR() && LOOKAHEAD_CHAR() <= 'z'))
ADVANCE(13);
if ((isdigit(LOOKAHEAD_CHAR())))
if ((LOOKAHEAD_CHAR() == '('))
ADVANCE(12);
if (('0' <= LOOKAHEAD_CHAR() && LOOKAHEAD_CHAR() <= '9'))
ADVANCE(11);
LEX_ERROR(4, EXPECT({"'A'-'Z'", "'('", "'a'-'z'", "<digit>"}));
LEX_ERROR(4, EXPECT({"(", "0-9", "A-Z", "a-z"}));
case 11:
if ((isdigit(LOOKAHEAD_CHAR())))
if (('0' <= LOOKAHEAD_CHAR() && LOOKAHEAD_CHAR() <= '9'))
ADVANCE(11);
ACCEPT_TOKEN(ts_symbol_number);
case 12:
@ -97,7 +97,7 @@ static void ts_lex(TSParser *parser) {
ADVANCE(8);
if ((LOOKAHEAD_CHAR() == '\0'))
ADVANCE(1);
LEX_ERROR(2, EXPECT({"'+'", "<EOF>"}));
LEX_ERROR(2, EXPECT({"\0", "+"}));
case 15:
if ((LOOKAHEAD_CHAR() == '*'))
ADVANCE(3);
@ -105,7 +105,7 @@ static void ts_lex(TSParser *parser) {
ADVANCE(8);
if ((LOOKAHEAD_CHAR() == '\0'))
ADVANCE(1);
LEX_ERROR(3, EXPECT({"'*'", "'+'", "<EOF>"}));
LEX_ERROR(2, EXPECT({"\0", "*-+"}));
default:
LEX_PANIC();
}
@ -118,20 +118,20 @@ static TSParseResult ts_parse(const char *input) {
case 0:
SET_LEX_STATE(10);
switch (LOOKAHEAD_SYM()) {
case ts_symbol_factor:
SHIFT(45);
case ts_symbol_term:
SHIFT(2);
case ts_aux_token1:
SHIFT(42);
case ts_symbol_expression:
SHIFT(1);
case ts_symbol_number:
SHIFT(41);
case ts_symbol_variable:
SHIFT(41);
case ts_symbol_expression:
SHIFT(1);
case ts_symbol_factor:
SHIFT(45);
case ts_symbol_term:
SHIFT(2);
default:
PARSE_ERROR(6, EXPECT({"expression", "variable", "number", "token1", "term", "factor"}));
PARSE_ERROR(6, EXPECT({"term", "factor", "variable", "number", "expression", "token1"}));
}
case 1:
SET_LEX_STATE(0);
@ -188,20 +188,20 @@ static TSParseResult ts_parse(const char *input) {
case 6:
SET_LEX_STATE(10);
switch (LOOKAHEAD_SYM()) {
case ts_symbol_factor:
SHIFT(16);
case ts_symbol_term:
SHIFT(7);
case ts_aux_token1:
SHIFT(13);
case ts_symbol_expression:
SHIFT(32);
case ts_symbol_number:
SHIFT(12);
case ts_symbol_variable:
SHIFT(12);
case ts_symbol_expression:
SHIFT(32);
case ts_symbol_factor:
SHIFT(16);
case ts_symbol_term:
SHIFT(7);
default:
PARSE_ERROR(6, EXPECT({"expression", "variable", "number", "token1", "term", "factor"}));
PARSE_ERROR(6, EXPECT({"term", "factor", "variable", "number", "expression", "token1"}));
}
case 7:
SET_LEX_STATE(9);
@ -250,50 +250,50 @@ static TSParseResult ts_parse(const char *input) {
case 11:
SET_LEX_STATE(10);
switch (LOOKAHEAD_SYM()) {
case ts_symbol_factor:
SHIFT(16);
case ts_symbol_term:
SHIFT(7);
case ts_aux_token1:
SHIFT(13);
case ts_symbol_expression:
SHIFT(23);
case ts_symbol_number:
SHIFT(12);
case ts_symbol_variable:
SHIFT(12);
case ts_symbol_expression:
SHIFT(23);
case ts_symbol_factor:
SHIFT(16);
case ts_symbol_term:
SHIFT(7);
default:
PARSE_ERROR(6, EXPECT({"expression", "variable", "number", "token1", "term", "factor"}));
PARSE_ERROR(6, EXPECT({"term", "factor", "variable", "number", "expression", "token1"}));
}
case 12:
SET_LEX_STATE(7);
switch (LOOKAHEAD_SYM()) {
case ts_aux_token2:
REDUCE(ts_symbol_factor, 1, COLLAPSE({0}));
case ts_symbol_times:
REDUCE(ts_symbol_factor, 1, COLLAPSE({0}));
case ts_aux_token2:
REDUCE(ts_symbol_factor, 1, COLLAPSE({0}));
case ts_symbol_plus:
REDUCE(ts_symbol_factor, 1, COLLAPSE({0}));
default:
PARSE_ERROR(3, EXPECT({"plus", "times", "token2"}));
PARSE_ERROR(3, EXPECT({"plus", "token2", "times"}));
}
case 13:
SET_LEX_STATE(10);
switch (LOOKAHEAD_SYM()) {
case ts_symbol_factor:
SHIFT(16);
case ts_symbol_term:
SHIFT(7);
case ts_aux_token1:
SHIFT(13);
case ts_symbol_expression:
SHIFT(14);
case ts_symbol_number:
SHIFT(12);
case ts_symbol_variable:
SHIFT(12);
case ts_symbol_expression:
SHIFT(14);
case ts_symbol_factor:
SHIFT(16);
case ts_symbol_term:
SHIFT(7);
default:
PARSE_ERROR(6, EXPECT({"expression", "variable", "number", "token1", "term", "factor"}));
PARSE_ERROR(6, EXPECT({"term", "factor", "variable", "number", "expression", "token1"}));
}
case 14:
SET_LEX_STATE(4);
@ -306,14 +306,14 @@ static TSParseResult ts_parse(const char *input) {
case 15:
SET_LEX_STATE(7);
switch (LOOKAHEAD_SYM()) {
case ts_aux_token2:
REDUCE(ts_symbol_factor, 3, COLLAPSE({1, 0, 1}));
case ts_symbol_times:
REDUCE(ts_symbol_factor, 3, COLLAPSE({1, 0, 1}));
case ts_aux_token2:
REDUCE(ts_symbol_factor, 3, COLLAPSE({1, 0, 1}));
case ts_symbol_plus:
REDUCE(ts_symbol_factor, 3, COLLAPSE({1, 0, 1}));
default:
PARSE_ERROR(3, EXPECT({"plus", "times", "token2"}));
PARSE_ERROR(3, EXPECT({"plus", "token2", "times"}));
}
case 16:
SET_LEX_STATE(7);
@ -354,20 +354,20 @@ static TSParseResult ts_parse(const char *input) {
case 19:
SET_LEX_STATE(10);
switch (LOOKAHEAD_SYM()) {
case ts_symbol_factor:
SHIFT(16);
case ts_symbol_term:
SHIFT(7);
case ts_aux_token1:
SHIFT(13);
case ts_symbol_expression:
SHIFT(20);
case ts_symbol_number:
SHIFT(12);
case ts_symbol_variable:
SHIFT(12);
case ts_symbol_expression:
SHIFT(20);
case ts_symbol_factor:
SHIFT(16);
case ts_symbol_term:
SHIFT(7);
default:
PARSE_ERROR(6, EXPECT({"expression", "variable", "number", "token1", "term", "factor"}));
PARSE_ERROR(6, EXPECT({"term", "factor", "variable", "number", "expression", "token1"}));
}
case 20:
SET_LEX_STATE(4);
@ -450,20 +450,20 @@ static TSParseResult ts_parse(const char *input) {
case 28:
SET_LEX_STATE(10);
switch (LOOKAHEAD_SYM()) {
case ts_symbol_factor:
SHIFT(16);
case ts_symbol_term:
SHIFT(7);
case ts_aux_token1:
SHIFT(13);
case ts_symbol_expression:
SHIFT(29);
case ts_symbol_number:
SHIFT(12);
case ts_symbol_variable:
SHIFT(12);
case ts_symbol_expression:
SHIFT(29);
case ts_symbol_factor:
SHIFT(16);
case ts_symbol_term:
SHIFT(7);
default:
PARSE_ERROR(6, EXPECT({"expression", "variable", "number", "token1", "term", "factor"}));
PARSE_ERROR(6, EXPECT({"term", "factor", "variable", "number", "expression", "token1"}));
}
case 29:
SET_LEX_STATE(4);
@ -542,20 +542,20 @@ static TSParseResult ts_parse(const char *input) {
case 37:
SET_LEX_STATE(10);
switch (LOOKAHEAD_SYM()) {
case ts_symbol_factor:
SHIFT(16);
case ts_symbol_term:
SHIFT(7);
case ts_aux_token1:
SHIFT(13);
case ts_symbol_expression:
SHIFT(38);
case ts_symbol_number:
SHIFT(12);
case ts_symbol_variable:
SHIFT(12);
case ts_symbol_expression:
SHIFT(38);
case ts_symbol_factor:
SHIFT(16);
case ts_symbol_term:
SHIFT(7);
default:
PARSE_ERROR(6, EXPECT({"expression", "variable", "number", "token1", "term", "factor"}));
PARSE_ERROR(6, EXPECT({"term", "factor", "variable", "number", "expression", "token1"}));
}
case 38:
SET_LEX_STATE(4);
@ -596,20 +596,20 @@ static TSParseResult ts_parse(const char *input) {
case 42:
SET_LEX_STATE(10);
switch (LOOKAHEAD_SYM()) {
case ts_symbol_factor:
SHIFT(16);
case ts_symbol_term:
SHIFT(7);
case ts_aux_token1:
SHIFT(13);
case ts_symbol_expression:
SHIFT(43);
case ts_symbol_number:
SHIFT(12);
case ts_symbol_variable:
SHIFT(12);
case ts_symbol_expression:
SHIFT(43);
case ts_symbol_factor:
SHIFT(16);
case ts_symbol_term:
SHIFT(7);
default:
PARSE_ERROR(6, EXPECT({"expression", "variable", "number", "token1", "term", "factor"}));
PARSE_ERROR(6, EXPECT({"term", "factor", "variable", "number", "expression", "token1"}));
}
case 43:
SET_LEX_STATE(4);
@ -634,14 +634,14 @@ static TSParseResult ts_parse(const char *input) {
case 45:
SET_LEX_STATE(15);
switch (LOOKAHEAD_SYM()) {
case ts_symbol___END__:
REDUCE(ts_symbol_term, 1, COLLAPSE({0}));
case ts_symbol_plus:
REDUCE(ts_symbol_term, 1, COLLAPSE({0}));
case ts_symbol___END__:
REDUCE(ts_symbol_term, 1, COLLAPSE({0}));
case ts_symbol_times:
SHIFT(46);
default:
PARSE_ERROR(3, EXPECT({"times", "plus", "__END__"}));
PARSE_ERROR(3, EXPECT({"times", "__END__", "plus"}));
}
case 46:
SET_LEX_STATE(10);
@ -670,20 +670,20 @@ static TSParseResult ts_parse(const char *input) {
case 48:
SET_LEX_STATE(10);
switch (LOOKAHEAD_SYM()) {
case ts_symbol_factor:
SHIFT(16);
case ts_symbol_term:
SHIFT(7);
case ts_aux_token1:
SHIFT(13);
case ts_symbol_expression:
SHIFT(49);
case ts_symbol_number:
SHIFT(12);
case ts_symbol_variable:
SHIFT(12);
case ts_symbol_expression:
SHIFT(49);
case ts_symbol_factor:
SHIFT(16);
case ts_symbol_term:
SHIFT(7);
default:
PARSE_ERROR(6, EXPECT({"expression", "variable", "number", "token1", "term", "factor"}));
PARSE_ERROR(6, EXPECT({"term", "factor", "variable", "number", "expression", "token1"}));
}
case 49:
SET_LEX_STATE(4);

View file

@ -2,38 +2,38 @@
#include <ctype.h>
enum ts_symbol {
ts_symbol_string,
ts_symbol_array,
ts_aux_token3,
ts_symbol_object,
ts_aux_repeat_helper1,
ts_aux_token6,
ts_symbol_number,
ts_aux_repeat_helper1,
ts_aux_token7,
ts_aux_token5,
ts_aux_token4,
ts_symbol___END__,
ts_aux_token2,
ts_aux_repeat_helper2,
ts_aux_token5,
ts_symbol_value,
ts_symbol___END__,
ts_aux_token3,
ts_aux_token2,
ts_symbol_string,
ts_symbol_object,
ts_aux_token4,
ts_symbol_array,
ts_aux_token1,
};
static const char *ts_symbol_names[] = {
"string",
"array",
"token3",
"object",
"repeat_helper1",
"token6",
"number",
"repeat_helper1",
"token7",
"token5",
"token4",
"__END__",
"token2",
"repeat_helper2",
"token5",
"value",
"__END__",
"token3",
"token2",
"string",
"object",
"token4",
"array",
"token1",
};
@ -43,7 +43,7 @@ static void ts_lex(TSParser *parser) {
case 0:
if ((LOOKAHEAD_CHAR() == '\0'))
ADVANCE(1);
LEX_ERROR(1, EXPECT({"<EOF>"}));
LEX_ERROR(1, EXPECT({"\0"}));
case 1:
ACCEPT_TOKEN(ts_symbol___END__);
case 2:
@ -55,7 +55,7 @@ static void ts_lex(TSParser *parser) {
case 4:
if ((LOOKAHEAD_CHAR() == ']'))
ADVANCE(5);
LEX_ERROR(1, EXPECT({"']'"}));
LEX_ERROR(1, EXPECT({"]"}));
case 5:
ACCEPT_TOKEN(ts_aux_token4);
case 6:
@ -63,11 +63,11 @@ static void ts_lex(TSParser *parser) {
ADVANCE(5);
if ((LOOKAHEAD_CHAR() == ','))
ADVANCE(3);
LEX_ERROR(2, EXPECT({"']'", "','"}));
LEX_ERROR(2, EXPECT({",", "]"}));
case 7:
if ((LOOKAHEAD_CHAR() == '}'))
ADVANCE(8);
LEX_ERROR(1, EXPECT({"'}'"}));
LEX_ERROR(1, EXPECT({"}"}));
case 8:
ACCEPT_TOKEN(ts_aux_token7);
case 9:
@ -75,31 +75,31 @@ static void ts_lex(TSParser *parser) {
ADVANCE(8);
if ((LOOKAHEAD_CHAR() == ','))
ADVANCE(3);
LEX_ERROR(2, EXPECT({"'}'", "','"}));
LEX_ERROR(2, EXPECT({",", "}"}));
case 10:
if ((LOOKAHEAD_CHAR() == '{'))
ADVANCE(16);
if ((LOOKAHEAD_CHAR() == '['))
ADVANCE(15);
if ((LOOKAHEAD_CHAR() == '{'))
ADVANCE(16);
if ((LOOKAHEAD_CHAR() == '\"'))
ADVANCE(12);
if ((isdigit(LOOKAHEAD_CHAR())))
if (('0' <= LOOKAHEAD_CHAR() && LOOKAHEAD_CHAR() <= '9'))
ADVANCE(11);
LEX_ERROR(4, EXPECT({"'{'", "'['", "'\"'", "<digit>"}));
LEX_ERROR(4, EXPECT({"\"", "0-9", "[", "{"}));
case 11:
if ((isdigit(LOOKAHEAD_CHAR())))
if (('0' <= LOOKAHEAD_CHAR() && LOOKAHEAD_CHAR() <= '9'))
ADVANCE(11);
ACCEPT_TOKEN(ts_symbol_number);
case 12:
if (!((LOOKAHEAD_CHAR() == '\"')))
ADVANCE(13);
LEX_ERROR(1, EXPECT({"'\"'"}));
LEX_ERROR(1, EXPECT({"<-!"}));
case 13:
if ((LOOKAHEAD_CHAR() == '\"'))
ADVANCE(14);
if (!((LOOKAHEAD_CHAR() == '\"')))
ADVANCE(13);
LEX_ERROR(1, EXPECT({"'\"'"}));
LEX_ERROR(1, EXPECT({"<-\""}));
case 14:
ACCEPT_TOKEN(ts_symbol_string);
case 15:
@ -109,13 +109,13 @@ static void ts_lex(TSParser *parser) {
case 17:
if ((LOOKAHEAD_CHAR() == ':'))
ADVANCE(18);
LEX_ERROR(1, EXPECT({"':'"}));
LEX_ERROR(1, EXPECT({":"}));
case 18:
ACCEPT_TOKEN(ts_aux_token6);
case 19:
if ((LOOKAHEAD_CHAR() == '\"'))
ADVANCE(12);
LEX_ERROR(1, EXPECT({"'\"'"}));
LEX_ERROR(1, EXPECT({"\""}));
default:
LEX_PANIC();
}
@ -128,22 +128,22 @@ static TSParseResult ts_parse(const char *input) {
case 0:
SET_LEX_STATE(10);
switch (LOOKAHEAD_SYM()) {
case ts_symbol_number:
SHIFT(53);
case ts_symbol_string:
SHIFT(53);
case ts_aux_token1:
SHIFT(2);
case ts_symbol_object:
SHIFT(53);
case ts_symbol_value:
SHIFT(1);
case ts_aux_token5:
SHIFT(47);
case ts_symbol_number:
SHIFT(53);
case ts_symbol_array:
SHIFT(53);
case ts_symbol_object:
SHIFT(53);
case ts_aux_token5:
SHIFT(47);
case ts_aux_token1:
SHIFT(2);
case ts_symbol_value:
SHIFT(1);
default:
PARSE_ERROR(7, EXPECT({"value", "token1", "array", "string", "token5", "object", "number"}));
PARSE_ERROR(7, EXPECT({"array", "number", "token5", "value", "object", "token1", "string"}));
}
case 1:
SET_LEX_STATE(0);
@ -156,42 +156,42 @@ static TSParseResult ts_parse(const char *input) {
case 2:
SET_LEX_STATE(10);
switch (LOOKAHEAD_SYM()) {
case ts_symbol_number:
SHIFT(25);
case ts_symbol_string:
SHIFT(25);
case ts_aux_token5:
SHIFT(12);
case ts_aux_token1:
SHIFT(3);
case ts_symbol_object:
SHIFT(25);
case ts_symbol_number:
SHIFT(25);
case ts_symbol_array:
SHIFT(25);
case ts_symbol_value:
SHIFT(44);
case ts_aux_token1:
SHIFT(3);
case ts_aux_token5:
SHIFT(12);
default:
PARSE_ERROR(7, EXPECT({"token1", "value", "array", "string", "object", "token5", "number"}));
PARSE_ERROR(7, EXPECT({"token5", "value", "array", "number", "object", "token1", "string"}));
}
case 3:
SET_LEX_STATE(10);
switch (LOOKAHEAD_SYM()) {
case ts_symbol_number:
SHIFT(25);
case ts_symbol_string:
SHIFT(25);
case ts_aux_token5:
SHIFT(12);
case ts_aux_token1:
SHIFT(3);
case ts_symbol_object:
SHIFT(25);
case ts_symbol_number:
SHIFT(25);
case ts_symbol_array:
SHIFT(25);
case ts_symbol_value:
SHIFT(4);
case ts_aux_token1:
SHIFT(3);
case ts_aux_token5:
SHIFT(12);
default:
PARSE_ERROR(7, EXPECT({"token1", "value", "array", "string", "object", "token5", "number"}));
PARSE_ERROR(7, EXPECT({"token5", "value", "array", "number", "object", "token1", "string"}));
}
case 4:
SET_LEX_STATE(2);
@ -226,42 +226,42 @@ static TSParseResult ts_parse(const char *input) {
case 7:
SET_LEX_STATE(10);
switch (LOOKAHEAD_SYM()) {
case ts_symbol_number:
SHIFT(43);
case ts_symbol_string:
SHIFT(43);
case ts_aux_token5:
SHIFT(35);
case ts_aux_token1:
SHIFT(8);
case ts_symbol_object:
SHIFT(43);
case ts_symbol_number:
SHIFT(43);
case ts_symbol_array:
SHIFT(43);
case ts_symbol_value:
SHIFT(41);
case ts_aux_token1:
SHIFT(8);
case ts_aux_token5:
SHIFT(35);
default:
PARSE_ERROR(7, EXPECT({"token1", "value", "array", "string", "object", "token5", "number"}));
PARSE_ERROR(7, EXPECT({"token5", "value", "array", "number", "object", "token1", "string"}));
}
case 8:
SET_LEX_STATE(10);
switch (LOOKAHEAD_SYM()) {
case ts_symbol_number:
SHIFT(25);
case ts_symbol_string:
SHIFT(25);
case ts_aux_token5:
SHIFT(12);
case ts_aux_token1:
SHIFT(3);
case ts_symbol_object:
SHIFT(25);
case ts_symbol_number:
SHIFT(25);
case ts_symbol_array:
SHIFT(25);
case ts_symbol_value:
SHIFT(9);
case ts_aux_token1:
SHIFT(3);
case ts_aux_token5:
SHIFT(12);
default:
PARSE_ERROR(7, EXPECT({"token1", "value", "array", "string", "object", "token5", "number"}));
PARSE_ERROR(7, EXPECT({"token5", "value", "array", "number", "object", "token1", "string"}));
}
case 9:
SET_LEX_STATE(2);
@ -312,34 +312,34 @@ static TSParseResult ts_parse(const char *input) {
case 14:
SET_LEX_STATE(10);
switch (LOOKAHEAD_SYM()) {
case ts_symbol_number:
SHIFT(25);
case ts_symbol_string:
SHIFT(25);
case ts_aux_token5:
SHIFT(12);
case ts_aux_token1:
SHIFT(3);
case ts_symbol_object:
SHIFT(25);
case ts_symbol_number:
SHIFT(25);
case ts_symbol_array:
SHIFT(25);
case ts_symbol_value:
SHIFT(15);
case ts_aux_token1:
SHIFT(3);
case ts_aux_token5:
SHIFT(12);
default:
PARSE_ERROR(7, EXPECT({"token1", "value", "array", "string", "object", "token5", "number"}));
PARSE_ERROR(7, EXPECT({"token5", "value", "array", "number", "object", "token1", "string"}));
}
case 15:
SET_LEX_STATE(2);
switch (LOOKAHEAD_SYM()) {
case ts_aux_token3:
SHIFT(16);
case ts_aux_token2:
SHIFT(18);
case ts_aux_token3:
SHIFT(16);
case ts_aux_repeat_helper1:
SHIFT(16);
default:
PARSE_ERROR(3, EXPECT({"repeat_helper1", "token2", "token3"}));
PARSE_ERROR(3, EXPECT({"repeat_helper1", "token3", "token2"}));
}
case 16:
SET_LEX_STATE(7);
@ -378,42 +378,42 @@ static TSParseResult ts_parse(const char *input) {
case 20:
SET_LEX_STATE(10);
switch (LOOKAHEAD_SYM()) {
case ts_symbol_number:
SHIFT(34);
case ts_symbol_string:
SHIFT(34);
case ts_aux_token5:
SHIFT(26);
case ts_aux_token1:
SHIFT(21);
case ts_symbol_object:
SHIFT(34);
case ts_symbol_number:
SHIFT(34);
case ts_symbol_array:
SHIFT(34);
case ts_symbol_value:
SHIFT(32);
case ts_aux_token1:
SHIFT(21);
case ts_aux_token5:
SHIFT(26);
default:
PARSE_ERROR(7, EXPECT({"token1", "value", "array", "string", "object", "token5", "number"}));
PARSE_ERROR(7, EXPECT({"token5", "value", "array", "number", "object", "token1", "string"}));
}
case 21:
SET_LEX_STATE(10);
switch (LOOKAHEAD_SYM()) {
case ts_symbol_number:
SHIFT(25);
case ts_symbol_string:
SHIFT(25);
case ts_aux_token5:
SHIFT(12);
case ts_aux_token1:
SHIFT(3);
case ts_symbol_object:
SHIFT(25);
case ts_symbol_number:
SHIFT(25);
case ts_symbol_array:
SHIFT(25);
case ts_symbol_value:
SHIFT(22);
case ts_aux_token1:
SHIFT(3);
case ts_aux_token5:
SHIFT(12);
default:
PARSE_ERROR(7, EXPECT({"token1", "value", "array", "string", "object", "token5", "number"}));
PARSE_ERROR(7, EXPECT({"token5", "value", "array", "number", "object", "token1", "string"}));
}
case 22:
SET_LEX_STATE(2);
@ -474,34 +474,34 @@ static TSParseResult ts_parse(const char *input) {
case 28:
SET_LEX_STATE(10);
switch (LOOKAHEAD_SYM()) {
case ts_symbol_number:
SHIFT(25);
case ts_symbol_string:
SHIFT(25);
case ts_aux_token5:
SHIFT(12);
case ts_aux_token1:
SHIFT(3);
case ts_symbol_object:
SHIFT(25);
case ts_symbol_number:
SHIFT(25);
case ts_symbol_array:
SHIFT(25);
case ts_symbol_value:
SHIFT(29);
case ts_aux_token1:
SHIFT(3);
case ts_aux_token5:
SHIFT(12);
default:
PARSE_ERROR(7, EXPECT({"token1", "value", "array", "string", "object", "token5", "number"}));
PARSE_ERROR(7, EXPECT({"token5", "value", "array", "number", "object", "token1", "string"}));
}
case 29:
SET_LEX_STATE(2);
switch (LOOKAHEAD_SYM()) {
case ts_aux_token3:
SHIFT(30);
case ts_aux_token2:
SHIFT(18);
case ts_aux_token3:
SHIFT(30);
case ts_aux_repeat_helper1:
SHIFT(30);
default:
PARSE_ERROR(3, EXPECT({"repeat_helper1", "token2", "token3"}));
PARSE_ERROR(3, EXPECT({"repeat_helper1", "token3", "token2"}));
}
case 30:
SET_LEX_STATE(7);
@ -570,34 +570,34 @@ static TSParseResult ts_parse(const char *input) {
case 37:
SET_LEX_STATE(10);
switch (LOOKAHEAD_SYM()) {
case ts_symbol_number:
SHIFT(25);
case ts_symbol_string:
SHIFT(25);
case ts_aux_token5:
SHIFT(12);
case ts_aux_token1:
SHIFT(3);
case ts_symbol_object:
SHIFT(25);
case ts_symbol_number:
SHIFT(25);
case ts_symbol_array:
SHIFT(25);
case ts_symbol_value:
SHIFT(38);
case ts_aux_token1:
SHIFT(3);
case ts_aux_token5:
SHIFT(12);
default:
PARSE_ERROR(7, EXPECT({"token1", "value", "array", "string", "object", "token5", "number"}));
PARSE_ERROR(7, EXPECT({"token5", "value", "array", "number", "object", "token1", "string"}));
}
case 38:
SET_LEX_STATE(2);
switch (LOOKAHEAD_SYM()) {
case ts_aux_token3:
SHIFT(39);
case ts_aux_token2:
SHIFT(18);
case ts_aux_token3:
SHIFT(39);
case ts_aux_repeat_helper1:
SHIFT(39);
default:
PARSE_ERROR(3, EXPECT({"repeat_helper1", "token2", "token3"}));
PARSE_ERROR(3, EXPECT({"repeat_helper1", "token3", "token2"}));
}
case 39:
SET_LEX_STATE(7);
@ -620,14 +620,14 @@ static TSParseResult ts_parse(const char *input) {
case 41:
SET_LEX_STATE(6);
switch (LOOKAHEAD_SYM()) {
case ts_aux_token4:
REDUCE(ts_aux_repeat_helper2, 2, COLLAPSE({1, 0}));
case ts_aux_token2:
SHIFT(7);
case ts_aux_token4:
REDUCE(ts_aux_repeat_helper2, 2, COLLAPSE({1, 0}));
case ts_aux_repeat_helper2:
SHIFT(42);
default:
PARSE_ERROR(3, EXPECT({"repeat_helper2", "token2", "token4"}));
PARSE_ERROR(3, EXPECT({"repeat_helper2", "token4", "token2"}));
}
case 42:
SET_LEX_STATE(4);
@ -694,34 +694,34 @@ static TSParseResult ts_parse(const char *input) {
case 49:
SET_LEX_STATE(10);
switch (LOOKAHEAD_SYM()) {
case ts_symbol_number:
SHIFT(25);
case ts_symbol_string:
SHIFT(25);
case ts_aux_token5:
SHIFT(12);
case ts_aux_token1:
SHIFT(3);
case ts_symbol_object:
SHIFT(25);
case ts_symbol_number:
SHIFT(25);
case ts_symbol_array:
SHIFT(25);
case ts_symbol_value:
SHIFT(50);
case ts_aux_token1:
SHIFT(3);
case ts_aux_token5:
SHIFT(12);
default:
PARSE_ERROR(7, EXPECT({"token1", "value", "array", "string", "object", "token5", "number"}));
PARSE_ERROR(7, EXPECT({"token5", "value", "array", "number", "object", "token1", "string"}));
}
case 50:
SET_LEX_STATE(2);
switch (LOOKAHEAD_SYM()) {
case ts_aux_token3:
SHIFT(51);
case ts_aux_token2:
SHIFT(18);
case ts_aux_token3:
SHIFT(51);
case ts_aux_repeat_helper1:
SHIFT(51);
default:
PARSE_ERROR(3, EXPECT({"repeat_helper1", "token2", "token3"}));
PARSE_ERROR(3, EXPECT({"repeat_helper1", "token3", "token2"}));
}
case 51:
SET_LEX_STATE(7);

View file

@ -1,7 +1,7 @@
#ifndef __tree_sitter__item_set_transitions__
#define __tree_sitter__item_set_transitions__
#include "character.h"
#include "character_set.h"
#include "symbol.h"
#include "transition_map.h"
#include "item.h"

View file

@ -8,6 +8,8 @@ using std::to_string;
using std::unordered_map;
using std::unordered_set;
using std::vector;
using std::set;
using std::pair;
namespace tree_sitter {
namespace generate_code {
@ -101,33 +103,30 @@ namespace tree_sitter {
}
}
string condition_for_character_match(const rules::CharacterRange &match) {
string condition_for_character_range(const rules::CharacterRange &range) {
string lookahead("LOOKAHEAD_CHAR()");
auto value = match.value;
switch (match.type) {
case rules::CharacterRangeTypeClass:
switch (value.character_class) {
case rules::CharClassDigit:
return string("isdigit(") + lookahead + ")";
case rules::CharClassWord:
return string("isalnum(") + lookahead + ")";
}
case rules::CharacterRangeTypeSpecific:
return lookahead + " == '" + character_code(value.character) + "'";
case rules::CharacterRangeTypeRange:
return string("'") + value.range.min_character + string("' <= ") + lookahead +
" && " + lookahead + " <= '" + value.range.max_character + "'";
if (range.min == range.max) {
return lookahead + " == '" + character_code(range.min) + "'";
} else {
return string("'") + range.min + string("' <= ") + lookahead +
" && " + lookahead + " <= '" + range.max + "'";
}
}
// Builds the condition text that matches any range in `set`: each range's
// condition is wrapped in parentheses and the pieces are joined with "||".
string condition_for_character_set(const rules::CharacterSet &set) {
    vector<string> clauses;
    for (auto &range : set.ranges) {
        string clause = "(" + condition_for_character_range(range) + ")";
        clauses.push_back(clause);
    }
    return join(clauses, " ||\n ");
}
string condition_for_character_rule(const rules::CharacterSet &rule) {
vector<string> parts;
for (auto &match : rule.ranges) {
parts.push_back("(" + condition_for_character_match(match) + ")");
}
string result = join(parts, " ||\n ");
if (!rule.sign) result = "!(" + result + ")";
return result;
pair<rules::CharacterSet, bool> representation = rule.most_compact_representation();
if (representation.second)
return condition_for_character_set(representation.first);
else
return "!(" + condition_for_character_set(rule.complement()) + ")";
}
string collapse_flags(vector<bool> flags) {
@ -177,17 +176,16 @@ namespace tree_sitter {
}
string lex_error_call(const unordered_set<rules::CharacterSet> &expected_inputs) {
unordered_set<rules::CharacterRange> expected_matches;
rules::CharacterSet expected_set;
for (auto &rule : expected_inputs)
for (auto &match : rule.ranges)
expected_matches.insert(match);
expected_set.union_with(rule);
string result = "LEX_ERROR(" + to_string(expected_matches.size()) + ", EXPECT({";
string result = "LEX_ERROR(" + to_string(expected_set.ranges.size()) + ", EXPECT({";
bool started = false;
for (auto match : expected_matches) {
for (auto &ranges : expected_set.ranges) {
if (started) result += ", ";
started = true;
result += "\"" + escape_string(match.to_string()) + "\"";
result += "\"" + escape_string(ranges.to_string()) + "\"";
}
result += "}));";
return result;

View file

@ -6,7 +6,7 @@
#include <string>
#include <unordered_set>
#include "symbol.h"
#include "character.h"
#include "character_set.h"
namespace tree_sitter {
typedef enum {

View file

@ -1,84 +0,0 @@
#include "rules.h"
using std::string;
using std::hash;
namespace tree_sitter {
namespace rules {
// Range holding one specific character.
CharacterRange::CharacterRange(char character)
    : type(CharacterRangeTypeSpecific) {
    value.character = character;
}
// Range standing for a named character class.
CharacterRange::CharacterRange(CharacterClass klass)
    : type(CharacterRangeTypeClass) {
    value.character_class = klass;
}
// Range described by a (min, max) pair of characters.
CharacterRange::CharacterRange(const std::pair<char, char> bounds)
    : type(CharacterRangeTypeRange) {
    value.range.max_character = bounds.second;
    value.range.min_character = bounds.first;
}
// Two ranges are equal when they have the same kind and the same payload for that kind.
bool CharacterRange::operator==(const CharacterRange &right) const {
    if (type != right.type)
        return false;
    switch (type) {
        case CharacterRangeTypeClass:
            return (value.character_class == right.value.character_class);
        case CharacterRangeTypeSpecific:
            return (value.character == right.value.character);
        case CharacterRangeTypeRange:
            return (value.range.min_character == right.value.range.min_character &&
                    value.range.max_character == right.value.range.max_character);
    }
    // Not reached for the three known kinds. Without this return, an out-of-range
    // `type` would let control run off the end of a non-void function, which is
    // undefined behavior.
    return false;
}
// Human-readable form of the range: "<digit>"/"<word>" for classes, "<EOF>" for
// the null character, 'c' for a single character and 'a'-'z' for a min/max range.
string CharacterRange::to_string() const {
    switch (type) {
        case CharacterRangeTypeClass:
            switch (value.character_class) {
                case CharClassDigit:
                    return "<digit>";
                case CharClassWord:
                    return "<word>";
            }
            // An unrecognized class used to fall through into the single-character
            // case and read a union member that is not active for this kind.
            break;
        case CharacterRangeTypeSpecific:
            return (value.character == '\0') ?
                "<EOF>" :
                string("'") + value.character + "'";
        case CharacterRangeTypeRange:
            return (string("'") +
                    value.range.min_character + "'-'" +
                    value.range.max_character + "'");
    }
    // Fallback so control never runs off the end of this non-void function.
    return "<unknown>";
}
// Non-negated set containing a single character.
CharacterSet::CharacterSet(char character)
    : ranges({ CharacterRange(character) }),
      sign(true) {}
// Non-negated set containing a single character class.
CharacterSet::CharacterSet(CharacterClass char_class)
    : ranges({ CharacterRange(char_class) }),
      sign(true) {}
// Set built from explicit ranges; `sign` is stored as given.
CharacterSet::CharacterSet(const std::unordered_set<CharacterRange> &ranges, bool sign)
    : ranges(ranges),
      sign(sign) {}
// Compares against an arbitrary rule: only another CharacterSet can be equal.
bool CharacterSet::operator==(const Rule &rule) const {
    auto other = dynamic_cast<const CharacterSet *>(&rule);
    if (!other)
        return false;
    // Delegate to the CharacterSet-specific comparison.
    return this->operator==(*other);
}
// Sets are equal when both the sign and the stored ranges match.
bool CharacterSet::operator==(const CharacterSet &other) const {
    return other.sign == sign && other.ranges == ranges;
}
// Hash combining a type tag with the hash of the set's string form.
size_t CharacterSet::hash_code() const {
    // typeid(this) names the pointer type, so this part is the same for every CharacterSet.
    const size_t type_part = typeid(this).hash_code();
    const size_t text_part = hash<string>()(to_string());
    return type_part ^ text_part;
}
// Returns a newly allocated copy of this set behind a shared pointer.
rule_ptr CharacterSet::copy() const {
    rule_ptr duplicate = std::make_shared<CharacterSet>(*this);
    return duplicate;
}
// Renders the set as "#<char ...>", with " (not)" after "char" when the set is
// negated and each range's string form preceded by a space.
string CharacterSet::to_string() const {
    string text = "#<char";
    if (!sign)
        text += " (not)";
    for (auto &range : ranges) {
        text += " ";
        text += range.to_string();
    }
    text += ">";
    return text;
}
// Visitor entry point: hands this CharacterSet to the visitor's visit overload.
void CharacterSet::accept(Visitor &visitor) const {
visitor.visit(this);
}
}
}

View file

@ -1,88 +0,0 @@
#ifndef __tree_sitter__character__
#define __tree_sitter__character__
#include "rule.h"
#include <unordered_set>
namespace tree_sitter {
namespace rules {
// Named character classes that a CharacterRange can refer to.
typedef enum {
CharClassWord,
CharClassDigit
} CharacterClass;
// Discriminator telling which member of CharacterRange::value is in use.
typedef enum {
CharacterRangeTypeSpecific,
CharacterRangeTypeClass,
CharacterRangeTypeRange,
} CharacterRangeType;
// A single member of a character set: one specific character, a named
// character class, or an inclusive min/max range. `type` selects which
// member of the `value` union is meaningful.
struct CharacterRange {
CharacterRangeType type;
union {
// Used when type is CharacterRangeTypeClass.
CharacterClass character_class;
// Used when type is CharacterRangeTypeSpecific.
char character;
// Used when type is CharacterRangeTypeRange.
struct {
char min_character;
char max_character;
} range;
} value;
CharacterRange(char);
CharacterRange(const std::pair<char, char>);
CharacterRange(CharacterClass);
bool operator==(const CharacterRange &) const;
std::string to_string() const;
};
}
}
namespace std {
    // Hash support for CharacterRange.
    //
    // The hash mixes the range type with only the union member that is active
    // for that type. Each case ends in `break`: previously the cases fell
    // through, so a class range also hashed the (inactive) range and
    // character members of the union.
    template<>
    struct hash<tree_sitter::rules::CharacterRange> {
        size_t operator()(const tree_sitter::rules::CharacterRange &match) const {
            auto type = match.type;
            auto result = hash<short int>()(type);
            switch (type) {
                case tree_sitter::rules::CharacterRangeTypeClass:
                    result ^= hash<short int>()(match.value.character_class);
                    break;
                case tree_sitter::rules::CharacterRangeTypeRange:
                    result ^= hash<char>()(match.value.range.min_character);
                    result ^= hash<char>()(match.value.range.max_character);
                    break;
                case tree_sitter::rules::CharacterRangeTypeSpecific:
                    result ^= hash<char>()(match.value.character);
                    break;
            }
            return result;
        }
    };
}
namespace tree_sitter {
namespace rules {
// Rule that matches characters described by a collection of CharacterRange
// values plus a sign flag.
class CharacterSet : public Rule {
public:
CharacterSet(char character);
CharacterSet(CharacterClass character_class);
CharacterSet(char min_character, char max_character);
CharacterSet(const std::unordered_set<CharacterRange> &matches, bool sign);
bool operator==(const Rule& other) const;
bool operator==(const CharacterSet& other) const;
size_t hash_code() const;
rule_ptr copy() const;
std::string to_string() const;
void accept(Visitor &visitor) const;
// The ranges that make up this set.
std::unordered_set<CharacterRange> ranges;
// When false, to_string() marks the set with " (not)".
bool sign;
};
}
}
namespace std {
// Hashing a CharacterSet reuses the hash specialization for Rule.
template<>
struct hash<tree_sitter::rules::CharacterSet> : hash<tree_sitter::rules::Rule> {};
}
#endif

View file

@ -0,0 +1,128 @@
#include "rules.h"
using std::string;
using std::hash;
using std::set;
namespace tree_sitter {
namespace rules {
// Sentinel used as the upper bound of open-ended ranges.
// NOTE(review): -1 stored in a plain `char`; whether it orders above other
// characters depends on the platform's char signedness — confirm.
char MAX_CHAR = -1;
// Single-character range: min and max are both `value`.
CharacterRange::CharacterRange(char value) : min(value), max(value) {}
// Range from `min` to `max`, stored as given.
CharacterRange::CharacterRange(char min, char max) : min(min), max(max) {}
// Ranges are equal when both endpoints match.
bool CharacterRange::operator==(const CharacterRange &other) const {
return min == other.min && max == other.max;
}
// Orders ranges by `min` first and by `max` when the minimums are equal.
bool CharacterRange::operator<(const CharacterRange &other) const {
    if (min != other.min)
        return min < other.min;
    return max < other.max;
}
// Returns a printable form of `input`: the NUL character becomes the two
// characters backslash and zero; every other character is returned as a
// one-character string.
string escape_character(char input) {
    if (input == '\0')
        return "\\0";
    return string(1, input);
}
// Reports whether this range and `other` overlap or sit directly next to each
// other, i.e. whether they can be merged into one contiguous range.
//
// Two inclusive ranges can be merged exactly when neither one starts more
// than one position past the end of the other. The previous two-clause test
// missed the case where this range lies strictly inside `other` (for example
// d-e against a-z), which let overlapping ranges survive a union.
//
// The arithmetic is done in `int` so that `max + 1` cannot wrap around the
// range of `char`.
bool CharacterRange::is_adjacent(const CharacterRange &other) const {
    const int this_min = min;
    const int this_max = max;
    const int other_min = other.min;
    const int other_max = other.max;
    return this_min <= other_max + 1 && other_min <= this_max + 1;
}
// Widens this range in place so that it also covers `other`: the lower bound
// becomes the smaller of the two minimums and the upper bound the larger of
// the two maximums.
void CharacterRange::add_range(const CharacterRange &other) {
    min = (other.min < min) ? other.min : min;
    max = (other.max > max) ? other.max : max;
}
// Renders the range as text:
//   - a single character is printed via escape_character();
//   - a range starting at 0 is printed as "<-" followed by max;
//   - a range ending at MAX_CHAR is printed as min followed by "->";
//   - anything else is printed as min "-" max.
string CharacterRange::to_string() const {
    if (min == max)
        return escape_character(min);
    if (min == 0)
        return string("<-") + max;
    if (max == MAX_CHAR)
        return string() + min + "->";
    return string() + min + "-" + max;
}
// Empty set: no ranges.
CharacterSet::CharacterSet() : ranges({}) {}
// Set containing exactly the given ranges.
CharacterSet::CharacterSet(const set<CharacterRange> &ranges) : ranges(ranges) {}
// When `sign` is true the given ranges are stored as-is; when false the set
// stores the ranges of the complement of the given ranges instead.
CharacterSet::CharacterSet(const set<CharacterRange> &ranges, bool sign) :
ranges(sign ? ranges : CharacterSet(ranges).complement().ranges) {}
// Equality against an arbitrary Rule: true only when `rule` is a CharacterSet
// holding the same ranges.
bool CharacterSet::operator==(const Rule &rule) const {
    const CharacterSet *same_kind = dynamic_cast<const CharacterSet *>(&rule);
    if (!same_kind)
        return false;
    return ranges == same_kind->ranges;
}
// Hash for this rule: XORs the hash_code of typeid(this) with the std::hash
// of the string produced by to_string().
// NOTE(review): `this` is a pointer, so typeid(this) presumably describes the
// pointer type rather than the dynamic class — confirm that is intended.
size_t CharacterSet::hash_code() const {
return typeid(this).hash_code() ^ hash<string>()(to_string());
}
// Returns a newly allocated shared copy of this character set.
rule_ptr CharacterSet::copy() const {
return std::make_shared<CharacterSet>(*this);
}
string CharacterSet::to_string() const {
string result("#<char {");
for (auto &range : ranges)
result += " " + range.to_string();
return result + " }>";
}
// Returns the set of all characters in [0, MAX_CHAR] that are NOT covered by
// this set's ranges.
//
// The ranges are walked in their sorted order; each gap before a range is
// emitted, and a final gap up to MAX_CHAR is emitted unless the last range's
// `max + 1` wrapped back around to 0.
//
// An empty set is handled up front: its complement is the full range.
// Without that guard the "wrapped to 0" test below is indistinguishable from
// "no ranges were visited", and the complement of the empty set came back
// empty.
CharacterSet CharacterSet::complement() const {
    set<CharacterRange> result;
    if (ranges.empty()) {
        result.insert(CharacterRange(0, MAX_CHAR));
        return CharacterSet(result);
    }

    char current_char = 0;
    for (auto &range : ranges) {
        // A range that begins at 0 has no gap in front of it.
        if (range.min != 0)
            result.insert(CharacterRange(current_char, range.min - 1));
        current_char = range.max + 1;
    }
    // current_char == 0 here means the last range ended at MAX_CHAR.
    if (current_char != 0)
        result.insert(CharacterRange(current_char, MAX_CHAR));
    return CharacterSet(result);
}
std::pair<CharacterSet, bool> CharacterSet::most_compact_representation() const {
auto first_range = *ranges.begin();
if (first_range.min == 0 && first_range.max > 0) {
return { this->complement(), false };
} else {
return { *this, true };
}
}
// Adds `new_range` to `self`, merging it with every existing range that
// is_adjacent() reports as mergeable. Ranges that are not merged are carried
// over unchanged; the merged range is inserted last and the rebuilt
// collection replaces self->ranges.
void add_range(CharacterSet *self, CharacterRange new_range) {
    set<CharacterRange> rebuilt;
    for (const CharacterRange &existing : self->ranges) {
        if (!existing.is_adjacent(new_range)) {
            rebuilt.insert(existing);
            continue;
        }
        // Absorb the existing range into the one being added.
        new_range.add_range(existing);
    }
    rebuilt.insert(new_range);
    self->ranges = rebuilt;
}
void CharacterSet::union_with(const CharacterSet &other) {
for (auto &other_range : other.ranges) {
add_range(this, other_range);
}
}
// Hands this character set to the visitor by calling visitor.visit(this).
void CharacterSet::accept(Visitor &visitor) const {
visitor.visit(this);
}
}
}

View file

@ -0,0 +1,64 @@
#ifndef __tree_sitter__character_set__
#define __tree_sitter__character_set__
#include "rule.h"
#include <set>
namespace tree_sitter {
namespace rules {
// An inclusive range of characters from `min` to `max`.
struct CharacterRange {
char min;
char max;
// Single-character range (min == max).
CharacterRange(char);
// Range with explicit lower and upper bounds.
CharacterRange(char, char);
bool operator==(const CharacterRange &) const;
// Ordering by min, then by max.
bool operator<(const CharacterRange &) const;
// Intended to tell whether two ranges can be merged; see the definition for
// the exact condition.
bool is_adjacent(const CharacterRange &) const;
// Widens this range to also cover the argument.
void add_range(const CharacterRange &);
std::string to_string() const;
};
}
}
namespace std {
    // Hash support for CharacterRange.
    //
    // The two endpoints are combined order-sensitively (multiply-and-add)
    // rather than with a plain XOR. XOR-ing the endpoint hashes gives the
    // same value for every single-character range (min == max cancels out)
    // and for any range and its mirror image, producing needless collisions.
    template<>
    struct hash<tree_sitter::rules::CharacterRange> {
        size_t operator()(const tree_sitter::rules::CharacterRange &range) const {
            size_t result = hash<char>()(range.min);
            result = result * 31 + hash<char>()(range.max);
            return result;
        }
    };
}
namespace tree_sitter {
namespace rules {
// Rule that matches characters described by an ordered collection of
// CharacterRange values.
class CharacterSet : public Rule {
public:
// Empty set.
CharacterSet();
// Set holding exactly the given ranges.
CharacterSet(const std::set<CharacterRange> &ranges);
// As above when the flag is true; when false, holds the complement of the
// given ranges.
CharacterSet(const std::set<CharacterRange> &ranges, bool);
// Returns a new set built from the gaps between this set's ranges.
CharacterSet complement() const;
// Adds every range of `other` to this set in place.
void union_with(const CharacterSet &other);
// Returns either this set with `true`, or its complement with `false`.
std::pair<CharacterSet, bool> most_compact_representation() const;
bool operator==(const Rule& other) const;
size_t hash_code() const;
rule_ptr copy() const;
std::string to_string() const;
void accept(Visitor &visitor) const;
// The ranges that make up this set.
std::set<CharacterRange> ranges;
};
}
}
namespace std {
// Hashing a CharacterSet reuses the hash specialization for Rule.
template<>
struct hash<tree_sitter::rules::CharacterSet> : hash<tree_sitter::rules::Rule> {};
}
#endif

View file

@ -2,6 +2,7 @@
using std::string;
using std::hash;
using std::set;
namespace tree_sitter {
namespace rules {
@ -38,18 +39,6 @@ namespace tree_sitter {
return result;
}
// Parses the inside of a bracketed character set. A leading '^' is consumed
// and marks the set as negated; single_char() results are then collected
// until ']' or the end of input, and the rule is built with character().
rule_ptr char_set() {
    const bool negated = (peek() == '^');
    if (negated)
        next();
    std::unordered_set<CharacterRange> collected;
    while (has_more_input() && (peek() != ']')) {
        collected.insert(single_char());
    }
    return character(collected, !negated);
}
rule_ptr atom() {
rule_ptr result;
switch (peek()) {
@ -63,7 +52,7 @@ namespace tree_sitter {
break;
case '[':
next();
result = char_set();
result = char_set().copy();
if (peek() != ']')
error("mismatched square brackets");
else
@ -73,13 +62,25 @@ namespace tree_sitter {
error("mismatched parens");
break;
default:
result = character({ single_char() }, true);
result = single_char().copy();
}
return result;
}
CharacterRange single_char() {
CharacterRange value('\0');
// Parses the inside of a bracketed character set. A leading '^' is consumed
// and marks the set as negated; the sets returned by single_char() are then
// unioned together until ']' or the end of input. A negated set is returned
// as the complement of the accumulated set.
CharacterSet char_set() {
    const bool negated = (peek() == '^');
    if (negated)
        next();
    CharacterSet accumulated;
    while (has_more_input() && (peek() != ']')) {
        accumulated.union_with(single_char());
    }
    if (negated)
        return accumulated.complement();
    return accumulated;
}
CharacterSet single_char() {
CharacterSet value({ '\0' });
switch (peek()) {
case '\\':
next();
@ -91,28 +92,28 @@ namespace tree_sitter {
next();
if (peek() == '-') {
next();
value = CharacterRange({ first_char, peek() });
value = CharacterSet({ {first_char, peek()} }, true);
next();
} else {
value = first_char;
value = CharacterSet({ first_char });
}
}
return value;
}
CharacterRange escaped_char(char value) {
CharacterSet escaped_char(char value) {
switch (value) {
case '\\':
case '(':
case ')':
return value;
return CharacterSet({ value });
case 'w':
return CharClassWord;
return CharacterSet({{'a', 'z'}, {'A', 'Z'}}, true);
case 'd':
return CharClassDigit;
return CharacterSet({{'0', '9'}}, true);
default:
error("unrecognized escape sequence");
return '\0';
return CharacterSet();
}
}

View file

@ -3,6 +3,7 @@
using std::make_shared;
using std::string;
using std::initializer_list;
using std::set;
namespace tree_sitter {
namespace rules {
@ -11,15 +12,16 @@ namespace tree_sitter {
}
rule_ptr character(char value) {
return make_shared<CharacterSet>(value);
set<CharacterRange> ranges = { value };
return make_shared<CharacterSet>(ranges);
}
rule_ptr character(CharacterClass value) {
return make_shared<CharacterSet>(value);
rule_ptr character(const set<CharacterRange> &ranges) {
return make_shared<CharacterSet>(ranges);
}
rule_ptr character(const std::unordered_set<CharacterRange> &matches, bool is_affirmative) {
return make_shared<CharacterSet>(matches, is_affirmative);
rule_ptr character(const set<CharacterRange> &ranges, bool sign) {
return make_shared<CharacterSet>(ranges, sign);
}
rule_ptr choice(const initializer_list<rule_ptr> &rules) {

View file

@ -8,7 +8,7 @@
#include "seq.h"
#include "string.h"
#include "pattern.h"
#include "character.h"
#include "character_set.h"
#include "repeat.h"
#include "visitor.h"
@ -16,9 +16,8 @@ namespace tree_sitter {
namespace rules {
rule_ptr blank();
rule_ptr character(char value);
rule_ptr character(CharacterClass value);
rule_ptr character(const std::unordered_set<CharacterRange> &matches);
rule_ptr character(const std::unordered_set<CharacterRange> &matches, bool);
rule_ptr character(const std::set<CharacterRange> &matches);
rule_ptr character(const std::set<CharacterRange> &matches, bool);
rule_ptr choice(const std::initializer_list<rule_ptr> &rules);
rule_ptr pattern(const std::string &value);

View file

@ -7,7 +7,7 @@
objects = {
/* Begin PBXBuildFile section */
12130605182C348F00FCF928 /* character.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12130603182C348F00FCF928 /* character.cpp */; };
12130605182C348F00FCF928 /* character_set.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12130603182C348F00FCF928 /* character_set.cpp */; };
1213060B182C389100FCF928 /* symbol.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12130609182C389100FCF928 /* symbol.cpp */; };
1213060E182C398300FCF928 /* choice.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1213060C182C398300FCF928 /* choice.cpp */; };
12130611182C3A1100FCF928 /* blank.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1213060F182C3A1100FCF928 /* blank.cpp */; };
@ -17,6 +17,7 @@
1225CC6418765693000D4723 /* prepare_grammar_spec.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1225CC6318765693000D4723 /* prepare_grammar_spec.cpp */; };
1251209B1830145300C9B56A /* rule.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1251209A1830145300C9B56A /* rule.cpp */; };
125120A4183083BD00C9B56A /* arithmetic.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 125120A3183083BD00C9B56A /* arithmetic.cpp */; };
12661BF418A1505A00A259FB /* character_set_spec.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12661BF318A1505A00A259FB /* character_set_spec.cpp */; };
12AB465F188BD03E00DE79DF /* follow_sets.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12AB465D188BD03E00DE79DF /* follow_sets.cpp */; };
12AB4661188CB3A300DE79DF /* item_set_closure_spec.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12AB4660188CB3A300DE79DF /* item_set_closure_spec.cpp */; };
12BC470518822B27005AC502 /* parse_config.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12BC470318822A17005AC502 /* parse_config.cpp */; };
@ -79,8 +80,8 @@
/* End PBXCopyFilesBuildPhase section */
/* Begin PBXFileReference section */
12130603182C348F00FCF928 /* character.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = character.cpp; sourceTree = "<group>"; };
12130604182C348F00FCF928 /* character.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = character.h; sourceTree = "<group>"; };
12130603182C348F00FCF928 /* character_set.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = character_set.cpp; sourceTree = "<group>"; };
12130604182C348F00FCF928 /* character_set.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = character_set.h; sourceTree = "<group>"; };
12130607182C374800FCF928 /* rule.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = rule.h; sourceTree = "<group>"; };
12130609182C389100FCF928 /* symbol.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = symbol.cpp; sourceTree = "<group>"; };
1213060A182C389100FCF928 /* symbol.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = symbol.h; sourceTree = "<group>"; };
@ -99,6 +100,7 @@
1251209A1830145300C9B56A /* rule.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = rule.cpp; sourceTree = "<group>"; };
125120A218307FFD00C9B56A /* test_grammars.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = test_grammars.h; path = spec/fixtures/grammars/test_grammars.h; sourceTree = SOURCE_ROOT; };
125120A3183083BD00C9B56A /* arithmetic.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = arithmetic.cpp; path = spec/fixtures/grammars/arithmetic.cpp; sourceTree = SOURCE_ROOT; };
12661BF318A1505A00A259FB /* character_set_spec.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = character_set_spec.cpp; sourceTree = SOURCE_ROOT; };
12AB465D188BD03E00DE79DF /* follow_sets.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = follow_sets.cpp; sourceTree = "<group>"; };
12AB465E188BD03E00DE79DF /* follow_sets.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = follow_sets.h; sourceTree = "<group>"; };
12AB4660188CB3A300DE79DF /* item_set_closure_spec.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = item_set_closure_spec.cpp; sourceTree = "<group>"; };
@ -189,8 +191,8 @@
children = (
1213060F182C3A1100FCF928 /* blank.cpp */,
12130610182C3A1100FCF928 /* blank.h */,
12130603182C348F00FCF928 /* character.cpp */,
12130604182C348F00FCF928 /* character.h */,
12130603182C348F00FCF928 /* character_set.cpp */,
12130604182C348F00FCF928 /* character_set.h */,
1213060C182C398300FCF928 /* choice.cpp */,
1213060D182C398300FCF928 /* choice.h */,
27A340F3EEB184C040521323 /* pattern.cpp */,
@ -269,6 +271,7 @@
children = (
121492EA181E200B008E9BDA /* rules_spec.cpp */,
12D136A0183570F5005F3369 /* pattern_spec.cpp */,
12661BF318A1505A00A259FB /* character_set_spec.cpp */,
);
name = rules;
path = compiler/rules;
@ -506,6 +509,7 @@
12FD40F7186A16020041A84E /* lex_table.cpp in Sources */,
12AB4661188CB3A300DE79DF /* item_set_closure_spec.cpp in Sources */,
12FD40E918641FB70041A84E /* rules.cpp in Sources */,
12661BF418A1505A00A259FB /* character_set_spec.cpp in Sources */,
12EDCF981881FCD5005A7A07 /* extract_tokens.cpp in Sources */,
12E75A971891BD32001B8F10 /* json.cpp in Sources */,
12FD4061185E68470041A84E /* c_code.cpp in Sources */,
@ -522,7 +526,7 @@
12FD40E718639B910041A84E /* visitor.cpp in Sources */,
12EDCF991881FCD9005A7A07 /* perform.cpp in Sources */,
12EDCFBC188205BF005A7A07 /* rule_transitions_spec.cpp in Sources */,
12130605182C348F00FCF928 /* character.cpp in Sources */,
12130605182C348F00FCF928 /* character_set.cpp in Sources */,
12EDCFB418820519005A7A07 /* compile.cpp in Sources */,
12BC470718830BC5005AC502 /* first_set_spec.cpp in Sources */,
1213060B182C389100FCF928 /* symbol.cpp in Sources */,