Refactor - represent char sets in terms of inclusions and exclusions

This commit is contained in:
Max Brunsfeld 2014-08-23 14:25:45 -07:00
parent 6f374fddff
commit 0bb5663f0f
21 changed files with 1004 additions and 565 deletions

View file

@ -59,7 +59,8 @@ LEX_FN() {
switch (lex_state) {
case 1:
START_TOKEN();
if (('\t' <= lookahead && lookahead <= '\n') ||
if ((lookahead == '\t') ||
(lookahead == '\n') ||
(lookahead == '\r') ||
(lookahead == ' '))
ADVANCE(1);
@ -88,7 +89,8 @@ LEX_FN() {
START_TOKEN();
if (lookahead == 0)
ADVANCE(6);
if (('\t' <= lookahead && lookahead <= '\n') ||
if ((lookahead == '\t') ||
(lookahead == '\n') ||
(lookahead == '\r') ||
(lookahead == ' '))
ADVANCE(5);
@ -117,7 +119,8 @@ LEX_FN() {
ACCEPT_TOKEN(ts_aux_sym_5);
case 12:
START_TOKEN();
if (('\t' <= lookahead && lookahead <= '\n') ||
if ((lookahead == '\t') ||
(lookahead == '\n') ||
(lookahead == '\r') ||
(lookahead == ' '))
ADVANCE(12);
@ -150,7 +153,8 @@ LEX_FN() {
START_TOKEN();
if (lookahead == 0)
ADVANCE(6);
if (('\t' <= lookahead && lookahead <= '\n') ||
if ((lookahead == '\t') ||
(lookahead == '\n') ||
(lookahead == '\r') ||
(lookahead == ' '))
ADVANCE(15);
@ -178,7 +182,8 @@ LEX_FN() {
START_TOKEN();
if (lookahead == 0)
ADVANCE(6);
if (('\t' <= lookahead && lookahead <= '\n') ||
if ((lookahead == '\t') ||
(lookahead == '\n') ||
(lookahead == '\r') ||
(lookahead == ' '))
ADVANCE(15);

View file

@ -442,33 +442,33 @@ LEX_FN() {
ADVANCE(3);
LEX_ERROR();
case 36:
if (!((lookahead == '\"') ||
(lookahead == '\\')))
ADVANCE(36);
if (lookahead == '\"')
ADVANCE(37);
if (lookahead == '\\')
ADVANCE(38);
if (!((lookahead == '\"') ||
(lookahead == '\\')))
ADVANCE(36);
LEX_ERROR();
case 37:
ACCEPT_TOKEN(ts_sym_string);
case 38:
if (!((lookahead == '\"') ||
(lookahead == '\\')))
ADVANCE(36);
if (lookahead == '\"')
ADVANCE(39);
if (lookahead == '\\')
ADVANCE(38);
LEX_ERROR();
case 39:
if (!((lookahead == '\"') ||
(lookahead == '\\')))
ADVANCE(36);
LEX_ERROR();
case 39:
if (lookahead == '\"')
ADVANCE(37);
if (lookahead == '\\')
ADVANCE(38);
if (!((lookahead == '\"') ||
(lookahead == '\\')))
ADVANCE(36);
ACCEPT_TOKEN(ts_sym_string);
case 40:
ACCEPT_TOKEN(ts_aux_sym_1);
@ -644,7 +644,8 @@ LEX_FN() {
if (('0' <= lookahead && lookahead <= '9') ||
('A' <= lookahead && lookahead <= 'Z') ||
(lookahead == '_') ||
('a' <= lookahead && lookahead <= 'b') ||
(lookahead == 'a') ||
(lookahead == 'b') ||
('d' <= lookahead && lookahead <= 'z'))
ADVANCE(33);
if (lookahead == 'c')
@ -727,7 +728,8 @@ LEX_FN() {
if (('0' <= lookahead && lookahead <= '9') ||
('A' <= lookahead && lookahead <= 'Z') ||
(lookahead == '_') ||
('a' <= lookahead && lookahead <= 'b') ||
(lookahead == 'a') ||
(lookahead == 'b') ||
('d' <= lookahead && lookahead <= 'z'))
ADVANCE(33);
if (lookahead == 'c')
@ -940,7 +942,8 @@ LEX_FN() {
ADVANCE(88);
if (('A' <= lookahead && lookahead <= 'Z') ||
('a' <= lookahead && lookahead <= 'e') ||
('g' <= lookahead && lookahead <= 'h') ||
(lookahead == 'g') ||
(lookahead == 'h') ||
('j' <= lookahead && lookahead <= 'q') ||
('s' <= lookahead && lookahead <= 'u') ||
('w' <= lookahead && lookahead <= 'z'))
@ -1310,7 +1313,8 @@ LEX_FN() {
ADVANCE(88);
if (('A' <= lookahead && lookahead <= 'Z') ||
('a' <= lookahead && lookahead <= 'd') ||
('g' <= lookahead && lookahead <= 'h') ||
(lookahead == 'g') ||
(lookahead == 'h') ||
('j' <= lookahead && lookahead <= 'q') ||
('s' <= lookahead && lookahead <= 'u') ||
('w' <= lookahead && lookahead <= 'z'))
@ -1542,7 +1546,8 @@ LEX_FN() {
ADVANCE(115);
if (('A' <= lookahead && lookahead <= 'Z') ||
('a' <= lookahead && lookahead <= 'e') ||
('g' <= lookahead && lookahead <= 'h') ||
(lookahead == 'g') ||
(lookahead == 'h') ||
('j' <= lookahead && lookahead <= 'q') ||
('s' <= lookahead && lookahead <= 'u') ||
('w' <= lookahead && lookahead <= 'z'))
@ -1617,7 +1622,8 @@ LEX_FN() {
ADVANCE(82);
if (('A' <= lookahead && lookahead <= 'Z') ||
('a' <= lookahead && lookahead <= 'e') ||
('g' <= lookahead && lookahead <= 'h') ||
(lookahead == 'g') ||
(lookahead == 'h') ||
('j' <= lookahead && lookahead <= 'q') ||
('s' <= lookahead && lookahead <= 'u') ||
('w' <= lookahead && lookahead <= 'z'))
@ -1703,7 +1709,8 @@ LEX_FN() {
ADVANCE(145);
if (('A' <= lookahead && lookahead <= 'Z') ||
('a' <= lookahead && lookahead <= 'e') ||
('g' <= lookahead && lookahead <= 'h') ||
(lookahead == 'g') ||
(lookahead == 'h') ||
('j' <= lookahead && lookahead <= 'q') ||
('s' <= lookahead && lookahead <= 'u') ||
('w' <= lookahead && lookahead <= 'z'))
@ -1850,9 +1857,11 @@ LEX_FN() {
ADVANCE(115);
if (('A' <= lookahead && lookahead <= 'Z') ||
('a' <= lookahead && lookahead <= 'd') ||
('g' <= lookahead && lookahead <= 'h') ||
(lookahead == 'g') ||
(lookahead == 'h') ||
('j' <= lookahead && lookahead <= 'l') ||
('n' <= lookahead && lookahead <= 'o') ||
(lookahead == 'n') ||
(lookahead == 'o') ||
(lookahead == 'q') ||
(lookahead == 'u') ||
('w' <= lookahead && lookahead <= 'z'))
@ -1917,7 +1926,8 @@ LEX_FN() {
if (('0' <= lookahead && lookahead <= '9') ||
('A' <= lookahead && lookahead <= 'Z') ||
(lookahead == '_') ||
('a' <= lookahead && lookahead <= 'b') ||
(lookahead == 'a') ||
(lookahead == 'b') ||
('d' <= lookahead && lookahead <= 'z'))
ADVANCE(33);
if (lookahead == 'c')
@ -2005,7 +2015,8 @@ LEX_FN() {
if (('0' <= lookahead && lookahead <= '9') ||
('A' <= lookahead && lookahead <= 'Z') ||
(lookahead == '_') ||
('a' <= lookahead && lookahead <= 'b') ||
(lookahead == 'a') ||
(lookahead == 'b') ||
('d' <= lookahead && lookahead <= 'z'))
ADVANCE(33);
if (lookahead == 'c')
@ -2189,9 +2200,11 @@ LEX_FN() {
ADVANCE(115);
if (('A' <= lookahead && lookahead <= 'Z') ||
('a' <= lookahead && lookahead <= 'd') ||
('g' <= lookahead && lookahead <= 'h') ||
(lookahead == 'g') ||
(lookahead == 'h') ||
('j' <= lookahead && lookahead <= 'l') ||
('n' <= lookahead && lookahead <= 'o') ||
(lookahead == 'n') ||
(lookahead == 'o') ||
(lookahead == 'q') ||
(lookahead == 'u') ||
('w' <= lookahead && lookahead <= 'z'))

File diff suppressed because it is too large Load diff

View file

@ -60,7 +60,8 @@ LEX_FN() {
switch (lex_state) {
case 1:
START_TOKEN();
if (('\t' <= lookahead && lookahead <= '\n') ||
if ((lookahead == '\t') ||
(lookahead == '\n') ||
(lookahead == '\r') ||
(lookahead == ' '))
ADVANCE(1);
@ -80,33 +81,33 @@ LEX_FN() {
ADVANCE(23);
LEX_ERROR();
case 2:
if (!((lookahead == '\"') ||
(lookahead == '\\')))
ADVANCE(2);
if (lookahead == '\"')
ADVANCE(3);
if (lookahead == '\\')
ADVANCE(4);
if (!((lookahead == '\"') ||
(lookahead == '\\')))
ADVANCE(2);
LEX_ERROR();
case 3:
ACCEPT_TOKEN(ts_sym_string);
case 4:
if (!((lookahead == '\"') ||
(lookahead == '\\')))
ADVANCE(2);
if (lookahead == '\"')
ADVANCE(5);
if (lookahead == '\\')
ADVANCE(4);
LEX_ERROR();
case 5:
if (!((lookahead == '\"') ||
(lookahead == '\\')))
ADVANCE(2);
LEX_ERROR();
case 5:
if (lookahead == '\"')
ADVANCE(3);
if (lookahead == '\\')
ADVANCE(4);
if (!((lookahead == '\"') ||
(lookahead == '\\')))
ADVANCE(2);
ACCEPT_TOKEN(ts_sym_string);
case 6:
if (lookahead == '.')
@ -186,7 +187,8 @@ LEX_FN() {
ACCEPT_TOKEN(ts_builtin_sym_end);
case 26:
START_TOKEN();
if (('\t' <= lookahead && lookahead <= '\n') ||
if ((lookahead == '\t') ||
(lookahead == '\n') ||
(lookahead == '\r') ||
(lookahead == ' '))
ADVANCE(26);
@ -199,7 +201,8 @@ LEX_FN() {
ACCEPT_TOKEN(ts_aux_sym_4);
case 28:
START_TOKEN();
if (('\t' <= lookahead && lookahead <= '\n') ||
if ((lookahead == '\t') ||
(lookahead == '\n') ||
(lookahead == '\r') ||
(lookahead == ' '))
ADVANCE(28);
@ -224,7 +227,8 @@ LEX_FN() {
ACCEPT_TOKEN(ts_aux_sym_6);
case 30:
START_TOKEN();
if (('\t' <= lookahead && lookahead <= '\n') ||
if ((lookahead == '\t') ||
(lookahead == '\n') ||
(lookahead == '\r') ||
(lookahead == ' '))
ADVANCE(30);
@ -247,7 +251,8 @@ LEX_FN() {
LEX_ERROR();
case 33:
START_TOKEN();
if (('\t' <= lookahead && lookahead <= '\n') ||
if ((lookahead == '\t') ||
(lookahead == '\n') ||
(lookahead == '\r') ||
(lookahead == ' '))
ADVANCE(33);
@ -292,7 +297,8 @@ LEX_FN() {
START_TOKEN();
if (lookahead == 0)
ADVANCE(25);
if (('\t' <= lookahead && lookahead <= '\n') ||
if ((lookahead == '\t') ||
(lookahead == '\n') ||
(lookahead == '\r') ||
(lookahead == ' '))
ADVANCE(38);
@ -320,9 +326,11 @@ LEX_FN() {
ADVANCE(27);
LEX_ERROR();
case ts_lex_state_error:
START_TOKEN();
if (lookahead == 0)
ADVANCE(25);
if (('\t' <= lookahead && lookahead <= '\n') ||
if ((lookahead == '\t') ||
(lookahead == '\n') ||
(lookahead == '\r') ||
(lookahead == ' '))
ADVANCE(38);

View file

@ -1,6 +1,7 @@
#include "compiler/compiler_spec_helper.h"
#include "compiler/build_tables/item_set_transitions.h"
#include "compiler/prepared_grammar.h"
#include "compiler/helpers/rule_helpers.h"
using namespace rules;
using namespace build_tables;
@ -11,16 +12,16 @@ describe("lexical item set transitions", []() {
describe("when two items in the set have transitions on the same character", [&]() {
it("merges the transitions by computing the union of the two item sets", [&]() {
LexItemSet set1({
LexItem(Symbol(1), character({ {'a', 'f'} })),
LexItem(Symbol(2), character({ {'e', 'x'} })) });
LexItem(Symbol(1), CharacterSet().include('a', 'f').copy()),
LexItem(Symbol(2), CharacterSet().include('e', 'x').copy()) });
AssertThat(char_transitions(set1), Equals(map<CharacterSet, LexItemSet>({
{ CharacterSet({ {'a', 'd'} }), LexItemSet({
{ CharacterSet().include('a', 'd'), LexItemSet({
LexItem(Symbol(1), blank()) }) },
{ CharacterSet({ {'e', 'f'} }), LexItemSet({
{ CharacterSet().include('e', 'f'), LexItemSet({
LexItem(Symbol(1), blank()),
LexItem(Symbol(2), blank()) }) },
{ CharacterSet({ {'g', 'x'} }), LexItemSet({
{ CharacterSet().include('g', 'x'), LexItemSet({
LexItem(Symbol(2), blank()) }) },
})));
});

View file

@ -6,7 +6,7 @@ using namespace build_tables;
START_TEST
describe("merging character set transitions", []() {
describe("merge_char_transitions", []() {
typedef map<CharacterSet, int> int_map;
auto do_merge = [&](int_map *left, const pair<CharacterSet, int> &new_pair) {
@ -18,20 +18,20 @@ describe("merging character set transitions", []() {
describe("when none of the transitions intersect", [&]() {
it("returns the union of the two sets of transitions", [&]() {
int_map map({
{ CharacterSet({ 'a', 'c' }), 1 },
{ CharacterSet({ 'x', 'y' }), 2 },
{ CharacterSet({ '1', '9' }), 4 },
{ CharacterSet().include('a').include('c'), 1 },
{ CharacterSet().include('x').include('y'), 2 },
{ CharacterSet().include('1').include('9'), 4 },
});
do_merge(&map, { CharacterSet({ ' ' }), 8 });
do_merge(&map, { CharacterSet({ '\t' }), 16 });
do_merge(&map, { CharacterSet().include(' '), 8 });
do_merge(&map, { CharacterSet().include('\t'), 16 });
AssertThat(map, Equals(int_map({
{ CharacterSet({ 'a', 'c' }), 1 },
{ CharacterSet({ 'x', 'y' }), 2 },
{ CharacterSet({ '1', '9' }), 4 },
{ CharacterSet({ ' ' }), 8 },
{ CharacterSet({ '\t' }), 16 },
{ CharacterSet().include('a').include('c'), 1 },
{ CharacterSet().include('x').include('y'), 2 },
{ CharacterSet().include('1').include('9'), 4 },
{ CharacterSet().include(' '), 8 },
{ CharacterSet().include('\t'), 16 },
})));
});
});
@ -39,18 +39,33 @@ describe("merging character set transitions", []() {
describe("when transitions intersect", [&]() {
it("merges the intersecting transitions using the provided function", [&]() {
int_map map({
{ CharacterSet({ {'a', 'f'}, {'A', 'F'} }), 1 },
{ CharacterSet({ {'0', '9'} }), 2 },
{ CharacterSet().include('a', 'f').include('A', 'F'), 1 },
{ CharacterSet().include('0', '9'), 2 },
});
do_merge(&map, { CharacterSet({ 'c' }), 4 });
do_merge(&map, { CharacterSet({ '3' }), 8 });
do_merge(&map, { CharacterSet().include('c'), 4 });
do_merge(&map, { CharacterSet().include('3'), 8 });
AssertThat(map, Equals(int_map({
{ CharacterSet({ {'a', 'b'}, {'d', 'f'}, {'A', 'F'} }), 1 },
{ CharacterSet({ {'c'} }), 5 },
{ CharacterSet({ {'0', '2'}, {'4', '9'} }), 2 },
{ CharacterSet({ '3' }), 10 },
{
CharacterSet()
.include('a', 'b')
.include('d', 'f')
.include('A', 'F'),
1
},
{
CharacterSet().include('c'),
5
},
{
CharacterSet().include('0', '2').include('4', '9'),
2
},
{
CharacterSet().include('3'),
10
},
})));
});
});
@ -58,15 +73,15 @@ describe("merging character set transitions", []() {
describe("when two of the right transitions intersect the same left transition", [&]() {
it("splits the left-hand transition correctly", [&]() {
int_map map1({
{ CharacterSet({ 'a', 'c' }), 1 },
{ CharacterSet().include('a').include('c'), 1 },
});
do_merge(&map1, { CharacterSet({ 'a' }), 2 });
do_merge(&map1, { CharacterSet({ 'c' }), 4 });
do_merge(&map1, { CharacterSet().include('a'), 2 });
do_merge(&map1, { CharacterSet().include('c'), 4 });
AssertThat(map1, Equals(int_map({
{ CharacterSet({ 'a' }), 3 },
{ CharacterSet({ 'c' }), 5 },
{ CharacterSet().include('a'), 3 },
{ CharacterSet().include('c'), 5 },
})));
});
});

View file

@ -8,7 +8,7 @@ using namespace build_tables;
START_TEST
describe("rule transitions", []() {
describe("sym_transitions", []() {
it("handles symbols", [&]() {
AssertThat(
sym_transitions(i_sym(1)),
@ -74,11 +74,26 @@ describe("rule transitions", []() {
})));
});
// Spec: sym_transitions must keep the Metadata wrapper around the remainder
// of a rule. The transition on Symbol(1) is expected to lead to i_sym(2),
// still wrapped in a Metadata rule carrying the same metadata map.
it("preserves metadata", [&]() {
map<MetadataKey, int> metadata_value({
{ PRECEDENCE, 5 }
});
// Two-symbol sequence wrapped in Metadata with the PRECEDENCE entry above.
rule_ptr rule = make_shared<Metadata>(seq({ i_sym(1), i_sym(2) }), metadata_value);
AssertThat(
sym_transitions(rule),
Equals(rule_map<Symbol>({
{ Symbol(1), make_shared<Metadata>(i_sym(2), metadata_value)},
})));
});
});
describe("char_transitions", []() {
it("handles characters", [&]() {
AssertThat(
char_transitions(character({ '1' })),
Equals(rule_map<CharacterSet>({
{ CharacterSet({ '1' }), blank() }
{ CharacterSet().include('1'), blank() }
})));
});
@ -92,9 +107,35 @@ describe("rule transitions", []() {
character({ { 'm', 'z' } }),
sym("y") }) })),
Equals(rule_map<CharacterSet>({
{ CharacterSet({ {'a','l'} }), sym("x") },
{ CharacterSet({ {'m','s'} }), choice({ sym("x"), sym("y") }) },
{ CharacterSet({ {'t','z'} }), sym("y") },
{ CharacterSet().include('a','l'), sym("x") },
{ CharacterSet().include('m','s'), choice({ sym("x"), sym("y") }) },
{ CharacterSet().include('t','z'), sym("y") },
})));
});
// Spec: char_transitions on a rule that mixes a negated character class with
// an explicit escape sequence. The rule under test is
//   ( [^/]  |  '\\' '/' )  '/'
// The expected transition map has exactly two keys:
//   * every character except '/' and '\\'  -> the trailing '/'
//   * '\\'                                 -> (blank | '/') followed by '/'
// NOTE(review): the '\\' entry presumably merges the two branches a backslash
// can take (matching the negated class, or starting the escape) — confirm
// against the char_transitions implementation.
it("handles choices between whitelisted and blacklisted character sets", [&]() {
AssertThat(
char_transitions(seq({
choice({
// Second argument `false` requests the negated set (anything but '/').
character({ '/' }, false),
seq({
character({ '\\' }),
character({ '/' }) }) }),
character({ '/' }) })),
Equals(rule_map<CharacterSet>({
// Key 1: all characters minus '/' and '\\'.
{ CharacterSet()
.include_all()
.exclude('/')
.exclude('\\'),
character({ '/' }) },
// Key 2: the backslash alone.
{ CharacterSet()
.include('\\'),
seq({
choice({
blank(),
character({ '/' }) }),
character({ '/' }) }) },
})));
});
@ -108,8 +149,8 @@ describe("rule transitions", []() {
character({ { 'a', 'z' } }),
sym("y") }) })),
Equals(rule_map<CharacterSet>({
{ CharacterSet({ {'a', 'c'} }), choice({ sym("x"), sym("y") }) },
{ CharacterSet({ {'d', 'z'} }), sym("y") },
{ CharacterSet().include('a', 'c'), choice({ sym("x"), sym("y") }) },
{ CharacterSet().include('d', 'z'), sym("y") },
})));
AssertThat(
@ -121,10 +162,9 @@ describe("rule transitions", []() {
character({ {'a', 'c'} }),
sym("y") }) })),
Equals(rule_map<CharacterSet>({
{ CharacterSet({ {'a', 'c'} }), choice({ sym("x"), sym("y") }) },
{ CharacterSet({ {'d', 'z'} }), sym("x") },
{ CharacterSet().include('a', 'c'), choice({ sym("x"), sym("y") }) },
{ CharacterSet().include('d', 'z'), sym("x") },
})));
});
it("handles blanks", [&]() {
@ -137,7 +177,7 @@ describe("rule transitions", []() {
char_transitions(rule),
Equals(rule_map<CharacterSet>({
{
CharacterSet({ 'a' }),
CharacterSet().include('a'),
seq({
character({ 'b' }),
rule,
@ -148,41 +188,9 @@ describe("rule transitions", []() {
AssertThat(
char_transitions(rule),
Equals(rule_map<CharacterSet>({
{ CharacterSet({ 'a' }), rule }
{ CharacterSet().include('a'), rule }
})));
});
it("preserves metadata", [&]() {
map<MetadataKey, int> metadata_value({
{ PRECEDENCE, 5 }
});
rule_ptr rule = make_shared<Metadata>(seq({ i_sym(1), i_sym(2) }), metadata_value);
AssertThat(
sym_transitions(rule),
Equals(rule_map<Symbol>({
{ Symbol(1), make_shared<Metadata>(i_sym(2), metadata_value)},
})));
});
describe("regression tests (somewhat redundant, should maybe be deleted later)", []() {
it("handles sequences that start with repeating characters", [&]() {
auto rule = seq({
choice({
repeat(character({ '"' }, false)),
blank(),
}),
character({ '"' }),
});
AssertThat(char_transitions(rule), Equals(rule_map<CharacterSet>({
{ CharacterSet({ '"' }).complement(), seq({
repeat(character({ '"' }, false)),
character({ '"' }), }) },
{ CharacterSet({ '"' }), blank() },
})));
});
});
});
END_TEST

View file

@ -44,7 +44,7 @@ class rule_list : public vector<pair<string, rule_ptr>> {
return true;
}
rule_list(const initializer_list<pair<string, rule_ptr>> &list) :
rule_list(const initializer_list<pair<string, rule_ptr>> &list) :
vector<pair<string, rule_ptr>>(list) {}
};

View file

@ -9,14 +9,20 @@ namespace tree_sitter {
namespace rules {
rule_ptr character(const set<CharacterRange> &ranges) {
return make_shared<CharacterSet>(ranges);
return character(ranges, true);
}
rule_ptr character(const set<CharacterRange> &ranges, bool sign) {
if (sign)
return character(ranges);
else
return CharacterSet(ranges).complement().copy();
CharacterSet result;
if (sign) {
for (auto &range : ranges)
result.include(range.min, range.max);
} else {
result.include_all();
for (auto &range : ranges)
result.exclude(range.min, range.max);
}
return result.copy();
}
rule_ptr i_sym(size_t index) {

View file

@ -71,7 +71,7 @@ describe("expanding repeat rules in a grammar", []() {
AssertThat(match.rules, Equals(rule_list({
{ "rule0", seq({ i_aux_sym(0), i_aux_sym(1) }) },
})));
AssertThat(match.aux_rules, Equals(rule_list({
{ "rule0_repeat0", choice({
seq({

View file

@ -6,7 +6,7 @@ START_TEST
using namespace rules;
using prepare_grammar::parse_regex;
describe("parsing regex patterns", []() {
describe("parse_regex", []() {
struct ValidInputRow {
string description;
string pattern;
@ -23,7 +23,7 @@ describe("parsing regex patterns", []() {
{
"'.' characters as wildcards",
".",
CharacterSet({'\n'}).complement().copy()
character({ '\n' }, false)
},
{
@ -170,6 +170,19 @@ describe("parsing regex patterns", []() {
blank()
})
})
},
{
"choices containing negated character classes",
"/([^/]|(\\\\/))*/",
seq({
character({ '/' }),
repeat(choice({
character({ '/' }, false),
seq({ character({ '\\' }), character({ '/' }) }),
})),
character({ '/' }),
}),
}
};

View file

@ -5,106 +5,327 @@ using namespace rules;
START_TEST
describe("character sets", []() {
unsigned char max_char = 255;
describe("CharacterSet", []() {
describe("equality", [&]() {
it("returns true for identical character sets", [&]() {
CharacterSet set1 = CharacterSet()
.include('a', 'd')
.include('f', 'm');
describe("computing the complement", [&]() {
it("works for the set containing only the null character", [&]() {
CharacterSet set1({ '\0' });
auto set2 = set1.complement();
AssertThat(set2, Equals(CharacterSet({
{ 1, max_char }
})));
AssertThat(set2.complement(), Equals(set1));
CharacterSet set2 = CharacterSet()
.include('a', 'd')
.include('f', 'm');
AssertThat(set1, Equals(set2));
});
it("works for single character sets", [&]() {
CharacterSet set1({ 'b' });
auto set2 = set1.complement();
AssertThat(set2, Equals(CharacterSet({
{ 0, 'a' },
{ 'c', max_char },
})));
AssertThat(set2.complement(), Equals(set1));
it("returns false for character sets that include different ranges", [&]() {
CharacterSet set1 = CharacterSet()
.include('a', 'd')
.include('f', 'm');
CharacterSet set2 = CharacterSet()
.include('a', 'c')
.include('f', 'm');
AssertThat(set1, !Equals(set2));
AssertThat(set2, !Equals(set1));
});
it("returns false for character sets that exclude different ranges", [&]() {
CharacterSet set1 = CharacterSet()
.include_all()
.exclude('a', 'd')
.exclude('f', 'm');
CharacterSet set2 = CharacterSet()
.include_all()
.exclude('a', 'c')
.exclude('f', 'm');
AssertThat(set1, !Equals(set2));
AssertThat(set2, !Equals(set1));
});
it("returns false for character sets with different sign", [&]() {
CharacterSet set1 = CharacterSet().include_all();
CharacterSet set2 = CharacterSet();
AssertThat(set1, !Equals(set2));
AssertThat(set2, !Equals(set1));
});
});
describe("computing unions", [&]() {
it("works for disjoint sets", [&]() {
CharacterSet set({ {'a', 'z'} });
set.add_set(CharacterSet({ {'A', 'Z'} }));
AssertThat(set, Equals(CharacterSet({ {'a', 'z'}, {'A', 'Z'} })));
describe("hashing", [&]() {
it("returns the same number for identical character sets", [&]() {
CharacterSet set1 = CharacterSet()
.include('a', 'd')
.include('f', 'm');
CharacterSet set2 = CharacterSet()
.include('a', 'd')
.include('f', 'm');
AssertThat(set1.hash_code(), Equals(set2.hash_code()));
});
it("works for sets with adjacent ranges", [&]() {
CharacterSet set({ CharacterRange('a', 'r') });
set.add_set(CharacterSet({ CharacterRange('s', 'z') }));
AssertThat(set, Equals(CharacterSet({ {'a', 'z'} })));
it("returns different numbers for character sets that include different ranges", [&]() {
CharacterSet set1 = CharacterSet()
.include('a', 'd')
.include('f', 'm');
CharacterSet set2 = CharacterSet()
.include('a', 'c')
.include('f', 'm');
AssertThat(set1.hash_code(), !Equals(set2.hash_code()));
AssertThat(set2.hash_code(), !Equals(set1.hash_code()));
});
it("becomes the complete set when the complement is added", [&]() {
CharacterSet set({ 'c' });
auto complement = set.complement();
set.add_set(complement);
AssertThat(set, Equals(CharacterSet({ {0, max_char} })));
it("returns different numbers for character sets that exclude different ranges", [&]() {
CharacterSet set1 = CharacterSet()
.include_all()
.exclude('a', 'd')
.exclude('f', 'm');
CharacterSet set2 = CharacterSet()
.include_all()
.exclude('a', 'c')
.exclude('f', 'm');
AssertThat(set1.hash_code(), !Equals(set2.hash_code()));
AssertThat(set2.hash_code(), !Equals(set1.hash_code()));
});
it("works when the result becomes a continuous range", []() {
CharacterSet set({ {'a', 'd'}, {'f', 'z'} });
set.add_set(CharacterSet({ {'c', 'g'} }));
AssertThat(set, Equals(CharacterSet({ {'a', 'z'} })));
});
it("returns different numbers for character sets with different sign", [&]() {
CharacterSet set1 = CharacterSet().include_all();
CharacterSet set2 = CharacterSet();
it("does nothing for the set of all characters", [&]() {
CharacterSet set({ 'a' });
set.add_set(set.complement());
AssertThat(set, Equals(CharacterSet({ {'\0', max_char} })));
AssertThat(set1.hash_code(), !Equals(set2.hash_code()));
AssertThat(set2.hash_code(), !Equals(set1.hash_code()));
});
});
describe("subtracting sets", []() {
// Specs for CharacterSet::is_empty: true only for a default-constructed set;
// false once everything is included or a single character has been included.
describe("::is_empty", [&]() {
it("returns true for empty character sets", [&]() {
AssertThat(CharacterSet().is_empty(), Equals(true));
});
it("returns false for full character sets", [&]() {
AssertThat(CharacterSet().include_all().is_empty(), Equals(false));
});
it("returns false for character sets that include some characters", [&]() {
AssertThat(CharacterSet().include('x').is_empty(), Equals(false));
});
});
// Specs for CharacterSet::include. Two starting shapes are covered: a set
// built only from include() calls ("whitelist") and a set built from
// include_all() followed by exclude() calls ("blacklist").
describe("::include", [&]() {
describe("when the set has a whitelist of characters", [&]() {
// include(min, max) is expected to be inclusive of both ends: 'a'..'d'
// must equal including a, b, c and d one at a time.
it("adds included characters", [&]() {
CharacterSet set1 = CharacterSet().include('a', 'd');
AssertThat(set1, Equals(CharacterSet()
.include('a')
.include('b')
.include('c')
.include('d')));
});
});
describe("when the set has a blacklist of characters", [&]() {
// Re-including 'c'..'e' inside the excluded span 'a'..'g' must leave only
// a, b, f and g excluded.
it("removes excluded characters", [&]() {
CharacterSet set1 = CharacterSet()
.include_all()
.exclude('a', 'g')
.include('c', 'e');
AssertThat(set1, Equals(CharacterSet()
.include_all()
.exclude('a')
.exclude('b')
.exclude('f')
.exclude('g')));
});
// Including characters that were never excluded must leave the set equal to
// a plain include_all().
it("does nothing if the character are already not excluded", [&]() {
CharacterSet set1 = CharacterSet()
.include_all()
.include('a', 'c');
AssertThat(set1, Equals(CharacterSet().include_all()));
});
});
});
// Specs for CharacterSet::exclude, again for both a whitelist-style set
// (include() calls only) and a blacklist-style set (include_all() first).
describe("::exclude", [&]() {
describe("when the set has a whitelist of characters", [&]() {
// Excluding 'c'..'e' from the included span 'a'..'g' must leave exactly
// a, b, f and g included.
it("removes included characters", [&]() {
CharacterSet set1 = CharacterSet()
.include('a', 'g')
.exclude('c', 'e');
AssertThat(set1, Equals(CharacterSet()
.include('a')
.include('b')
.include('f')
.include('g')));
});
// Excluding from an empty set must leave it equal to the empty set.
it("does nothing if the character's are already not included", [&]() {
CharacterSet set1 = CharacterSet().exclude('a', 'c');
AssertThat(set1, Equals(CharacterSet()));
});
});
describe("when the set has a blacklist of characters", [&]() {
// exclude(min, max) on a full set must equal excluding a, b, c and d one at a
// time (both ends inclusive).
// NOTE(review): the title says "removes excluded characters", but the
// assertion shows exclusions being added; the wording looks carried over from
// the ::include spec — confirm the intended description.
it("removes excluded characters", [&]() {
CharacterSet set1 = CharacterSet()
.include_all()
.exclude('a', 'd');
AssertThat(set1, Equals(CharacterSet()
.include_all()
.exclude('a')
.exclude('b')
.exclude('c')
.exclude('d')));
});
});
});
describe("::remove_set", []() {
CharacterSet intersection;
it("works for disjoint sets", [&]() {
CharacterSet set1({ {'a', 'z'} });
intersection = set1.remove_set(CharacterSet({ {'A', 'Z'} }));
AssertThat(set1, Equals(CharacterSet({ {'a', 'z'} })));
AssertThat(intersection, Equals(CharacterSet()));
// Specs for remove_set when the receiver is a whitelist-style set (built from
// include() calls only). Per these assertions, remove_set mutates the receiver
// and returns the characters that were removed; `intersection` is the
// CharacterSet declared in the enclosing describe and captured by reference.
describe("for a set with whitelisted characters", [&]() {
describe("when the subtractend has whitelisted characters", [&]() {
// 'a'..'z' minus 'd'..'s' leaves 'a'..'c' and 't'..'z' in the receiver.
it("removes the included characters that the other set also includes", [&]() {
CharacterSet set1 = CharacterSet().include('a', 'z');
set1.remove_set(CharacterSet().include('d', 's'));
AssertThat(set1, Equals(CharacterSet()
.include('a', 'c')
.include('t', 'z')));
});
// Same inputs as above; here the return value is checked: 'd'..'s'.
it("returns the characters that were removed", [&]() {
CharacterSet set1 = CharacterSet().include('a', 'z');
intersection = set1.remove_set(CharacterSet().include('d', 's'));
AssertThat(intersection, Equals(CharacterSet()
.include('d', 's')));
});
// Lower-case and upper-case ranges share no characters, so the receiver is
// unchanged and the returned set is empty.
it("returns the empty set when the sets are disjoint", [&]() {
CharacterSet set1 = CharacterSet().include('a', 'z');
intersection = set1.remove_set(CharacterSet().include('A', 'Z'));
AssertThat(set1, Equals(CharacterSet().include('a', 'z')));
AssertThat(intersection, Equals(CharacterSet()));
});
});
describe("when the subtractend has blacklisted characters", [&]() {
// The subtrahend contains everything except 'd'..'z', so of the receiver's
// 'a'..'f' only 'd'..'f' survives; 'a'..'c' is removed and returned.
it("removes the included characters that are not excluded by the other set", [&]() {
CharacterSet set1 = CharacterSet().include('a', 'f');
intersection = set1.remove_set(CharacterSet()
.include_all()
.exclude('d', 'z'));
AssertThat(set1, Equals(CharacterSet()
.include('d', 'f')));
AssertThat(intersection, Equals(CharacterSet()
.include('a', 'c')));
});
});
});
it("works when one set is a proper subset of the other", [&]() {
CharacterSet set1({ {'a','z'} });
intersection = set1.remove_set(CharacterSet({ {'d', 's'} }));
AssertThat(set1, Equals(CharacterSet({ {'a', 'c'}, {'t', 'z'} })));
AssertThat(intersection, Equals(CharacterSet({ {'d', 's'} })));
// Specs for remove_set when the receiver is a blacklist-style set (built from
// include_all() followed by exclude() calls). `intersection` is the
// CharacterSet declared in the enclosing describe and captured by reference.
describe("for a set with blacklisted characters", [&]() {
describe("when the subtractend has whitelisted characters", [&]() {
// Removing 'x'..'z' from "everything except 'a'..'f'" must add 'x'..'z' to the
// receiver's exclusions and return 'x'..'z' as the removed characters.
it("adds the subtractend's inclusions to the receiver's exclusions", [&]() {
CharacterSet set1 = CharacterSet()
.include_all()
.exclude('a', 'f');
intersection = set1.remove_set(CharacterSet()
.include('x', 'z'));
AssertThat(set1, Equals(CharacterSet()
.include_all()
.exclude('a', 'f')
.exclude('x', 'z')));
AssertThat(intersection, Equals(CharacterSet().include('x', 'z')));
});
});
describe("when the subtractend has blacklisted characters", [&]() {
// Receiver: all but 'a'..'m'. Subtrahend: all but 'd'..'z'. What remains is
// what the subtrahend lacks and the receiver has: 'n'..'z', expressed as a
// whitelist.
it("includes only the characters excluded by the subtractend but not by the receiver", [&]() {
CharacterSet set1 = CharacterSet()
.include_all()
.exclude('a', 'm');
set1.remove_set(CharacterSet()
.include_all()
.exclude('d', 'z'));
AssertThat(set1, Equals(CharacterSet()
.include('n', 'z')));
});
// Same inputs; the returned set is everything outside 'a'..'z', i.e. the
// characters neither operand excludes.
it("returns the characters excluded by neither set", [&]() {
CharacterSet set1 = CharacterSet()
.include_all()
.exclude('a', 'm');
intersection = set1.remove_set(CharacterSet()
.include_all()
.exclude('d', 'z'));
AssertThat(intersection, Equals(CharacterSet()
.include_all()
.exclude('a', 'z')));
});
// NOTE(review): despite the title, the operands here are identical to the two
// specs above and are not disjoint (the asserted intersection is non-empty).
// This spec only re-checks both results together — confirm whether different
// inputs or a different title were intended.
it("works when the sets are disjoint", [&]() {
CharacterSet set1 = CharacterSet()
.include_all()
.exclude('a', 'm');
intersection = set1.remove_set(CharacterSet()
.include_all()
.exclude('d', 'z'));
AssertThat(set1, Equals(CharacterSet()
.include('n', 'z')));
AssertThat(intersection, Equals(CharacterSet()
.include_all()
.exclude('a', 'z')));
});
});
});
});
describe("::included_ranges", [&]() {
it("consolidates sequences of 3 or more consecutive characters into ranges", [&]() {
CharacterSet set1 = CharacterSet()
.include('a', 'c')
.include('g')
.include('z');
AssertThat(set1.included_ranges(), Equals(vector<CharacterRange>({
CharacterRange('a', 'c'),
CharacterRange('g'),
CharacterRange('z'),
})));
});
it("works for a set that overlaps the right side", [&]() {
CharacterSet set1({ {'a','s'} });
intersection = set1.remove_set(CharacterSet({ {'m', 'z'} }));
AssertThat(set1, Equals(CharacterSet({ {'a', 'l'} })));
AssertThat(intersection, Equals(CharacterSet({ {'m', 's'} })));
});
it("doesn't consolidate sequences of 2 consecutive characters", [&]() {
CharacterSet set1 = CharacterSet()
.include('a', 'b')
.include('g')
.include('z');
it("works for a set that overlaps the left side", [&]() {
CharacterSet set2({ {'m','z'} });
intersection = set2.remove_set(CharacterSet({ {'a', 's'} }));
AssertThat(set2, Equals(CharacterSet({ {'t', 'z'} })));
AssertThat(intersection, Equals(CharacterSet({ {'m', 's'} })));
});
it("works for sets with multiple ranges", [&]() {
CharacterSet set1({ {'a', 'd'}, {'m', 'z'} });
intersection = set1.remove_set(CharacterSet({ {'c', 'o'}, {'s', 'x'} }));
AssertThat(set1, Equals(CharacterSet({ {'a', 'b'}, {'p', 'r'}, {'y', 'z'} })));
AssertThat(intersection, Equals(CharacterSet({ {'c', 'd'}, {'m', 'o'}, {'s', 'x'} })));
});
it("works when the result is empty", [&]() {
CharacterSet set1({ 'd' });
intersection = set1.remove_set(CharacterSet({ 'a', 'd', 'x' }));
AssertThat(set1, Equals(CharacterSet()));
AssertThat(intersection, Equals(CharacterSet({ 'd' })));
AssertThat(set1.included_ranges(), Equals(vector<CharacterRange>({
CharacterRange('a'),
CharacterRange('b'),
CharacterRange('g'),
CharacterRange('z'),
})));
});
});
});

View file

@ -38,8 +38,8 @@ class LexTableBuilder {
if (symbol == rules::ERROR())
continue;
else if (symbol == rules::END_OF_INPUT())
result.insert(
LexItem(symbol, after_separators(CharacterSet({ 0 }).copy())));
result.insert(LexItem(
symbol, after_separators(CharacterSet().include(0).copy())));
else if (symbol.is_token())
result.insert(
LexItem(symbol, after_separators(lex_grammar.rule(symbol))));
@ -52,9 +52,11 @@ class LexTableBuilder {
if (pair == lex_state_ids.end()) {
LexStateId state_id = lex_table.add_state();
lex_state_ids[item_set] = state_id;
add_accept_token_actions(item_set, state_id);
add_advance_actions(item_set, state_id);
add_token_start(item_set, state_id);
return state_id;
} else {
return pair->second;
@ -100,10 +102,10 @@ class LexTableBuilder {
}
CharacterSet separator_set() const {
set<rules::CharacterRange> ranges;
CharacterSet result;
for (char c : lex_grammar.separators)
ranges.insert(c);
return CharacterSet(ranges);
result.include(c);
return result;
}
rules::rule_ptr after_separators(rules::rule_ptr rule) {

View file

@ -68,7 +68,7 @@ void merge_char_transition(std::map<rules::CharacterSet, T> *left,
left->insert(pairs_to_insert.begin(), pairs_to_insert.end());
if (!new_char_set.is_empty())
left->insert({ new_char_set, new_pair.second });
left->insert({ new_char_set, new_value });
}
} // namespace build_tables

View file

@ -245,12 +245,13 @@ class CCodeGenerator {
}
}
void condition_for_character_set(const rules::CharacterSet &set) {
if (set.ranges.size() == 1) {
add(condition_for_character_range(*set.ranges.begin()));
void condition_for_character_ranges(
const vector<rules::CharacterRange> &ranges) {
if (ranges.size() == 1) {
add(condition_for_character_range(*ranges.begin()));
} else {
bool first = true;
for (auto &match : set.ranges) {
for (auto &match : ranges) {
string part = "(" + condition_for_character_range(match) + ")";
if (first) {
add(part);
@ -263,15 +264,13 @@ class CCodeGenerator {
}
}
void condition_for_character_rule(const rules::CharacterSet &rule) {
pair<rules::CharacterSet, bool> representation =
rule.most_compact_representation();
if (representation.second) {
condition_for_character_set(representation.first);
} else {
void condition_for_character_set(const rules::CharacterSet &rule) {
if (rule.includes_all) {
add("!(");
condition_for_character_set(rule.complement());
condition_for_character_ranges(rule.excluded_ranges());
add(")");
} else {
condition_for_character_ranges(rule.included_ranges());
}
}
@ -319,7 +318,7 @@ class CCodeGenerator {
line("START_TOKEN();");
for (auto pair : lex_state.actions)
if (!pair.first.is_empty())
_if([&]() { condition_for_character_rule(pair.first); },
_if([&]() { condition_for_character_set(pair.first); },
[&]() { code_for_lex_actions(pair.second, expected_inputs); });
code_for_lex_actions(lex_state.default_action, expected_inputs);
}

View file

@ -28,7 +28,7 @@ class ExpandTokens : public rules::IdentityRuleFn {
rule_ptr apply_to(const String *rule) {
vector<rule_ptr> elements;
for (char val : rule->value)
elements.push_back(rules::CharacterSet({ val }).copy());
elements.push_back(rules::CharacterSet().include(val).copy());
return rules::Seq::Build(elements);
}

View file

@ -115,7 +115,7 @@ class PatternParser {
case ']': { return error("unmatched close square bracket"); }
case '.': {
next();
return { CharacterSet({ '\n' }).complement().copy(), nullptr };
return { CharacterSet().include_all().exclude('\n').copy(), nullptr };
}
default: {
auto pair = single_char();
@ -127,20 +127,24 @@ class PatternParser {
}
pair<CharacterSet, const GrammarError *> char_set() {
CharacterSet result;
bool is_affirmative = true;
if (peek() == '^') {
next();
is_affirmative = false;
result.include_all();
}
CharacterSet result;
while (has_more_input() && (peek() != ']')) {
auto pair = single_char();
if (pair.second)
return { CharacterSet(), pair.second };
result.add_set(pair.first);
if (is_affirmative)
result.add_set(pair.first);
else
result.remove_set(pair.first);
}
if (!is_affirmative)
result = result.complement();
return { result, nullptr };
}
@ -157,10 +161,10 @@ class PatternParser {
next();
if (peek() == '-') {
next();
value = CharacterSet({ CharacterRange(first_char, peek()) });
value = CharacterSet().include(first_char, peek());
next();
} else {
value = CharacterSet({ first_char });
value = CharacterSet().include(first_char);
}
}
return { value, nullptr };
@ -169,19 +173,20 @@ class PatternParser {
CharacterSet escaped_char(char value) {
switch (value) {
case 'a':
return CharacterSet({ { 'a', 'z' }, { 'A', 'Z' } });
return CharacterSet().include('a', 'z').include('A', 'Z');
case 'w':
return CharacterSet({ { 'a', 'z' }, { 'A', 'Z' }, { '0', '9' } });
return CharacterSet().include('a', 'z').include('A', 'Z').include('0',
'9');
case 'd':
return CharacterSet({ { '0', '9' } });
return CharacterSet().include('0', '9');
case 't':
return CharacterSet({ '\t' });
return CharacterSet().include('\t');
case 'n':
return CharacterSet({ '\n' });
return CharacterSet().include('\n');
case 'r':
return CharacterSet({ '\r' });
return CharacterSet().include('\r');
default:
return CharacterSet({ value });
return CharacterSet().include(value);
}
}

View file

@ -5,6 +5,7 @@
namespace tree_sitter {
namespace rules {
using std::ostream;
using std::string;
static const unsigned char MAX_CHAR = -1;
@ -53,5 +54,9 @@ string CharacterRange::to_string() const {
return string() + escape_character(min) + "-" + escape_character(max);
}
// Streams a CharacterRange using the text produced by its to_string().
// Returns the same stream so insertions can be chained.
ostream &operator<<(ostream &stream, const CharacterRange &range) {
  stream << range.to_string();
  return stream;
}
} // namespace rules
} // namespace tree_sitter

View file

@ -20,6 +20,8 @@ struct CharacterRange {
std::string to_string() const;
};
std::ostream &operator<<(std::ostream &stream, const CharacterRange &rule);
} // namespace rules
} // namespace tree_sitter

View file

@ -1,6 +1,7 @@
#include "compiler/rules/character_set.h"
#include <string>
#include <utility>
#include <vector>
#include "compiler/rules/visitor.h"
namespace tree_sitter {
@ -9,32 +10,87 @@ namespace rules {
using std::string;
using std::hash;
using std::set;
using std::pair;
using std::initializer_list;
using std::vector;
static const unsigned char MAX_CHAR = -1;
// Inserts every character code in the closed interval [range.min, range.max]
// into `characters`.
static void add_range(set<uint32_t> *characters, CharacterRange range) {
  uint32_t current = range.min;
  while (current <= range.max) {
    characters->insert(current);
    current++;
  }
}
CharacterSet::CharacterSet() : ranges({}) {}
CharacterSet::CharacterSet(const set<CharacterRange> &ranges)
: ranges(ranges) {}
CharacterSet::CharacterSet(const initializer_list<CharacterRange> &ranges)
: ranges(ranges) {}
// Erases every character code in the closed interval [range.min, range.max]
// from `characters`; codes that are not present are ignored.
static void remove_range(set<uint32_t> *characters, CharacterRange range) {
  uint32_t current = range.min;
  while (current <= range.max) {
    characters->erase(current);
    current++;
  }
}
// Erases from `*left` every character that appears in `right`.
// Returns the characters that were actually present in `*left` and removed,
// i.e. the intersection of the two sets as they were on entry.
static set<uint32_t> remove_chars(set<uint32_t> *left,
                                  const set<uint32_t> &right) {
  set<uint32_t> removed;
  for (auto it = right.begin(); it != right.end(); ++it) {
    // erase() reports how many elements it dropped (0 or 1 for a set).
    const bool was_present = left->erase(*it) != 0;
    if (was_present)
      removed.insert(*it);
  }
  return removed;
}
// Inserts every character of `right` into `*left`.
// Returns only the characters that were newly added, i.e. those in `right`
// that `*left` did not already contain on entry.
static set<uint32_t> add_chars(set<uint32_t> *left,
                               const set<uint32_t> &right) {
  set<uint32_t> added;
  for (auto it = right.begin(); it != right.end(); ++it) {
    // insert() yields {iterator, inserted?}; the flag is false for duplicates.
    const auto outcome = left->insert(*it);
    if (outcome.second)
      added.insert(*it);
  }
  return added;
}
static vector<CharacterRange> consolidate_ranges(const set<uint32_t> &chars) {
vector<CharacterRange> result;
for (uint32_t c : chars) {
size_t size = result.size();
if (size >= 2 && result[size - 2].max == (c - 2)) {
result.pop_back();
result.back().max = c;
} else if (size >= 1) {
CharacterRange &last = result.back();
if (last.min < last.max && last.max == (c - 1))
last.max = c;
else
result.push_back(c);
} else {
result.push_back(c);
}
}
return result;
}
// Constructs the empty set: nothing is included wholesale and both the
// inclusion and exclusion lists start out empty.
CharacterSet::CharacterSet()
    : includes_all(false), included_chars(), excluded_chars() {}
bool CharacterSet::operator==(const Rule &rule) const {
const CharacterSet *other = dynamic_cast<const CharacterSet *>(&rule);
return other && (ranges == other->ranges);
return other && (includes_all == other->includes_all) &&
(included_chars == other->included_chars) &&
(excluded_chars == other->excluded_chars);
}
bool CharacterSet::operator<(const CharacterSet &other) const {
return ranges < other.ranges;
if (!includes_all && other.includes_all)
return true;
if (includes_all && !other.includes_all)
return false;
if (included_chars < other.included_chars)
return true;
if (other.included_chars < included_chars)
return false;
return excluded_chars < other.excluded_chars;
}
size_t CharacterSet::hash_code() const {
size_t result = std::hash<size_t>()(ranges.size());
for (auto &range : ranges) {
result ^= std::hash<unsigned char>()(range.min);
result ^= std::hash<unsigned char>()(range.max);
}
size_t result = hash<bool>()(includes_all);
result ^= hash<size_t>()(included_chars.size());
for (auto &c : included_chars)
result ^= hash<uint32_t>()(c);
result ^= hash<size_t>()(excluded_chars.size());
for (auto &c : excluded_chars)
result ^= hash<uint32_t>()(c);
return result;
}
@ -44,97 +100,88 @@ rule_ptr CharacterSet::copy() const {
string CharacterSet::to_string() const {
string result("(char");
for (auto &range : ranges)
result += " " + range.to_string();
if (includes_all)
result += " include_all";
if (!included_chars.empty()) {
result += " (include";
for (auto r : included_ranges())
result += string(" ") + r.to_string();
result += ")";
}
if (!excluded_chars.empty()) {
result += " (exclude";
for (auto r : excluded_ranges())
result += string(" ") + r.to_string();
result += ")";
}
return result + ")";
}
CharacterSet CharacterSet::complement() const {
CharacterSet result({ { 0, MAX_CHAR } });
result.remove_set(*this);
return result;
// Makes this set match every character and returns *this for chaining.
//
// Previously recorded inclusions and exclusions are discarded. Stale
// exclusions would mean the set is still not "all", and stale inclusions are
// meaningless once `includes_all` is set, yet operator==, operator< and
// hash_code compare the raw members, so leaving them behind would make
// logically equal sets compare (and hash) differently.
CharacterSet &CharacterSet::include_all() {
  includes_all = true;
  included_chars.clear();
  excluded_chars.clear();
  return *this;
}
std::pair<CharacterSet, bool> CharacterSet::most_compact_representation()
const {
auto first_range = *ranges.begin();
if (first_range.min == 0 && first_range.max > 0) {
return { this->complement(), false };
} else {
return { *this, true };
}
// Adds the closed interval [min, max] to the set and returns *this.
// When the set is stored as "everything except ...", this removes the interval
// from the exclusion list; otherwise it adds it to the inclusion list.
CharacterSet &CharacterSet::include(uint32_t min, uint32_t max) {
  if (!includes_all) {
    add_range(&included_chars, CharacterRange(min, max));
    return *this;
  }
  remove_range(&excluded_chars, CharacterRange(min, max));
  return *this;
}
void add_range(CharacterSet *self, CharacterRange addition) {
set<CharacterRange> new_ranges;
for (auto range : self->ranges) {
bool is_adjacent = false;
if (range.min < addition.min && range.max >= addition.min - 1) {
is_adjacent = true;
addition.min = range.min;
}
if (range.max > addition.max && range.min <= addition.max + 1) {
is_adjacent = true;
addition.max = range.max;
}
if (!is_adjacent) {
new_ranges.insert(range);
}
}
new_ranges.insert(addition);
self->ranges = new_ranges;
// Removes the closed interval [min, max] from the set and returns *this.
// When the set is stored as "everything except ...", this adds the interval to
// the exclusion list; otherwise it removes it from the inclusion list.
CharacterSet &CharacterSet::exclude(uint32_t min, uint32_t max) {
  if (!includes_all) {
    remove_range(&included_chars, CharacterRange(min, max));
    return *this;
  }
  add_range(&excluded_chars, CharacterRange(min, max));
  return *this;
}
CharacterSet remove_range(CharacterSet *self, CharacterRange range_to_remove) {
CharacterSet removed_set;
set<CharacterRange> new_ranges;
for (auto range : self->ranges) {
if (range_to_remove.min <= range.min) {
if (range_to_remove.max < range.min) {
new_ranges.insert(range);
} else if (range_to_remove.max < range.max) {
new_ranges.insert(CharacterRange(range_to_remove.max + 1, range.max));
add_range(&removed_set, CharacterRange(range.min, range_to_remove.max));
} else {
add_range(&removed_set, range);
}
} else if (range_to_remove.min <= range.max) {
if (range_to_remove.max < range.max) {
new_ranges.insert(CharacterRange(range.min, range_to_remove.min - 1));
new_ranges.insert(CharacterRange(range_to_remove.max + 1, range.max));
add_range(&removed_set, range_to_remove);
} else {
new_ranges.insert(CharacterRange(range.min, range_to_remove.min - 1));
add_range(&removed_set, CharacterRange(range_to_remove.min, range.max));
}
} else {
new_ranges.insert(range);
}
}
self->ranges = new_ranges;
return removed_set;
}
// Single-character convenience form: includes the one-element interval [c, c].
CharacterSet &CharacterSet::include(uint32_t c) {
  return include(c, c);
}
bool CharacterSet::is_empty() const { return ranges.empty(); }
// Single-character convenience form: excludes the one-element interval [c, c].
CharacterSet &CharacterSet::exclude(uint32_t c) {
  return exclude(c, c);
}
// Reports whether the set matches nothing: it must not be an "include all"
// set, and its inclusion list must be empty. The exclusion list is not
// consulted.
bool CharacterSet::is_empty() const {
  if (includes_all)
    return false;
  return included_chars.empty();
}
// Merges the characters of `other` into this set, in place.
// NOTE(review): this span comes from a diff view whose +/- markers were lost.
// The loop over `other.ranges` looks like the pre-refactor body and the loop
// over `other.included_chars` like its replacement — confirm which is live.
void CharacterSet::add_set(const CharacterSet &other) {
for (auto &other_range : other.ranges) {
add_range(this, other_range);
}
// Copies each explicitly included character of `other` into `included_chars`.
// NOTE(review): neither operand's `includes_all` nor `excluded_chars` is
// consulted here, so an "include all" operand is not handled by this loop.
for (uint32_t c : other.included_chars)
included_chars.insert(c);
}
// Removes from this set every character it shares with `other` and returns
// the removed characters (the intersection) as a new CharacterSet.
// NOTE(review): this span comes from a diff view whose +/- markers were lost.
// The `for` loop over `other.ranges` looks like the pre-refactor body and the
// `if (includes_all)` cascade like its replacement — confirm which is live.
CharacterSet CharacterSet::remove_set(const CharacterSet &other) {
CharacterSet result;
for (auto &other_range : other.ranges) {
auto removed_set = remove_range(this, other_range);
result.add_set(removed_set);
if (includes_all) {
if (other.includes_all) {
// Both are "all except ...": the intersection is "all except the union of
// both exclusion lists". What stays in this set is exactly the characters
// that only `other` excluded, which add_chars returns as newly inserted.
result.includes_all = true;
result.excluded_chars = excluded_chars;
included_chars = add_chars(&result.excluded_chars, other.excluded_chars);
excluded_chars = {};
includes_all = false;
} else {
// This is "all except ...", `other` is an explicit list: exclude other's
// characters from this set; the ones not already excluded were shared.
result.included_chars = add_chars(&excluded_chars, other.included_chars);
}
} else {
if (other.includes_all) {
// This is an explicit list, `other` is "all except ...": only characters
// that `other` excludes survive in this set; the rest form the result.
result.included_chars = included_chars;
included_chars =
remove_chars(&result.included_chars, other.excluded_chars);
} else {
// Both are explicit lists: erase other's characters from this set and
// return the ones that were actually present.
result.included_chars =
remove_chars(&included_chars, other.included_chars);
}
}
return result;
}
CharacterSet CharacterSet::intersect(const CharacterSet &set) const {
CharacterSet copy = *this;
return copy.remove_set(set);
// Returns the explicitly included characters, grouped into ranges by
// consolidate_ranges.
vector<CharacterRange> CharacterSet::included_ranges() const {
  vector<CharacterRange> grouped = consolidate_ranges(included_chars);
  return grouped;
}
// Returns the explicitly excluded characters, grouped into ranges by
// consolidate_ranges.
vector<CharacterRange> CharacterSet::excluded_ranges() const {
  vector<CharacterRange> grouped = consolidate_ranges(excluded_chars);
  return grouped;
}
// Visitor hook: hands this character set to the visitor's visit() overload.
void CharacterSet::accept(Visitor *visitor) const {
  visitor->visit(this);
}

View file

@ -1,10 +1,11 @@
#ifndef COMPILER_RULES_CHARACTER_SET_H_
#define COMPILER_RULES_CHARACTER_SET_H_
#include <initializer_list>
#include <set>
#include <stdint.h>
#include <string>
#include <utility>
#include <vector>
#include "compiler/rules/rule.h"
#include "compiler/rules/character_range.h"
@ -14,8 +15,12 @@ namespace rules {
class CharacterSet : public Rule {
public:
CharacterSet();
explicit CharacterSet(const std::set<CharacterRange> &ranges);
explicit CharacterSet(const std::initializer_list<CharacterRange> &ranges);
CharacterSet &include_all();
CharacterSet &include(uint32_t c);
CharacterSet &include(uint32_t min, uint32_t max);
CharacterSet &exclude(uint32_t c);
CharacterSet &exclude(uint32_t min, uint32_t max);
bool operator==(const Rule &other) const;
bool operator<(const CharacterSet &) const;
@ -26,12 +31,14 @@ class CharacterSet : public Rule {
void add_set(const CharacterSet &other);
CharacterSet remove_set(const CharacterSet &other);
CharacterSet complement() const;
CharacterSet intersect(const CharacterSet &) const;
std::pair<CharacterSet, bool> most_compact_representation() const;
bool is_empty() const;
std::set<CharacterRange> ranges;
std::vector<CharacterRange> included_ranges() const;
std::vector<CharacterRange> excluded_ranges() const;
bool includes_all;
std::set<uint32_t> included_chars;
std::set<uint32_t> excluded_chars;
};
} // namespace rules