Refactor - represent char sets in terms of inclusions and exclusions
This commit is contained in:
parent
6f374fddff
commit
0bb5663f0f
21 changed files with 1004 additions and 565 deletions
|
|
@ -59,7 +59,8 @@ LEX_FN() {
|
|||
switch (lex_state) {
|
||||
case 1:
|
||||
START_TOKEN();
|
||||
if (('\t' <= lookahead && lookahead <= '\n') ||
|
||||
if ((lookahead == '\t') ||
|
||||
(lookahead == '\n') ||
|
||||
(lookahead == '\r') ||
|
||||
(lookahead == ' '))
|
||||
ADVANCE(1);
|
||||
|
|
@ -88,7 +89,8 @@ LEX_FN() {
|
|||
START_TOKEN();
|
||||
if (lookahead == 0)
|
||||
ADVANCE(6);
|
||||
if (('\t' <= lookahead && lookahead <= '\n') ||
|
||||
if ((lookahead == '\t') ||
|
||||
(lookahead == '\n') ||
|
||||
(lookahead == '\r') ||
|
||||
(lookahead == ' '))
|
||||
ADVANCE(5);
|
||||
|
|
@ -117,7 +119,8 @@ LEX_FN() {
|
|||
ACCEPT_TOKEN(ts_aux_sym_5);
|
||||
case 12:
|
||||
START_TOKEN();
|
||||
if (('\t' <= lookahead && lookahead <= '\n') ||
|
||||
if ((lookahead == '\t') ||
|
||||
(lookahead == '\n') ||
|
||||
(lookahead == '\r') ||
|
||||
(lookahead == ' '))
|
||||
ADVANCE(12);
|
||||
|
|
@ -150,7 +153,8 @@ LEX_FN() {
|
|||
START_TOKEN();
|
||||
if (lookahead == 0)
|
||||
ADVANCE(6);
|
||||
if (('\t' <= lookahead && lookahead <= '\n') ||
|
||||
if ((lookahead == '\t') ||
|
||||
(lookahead == '\n') ||
|
||||
(lookahead == '\r') ||
|
||||
(lookahead == ' '))
|
||||
ADVANCE(15);
|
||||
|
|
@ -178,7 +182,8 @@ LEX_FN() {
|
|||
START_TOKEN();
|
||||
if (lookahead == 0)
|
||||
ADVANCE(6);
|
||||
if (('\t' <= lookahead && lookahead <= '\n') ||
|
||||
if ((lookahead == '\t') ||
|
||||
(lookahead == '\n') ||
|
||||
(lookahead == '\r') ||
|
||||
(lookahead == ' '))
|
||||
ADVANCE(15);
|
||||
|
|
|
|||
|
|
@ -442,33 +442,33 @@ LEX_FN() {
|
|||
ADVANCE(3);
|
||||
LEX_ERROR();
|
||||
case 36:
|
||||
if (!((lookahead == '\"') ||
|
||||
(lookahead == '\\')))
|
||||
ADVANCE(36);
|
||||
if (lookahead == '\"')
|
||||
ADVANCE(37);
|
||||
if (lookahead == '\\')
|
||||
ADVANCE(38);
|
||||
if (!((lookahead == '\"') ||
|
||||
(lookahead == '\\')))
|
||||
ADVANCE(36);
|
||||
LEX_ERROR();
|
||||
case 37:
|
||||
ACCEPT_TOKEN(ts_sym_string);
|
||||
case 38:
|
||||
if (!((lookahead == '\"') ||
|
||||
(lookahead == '\\')))
|
||||
ADVANCE(36);
|
||||
if (lookahead == '\"')
|
||||
ADVANCE(39);
|
||||
if (lookahead == '\\')
|
||||
ADVANCE(38);
|
||||
LEX_ERROR();
|
||||
case 39:
|
||||
if (!((lookahead == '\"') ||
|
||||
(lookahead == '\\')))
|
||||
ADVANCE(36);
|
||||
LEX_ERROR();
|
||||
case 39:
|
||||
if (lookahead == '\"')
|
||||
ADVANCE(37);
|
||||
if (lookahead == '\\')
|
||||
ADVANCE(38);
|
||||
if (!((lookahead == '\"') ||
|
||||
(lookahead == '\\')))
|
||||
ADVANCE(36);
|
||||
ACCEPT_TOKEN(ts_sym_string);
|
||||
case 40:
|
||||
ACCEPT_TOKEN(ts_aux_sym_1);
|
||||
|
|
@ -644,7 +644,8 @@ LEX_FN() {
|
|||
if (('0' <= lookahead && lookahead <= '9') ||
|
||||
('A' <= lookahead && lookahead <= 'Z') ||
|
||||
(lookahead == '_') ||
|
||||
('a' <= lookahead && lookahead <= 'b') ||
|
||||
(lookahead == 'a') ||
|
||||
(lookahead == 'b') ||
|
||||
('d' <= lookahead && lookahead <= 'z'))
|
||||
ADVANCE(33);
|
||||
if (lookahead == 'c')
|
||||
|
|
@ -727,7 +728,8 @@ LEX_FN() {
|
|||
if (('0' <= lookahead && lookahead <= '9') ||
|
||||
('A' <= lookahead && lookahead <= 'Z') ||
|
||||
(lookahead == '_') ||
|
||||
('a' <= lookahead && lookahead <= 'b') ||
|
||||
(lookahead == 'a') ||
|
||||
(lookahead == 'b') ||
|
||||
('d' <= lookahead && lookahead <= 'z'))
|
||||
ADVANCE(33);
|
||||
if (lookahead == 'c')
|
||||
|
|
@ -940,7 +942,8 @@ LEX_FN() {
|
|||
ADVANCE(88);
|
||||
if (('A' <= lookahead && lookahead <= 'Z') ||
|
||||
('a' <= lookahead && lookahead <= 'e') ||
|
||||
('g' <= lookahead && lookahead <= 'h') ||
|
||||
(lookahead == 'g') ||
|
||||
(lookahead == 'h') ||
|
||||
('j' <= lookahead && lookahead <= 'q') ||
|
||||
('s' <= lookahead && lookahead <= 'u') ||
|
||||
('w' <= lookahead && lookahead <= 'z'))
|
||||
|
|
@ -1310,7 +1313,8 @@ LEX_FN() {
|
|||
ADVANCE(88);
|
||||
if (('A' <= lookahead && lookahead <= 'Z') ||
|
||||
('a' <= lookahead && lookahead <= 'd') ||
|
||||
('g' <= lookahead && lookahead <= 'h') ||
|
||||
(lookahead == 'g') ||
|
||||
(lookahead == 'h') ||
|
||||
('j' <= lookahead && lookahead <= 'q') ||
|
||||
('s' <= lookahead && lookahead <= 'u') ||
|
||||
('w' <= lookahead && lookahead <= 'z'))
|
||||
|
|
@ -1542,7 +1546,8 @@ LEX_FN() {
|
|||
ADVANCE(115);
|
||||
if (('A' <= lookahead && lookahead <= 'Z') ||
|
||||
('a' <= lookahead && lookahead <= 'e') ||
|
||||
('g' <= lookahead && lookahead <= 'h') ||
|
||||
(lookahead == 'g') ||
|
||||
(lookahead == 'h') ||
|
||||
('j' <= lookahead && lookahead <= 'q') ||
|
||||
('s' <= lookahead && lookahead <= 'u') ||
|
||||
('w' <= lookahead && lookahead <= 'z'))
|
||||
|
|
@ -1617,7 +1622,8 @@ LEX_FN() {
|
|||
ADVANCE(82);
|
||||
if (('A' <= lookahead && lookahead <= 'Z') ||
|
||||
('a' <= lookahead && lookahead <= 'e') ||
|
||||
('g' <= lookahead && lookahead <= 'h') ||
|
||||
(lookahead == 'g') ||
|
||||
(lookahead == 'h') ||
|
||||
('j' <= lookahead && lookahead <= 'q') ||
|
||||
('s' <= lookahead && lookahead <= 'u') ||
|
||||
('w' <= lookahead && lookahead <= 'z'))
|
||||
|
|
@ -1703,7 +1709,8 @@ LEX_FN() {
|
|||
ADVANCE(145);
|
||||
if (('A' <= lookahead && lookahead <= 'Z') ||
|
||||
('a' <= lookahead && lookahead <= 'e') ||
|
||||
('g' <= lookahead && lookahead <= 'h') ||
|
||||
(lookahead == 'g') ||
|
||||
(lookahead == 'h') ||
|
||||
('j' <= lookahead && lookahead <= 'q') ||
|
||||
('s' <= lookahead && lookahead <= 'u') ||
|
||||
('w' <= lookahead && lookahead <= 'z'))
|
||||
|
|
@ -1850,9 +1857,11 @@ LEX_FN() {
|
|||
ADVANCE(115);
|
||||
if (('A' <= lookahead && lookahead <= 'Z') ||
|
||||
('a' <= lookahead && lookahead <= 'd') ||
|
||||
('g' <= lookahead && lookahead <= 'h') ||
|
||||
(lookahead == 'g') ||
|
||||
(lookahead == 'h') ||
|
||||
('j' <= lookahead && lookahead <= 'l') ||
|
||||
('n' <= lookahead && lookahead <= 'o') ||
|
||||
(lookahead == 'n') ||
|
||||
(lookahead == 'o') ||
|
||||
(lookahead == 'q') ||
|
||||
(lookahead == 'u') ||
|
||||
('w' <= lookahead && lookahead <= 'z'))
|
||||
|
|
@ -1917,7 +1926,8 @@ LEX_FN() {
|
|||
if (('0' <= lookahead && lookahead <= '9') ||
|
||||
('A' <= lookahead && lookahead <= 'Z') ||
|
||||
(lookahead == '_') ||
|
||||
('a' <= lookahead && lookahead <= 'b') ||
|
||||
(lookahead == 'a') ||
|
||||
(lookahead == 'b') ||
|
||||
('d' <= lookahead && lookahead <= 'z'))
|
||||
ADVANCE(33);
|
||||
if (lookahead == 'c')
|
||||
|
|
@ -2005,7 +2015,8 @@ LEX_FN() {
|
|||
if (('0' <= lookahead && lookahead <= '9') ||
|
||||
('A' <= lookahead && lookahead <= 'Z') ||
|
||||
(lookahead == '_') ||
|
||||
('a' <= lookahead && lookahead <= 'b') ||
|
||||
(lookahead == 'a') ||
|
||||
(lookahead == 'b') ||
|
||||
('d' <= lookahead && lookahead <= 'z'))
|
||||
ADVANCE(33);
|
||||
if (lookahead == 'c')
|
||||
|
|
@ -2189,9 +2200,11 @@ LEX_FN() {
|
|||
ADVANCE(115);
|
||||
if (('A' <= lookahead && lookahead <= 'Z') ||
|
||||
('a' <= lookahead && lookahead <= 'd') ||
|
||||
('g' <= lookahead && lookahead <= 'h') ||
|
||||
(lookahead == 'g') ||
|
||||
(lookahead == 'h') ||
|
||||
('j' <= lookahead && lookahead <= 'l') ||
|
||||
('n' <= lookahead && lookahead <= 'o') ||
|
||||
(lookahead == 'n') ||
|
||||
(lookahead == 'o') ||
|
||||
(lookahead == 'q') ||
|
||||
(lookahead == 'u') ||
|
||||
('w' <= lookahead && lookahead <= 'z'))
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -60,7 +60,8 @@ LEX_FN() {
|
|||
switch (lex_state) {
|
||||
case 1:
|
||||
START_TOKEN();
|
||||
if (('\t' <= lookahead && lookahead <= '\n') ||
|
||||
if ((lookahead == '\t') ||
|
||||
(lookahead == '\n') ||
|
||||
(lookahead == '\r') ||
|
||||
(lookahead == ' '))
|
||||
ADVANCE(1);
|
||||
|
|
@ -80,33 +81,33 @@ LEX_FN() {
|
|||
ADVANCE(23);
|
||||
LEX_ERROR();
|
||||
case 2:
|
||||
if (!((lookahead == '\"') ||
|
||||
(lookahead == '\\')))
|
||||
ADVANCE(2);
|
||||
if (lookahead == '\"')
|
||||
ADVANCE(3);
|
||||
if (lookahead == '\\')
|
||||
ADVANCE(4);
|
||||
if (!((lookahead == '\"') ||
|
||||
(lookahead == '\\')))
|
||||
ADVANCE(2);
|
||||
LEX_ERROR();
|
||||
case 3:
|
||||
ACCEPT_TOKEN(ts_sym_string);
|
||||
case 4:
|
||||
if (!((lookahead == '\"') ||
|
||||
(lookahead == '\\')))
|
||||
ADVANCE(2);
|
||||
if (lookahead == '\"')
|
||||
ADVANCE(5);
|
||||
if (lookahead == '\\')
|
||||
ADVANCE(4);
|
||||
LEX_ERROR();
|
||||
case 5:
|
||||
if (!((lookahead == '\"') ||
|
||||
(lookahead == '\\')))
|
||||
ADVANCE(2);
|
||||
LEX_ERROR();
|
||||
case 5:
|
||||
if (lookahead == '\"')
|
||||
ADVANCE(3);
|
||||
if (lookahead == '\\')
|
||||
ADVANCE(4);
|
||||
if (!((lookahead == '\"') ||
|
||||
(lookahead == '\\')))
|
||||
ADVANCE(2);
|
||||
ACCEPT_TOKEN(ts_sym_string);
|
||||
case 6:
|
||||
if (lookahead == '.')
|
||||
|
|
@ -186,7 +187,8 @@ LEX_FN() {
|
|||
ACCEPT_TOKEN(ts_builtin_sym_end);
|
||||
case 26:
|
||||
START_TOKEN();
|
||||
if (('\t' <= lookahead && lookahead <= '\n') ||
|
||||
if ((lookahead == '\t') ||
|
||||
(lookahead == '\n') ||
|
||||
(lookahead == '\r') ||
|
||||
(lookahead == ' '))
|
||||
ADVANCE(26);
|
||||
|
|
@ -199,7 +201,8 @@ LEX_FN() {
|
|||
ACCEPT_TOKEN(ts_aux_sym_4);
|
||||
case 28:
|
||||
START_TOKEN();
|
||||
if (('\t' <= lookahead && lookahead <= '\n') ||
|
||||
if ((lookahead == '\t') ||
|
||||
(lookahead == '\n') ||
|
||||
(lookahead == '\r') ||
|
||||
(lookahead == ' '))
|
||||
ADVANCE(28);
|
||||
|
|
@ -224,7 +227,8 @@ LEX_FN() {
|
|||
ACCEPT_TOKEN(ts_aux_sym_6);
|
||||
case 30:
|
||||
START_TOKEN();
|
||||
if (('\t' <= lookahead && lookahead <= '\n') ||
|
||||
if ((lookahead == '\t') ||
|
||||
(lookahead == '\n') ||
|
||||
(lookahead == '\r') ||
|
||||
(lookahead == ' '))
|
||||
ADVANCE(30);
|
||||
|
|
@ -247,7 +251,8 @@ LEX_FN() {
|
|||
LEX_ERROR();
|
||||
case 33:
|
||||
START_TOKEN();
|
||||
if (('\t' <= lookahead && lookahead <= '\n') ||
|
||||
if ((lookahead == '\t') ||
|
||||
(lookahead == '\n') ||
|
||||
(lookahead == '\r') ||
|
||||
(lookahead == ' '))
|
||||
ADVANCE(33);
|
||||
|
|
@ -292,7 +297,8 @@ LEX_FN() {
|
|||
START_TOKEN();
|
||||
if (lookahead == 0)
|
||||
ADVANCE(25);
|
||||
if (('\t' <= lookahead && lookahead <= '\n') ||
|
||||
if ((lookahead == '\t') ||
|
||||
(lookahead == '\n') ||
|
||||
(lookahead == '\r') ||
|
||||
(lookahead == ' '))
|
||||
ADVANCE(38);
|
||||
|
|
@ -320,9 +326,11 @@ LEX_FN() {
|
|||
ADVANCE(27);
|
||||
LEX_ERROR();
|
||||
case ts_lex_state_error:
|
||||
START_TOKEN();
|
||||
if (lookahead == 0)
|
||||
ADVANCE(25);
|
||||
if (('\t' <= lookahead && lookahead <= '\n') ||
|
||||
if ((lookahead == '\t') ||
|
||||
(lookahead == '\n') ||
|
||||
(lookahead == '\r') ||
|
||||
(lookahead == ' '))
|
||||
ADVANCE(38);
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
#include "compiler/compiler_spec_helper.h"
|
||||
#include "compiler/build_tables/item_set_transitions.h"
|
||||
#include "compiler/prepared_grammar.h"
|
||||
#include "compiler/helpers/rule_helpers.h"
|
||||
|
||||
using namespace rules;
|
||||
using namespace build_tables;
|
||||
|
|
@ -11,16 +12,16 @@ describe("lexical item set transitions", []() {
|
|||
describe("when two items in the set have transitions on the same character", [&]() {
|
||||
it("merges the transitions by computing the union of the two item sets", [&]() {
|
||||
LexItemSet set1({
|
||||
LexItem(Symbol(1), character({ {'a', 'f'} })),
|
||||
LexItem(Symbol(2), character({ {'e', 'x'} })) });
|
||||
LexItem(Symbol(1), CharacterSet().include('a', 'f').copy()),
|
||||
LexItem(Symbol(2), CharacterSet().include('e', 'x').copy()) });
|
||||
|
||||
AssertThat(char_transitions(set1), Equals(map<CharacterSet, LexItemSet>({
|
||||
{ CharacterSet({ {'a', 'd'} }), LexItemSet({
|
||||
{ CharacterSet().include('a', 'd'), LexItemSet({
|
||||
LexItem(Symbol(1), blank()) }) },
|
||||
{ CharacterSet({ {'e', 'f'} }), LexItemSet({
|
||||
{ CharacterSet().include('e', 'f'), LexItemSet({
|
||||
LexItem(Symbol(1), blank()),
|
||||
LexItem(Symbol(2), blank()) }) },
|
||||
{ CharacterSet({ {'g', 'x'} }), LexItemSet({
|
||||
{ CharacterSet().include('g', 'x'), LexItemSet({
|
||||
LexItem(Symbol(2), blank()) }) },
|
||||
})));
|
||||
});
|
||||
|
|
|
|||
|
|
@ -6,7 +6,7 @@ using namespace build_tables;
|
|||
|
||||
START_TEST
|
||||
|
||||
describe("merging character set transitions", []() {
|
||||
describe("merge_char_transitions", []() {
|
||||
typedef map<CharacterSet, int> int_map;
|
||||
|
||||
auto do_merge = [&](int_map *left, const pair<CharacterSet, int> &new_pair) {
|
||||
|
|
@ -18,20 +18,20 @@ describe("merging character set transitions", []() {
|
|||
describe("when none of the transitions intersect", [&]() {
|
||||
it("returns the union of the two sets of transitions", [&]() {
|
||||
int_map map({
|
||||
{ CharacterSet({ 'a', 'c' }), 1 },
|
||||
{ CharacterSet({ 'x', 'y' }), 2 },
|
||||
{ CharacterSet({ '1', '9' }), 4 },
|
||||
{ CharacterSet().include('a').include('c'), 1 },
|
||||
{ CharacterSet().include('x').include('y'), 2 },
|
||||
{ CharacterSet().include('1').include('9'), 4 },
|
||||
});
|
||||
|
||||
do_merge(&map, { CharacterSet({ ' ' }), 8 });
|
||||
do_merge(&map, { CharacterSet({ '\t' }), 16 });
|
||||
do_merge(&map, { CharacterSet().include(' '), 8 });
|
||||
do_merge(&map, { CharacterSet().include('\t'), 16 });
|
||||
|
||||
AssertThat(map, Equals(int_map({
|
||||
{ CharacterSet({ 'a', 'c' }), 1 },
|
||||
{ CharacterSet({ 'x', 'y' }), 2 },
|
||||
{ CharacterSet({ '1', '9' }), 4 },
|
||||
{ CharacterSet({ ' ' }), 8 },
|
||||
{ CharacterSet({ '\t' }), 16 },
|
||||
{ CharacterSet().include('a').include('c'), 1 },
|
||||
{ CharacterSet().include('x').include('y'), 2 },
|
||||
{ CharacterSet().include('1').include('9'), 4 },
|
||||
{ CharacterSet().include(' '), 8 },
|
||||
{ CharacterSet().include('\t'), 16 },
|
||||
})));
|
||||
});
|
||||
});
|
||||
|
|
@ -39,18 +39,33 @@ describe("merging character set transitions", []() {
|
|||
describe("when transitions intersect", [&]() {
|
||||
it("merges the intersecting transitions using the provided function", [&]() {
|
||||
int_map map({
|
||||
{ CharacterSet({ {'a', 'f'}, {'A', 'F'} }), 1 },
|
||||
{ CharacterSet({ {'0', '9'} }), 2 },
|
||||
{ CharacterSet().include('a', 'f').include('A', 'F'), 1 },
|
||||
{ CharacterSet().include('0', '9'), 2 },
|
||||
});
|
||||
|
||||
do_merge(&map, { CharacterSet({ 'c' }), 4 });
|
||||
do_merge(&map, { CharacterSet({ '3' }), 8 });
|
||||
do_merge(&map, { CharacterSet().include('c'), 4 });
|
||||
do_merge(&map, { CharacterSet().include('3'), 8 });
|
||||
|
||||
AssertThat(map, Equals(int_map({
|
||||
{ CharacterSet({ {'a', 'b'}, {'d', 'f'}, {'A', 'F'} }), 1 },
|
||||
{ CharacterSet({ {'c'} }), 5 },
|
||||
{ CharacterSet({ {'0', '2'}, {'4', '9'} }), 2 },
|
||||
{ CharacterSet({ '3' }), 10 },
|
||||
{
|
||||
CharacterSet()
|
||||
.include('a', 'b')
|
||||
.include('d', 'f')
|
||||
.include('A', 'F'),
|
||||
1
|
||||
},
|
||||
{
|
||||
CharacterSet().include('c'),
|
||||
5
|
||||
},
|
||||
{
|
||||
CharacterSet().include('0', '2').include('4', '9'),
|
||||
2
|
||||
},
|
||||
{
|
||||
CharacterSet().include('3'),
|
||||
10
|
||||
},
|
||||
})));
|
||||
});
|
||||
});
|
||||
|
|
@ -58,15 +73,15 @@ describe("merging character set transitions", []() {
|
|||
describe("when two of the right transitions intersect the same left transition", [&]() {
|
||||
it("splits the left-hand transition correctly", [&]() {
|
||||
int_map map1({
|
||||
{ CharacterSet({ 'a', 'c' }), 1 },
|
||||
{ CharacterSet().include('a').include('c'), 1 },
|
||||
});
|
||||
|
||||
do_merge(&map1, { CharacterSet({ 'a' }), 2 });
|
||||
do_merge(&map1, { CharacterSet({ 'c' }), 4 });
|
||||
do_merge(&map1, { CharacterSet().include('a'), 2 });
|
||||
do_merge(&map1, { CharacterSet().include('c'), 4 });
|
||||
|
||||
AssertThat(map1, Equals(int_map({
|
||||
{ CharacterSet({ 'a' }), 3 },
|
||||
{ CharacterSet({ 'c' }), 5 },
|
||||
{ CharacterSet().include('a'), 3 },
|
||||
{ CharacterSet().include('c'), 5 },
|
||||
})));
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ using namespace build_tables;
|
|||
|
||||
START_TEST
|
||||
|
||||
describe("rule transitions", []() {
|
||||
describe("sym_transitions", []() {
|
||||
it("handles symbols", [&]() {
|
||||
AssertThat(
|
||||
sym_transitions(i_sym(1)),
|
||||
|
|
@ -74,11 +74,26 @@ describe("rule transitions", []() {
|
|||
})));
|
||||
});
|
||||
|
||||
it("preserves metadata", [&]() {
|
||||
map<MetadataKey, int> metadata_value({
|
||||
{ PRECEDENCE, 5 }
|
||||
});
|
||||
|
||||
rule_ptr rule = make_shared<Metadata>(seq({ i_sym(1), i_sym(2) }), metadata_value);
|
||||
AssertThat(
|
||||
sym_transitions(rule),
|
||||
Equals(rule_map<Symbol>({
|
||||
{ Symbol(1), make_shared<Metadata>(i_sym(2), metadata_value)},
|
||||
})));
|
||||
});
|
||||
});
|
||||
|
||||
describe("char_transitions", []() {
|
||||
it("handles characters", [&]() {
|
||||
AssertThat(
|
||||
char_transitions(character({ '1' })),
|
||||
Equals(rule_map<CharacterSet>({
|
||||
{ CharacterSet({ '1' }), blank() }
|
||||
{ CharacterSet().include('1'), blank() }
|
||||
})));
|
||||
});
|
||||
|
||||
|
|
@ -92,9 +107,35 @@ describe("rule transitions", []() {
|
|||
character({ { 'm', 'z' } }),
|
||||
sym("y") }) })),
|
||||
Equals(rule_map<CharacterSet>({
|
||||
{ CharacterSet({ {'a','l'} }), sym("x") },
|
||||
{ CharacterSet({ {'m','s'} }), choice({ sym("x"), sym("y") }) },
|
||||
{ CharacterSet({ {'t','z'} }), sym("y") },
|
||||
{ CharacterSet().include('a','l'), sym("x") },
|
||||
{ CharacterSet().include('m','s'), choice({ sym("x"), sym("y") }) },
|
||||
{ CharacterSet().include('t','z'), sym("y") },
|
||||
})));
|
||||
});
|
||||
|
||||
it("handles choices between whitelisted and blacklisted character sets", [&]() {
|
||||
AssertThat(
|
||||
char_transitions(seq({
|
||||
choice({
|
||||
character({ '/' }, false),
|
||||
seq({
|
||||
character({ '\\' }),
|
||||
character({ '/' }) }) }),
|
||||
character({ '/' }) })),
|
||||
|
||||
Equals(rule_map<CharacterSet>({
|
||||
{ CharacterSet()
|
||||
.include_all()
|
||||
.exclude('/')
|
||||
.exclude('\\'),
|
||||
character({ '/' }) },
|
||||
{ CharacterSet()
|
||||
.include('\\'),
|
||||
seq({
|
||||
choice({
|
||||
blank(),
|
||||
character({ '/' }) }),
|
||||
character({ '/' }) }) },
|
||||
})));
|
||||
});
|
||||
|
||||
|
|
@ -108,8 +149,8 @@ describe("rule transitions", []() {
|
|||
character({ { 'a', 'z' } }),
|
||||
sym("y") }) })),
|
||||
Equals(rule_map<CharacterSet>({
|
||||
{ CharacterSet({ {'a', 'c'} }), choice({ sym("x"), sym("y") }) },
|
||||
{ CharacterSet({ {'d', 'z'} }), sym("y") },
|
||||
{ CharacterSet().include('a', 'c'), choice({ sym("x"), sym("y") }) },
|
||||
{ CharacterSet().include('d', 'z'), sym("y") },
|
||||
})));
|
||||
|
||||
AssertThat(
|
||||
|
|
@ -121,10 +162,9 @@ describe("rule transitions", []() {
|
|||
character({ {'a', 'c'} }),
|
||||
sym("y") }) })),
|
||||
Equals(rule_map<CharacterSet>({
|
||||
{ CharacterSet({ {'a', 'c'} }), choice({ sym("x"), sym("y") }) },
|
||||
{ CharacterSet({ {'d', 'z'} }), sym("x") },
|
||||
{ CharacterSet().include('a', 'c'), choice({ sym("x"), sym("y") }) },
|
||||
{ CharacterSet().include('d', 'z'), sym("x") },
|
||||
})));
|
||||
|
||||
});
|
||||
|
||||
it("handles blanks", [&]() {
|
||||
|
|
@ -137,7 +177,7 @@ describe("rule transitions", []() {
|
|||
char_transitions(rule),
|
||||
Equals(rule_map<CharacterSet>({
|
||||
{
|
||||
CharacterSet({ 'a' }),
|
||||
CharacterSet().include('a'),
|
||||
seq({
|
||||
character({ 'b' }),
|
||||
rule,
|
||||
|
|
@ -148,41 +188,9 @@ describe("rule transitions", []() {
|
|||
AssertThat(
|
||||
char_transitions(rule),
|
||||
Equals(rule_map<CharacterSet>({
|
||||
{ CharacterSet({ 'a' }), rule }
|
||||
{ CharacterSet().include('a'), rule }
|
||||
})));
|
||||
});
|
||||
|
||||
it("preserves metadata", [&]() {
|
||||
map<MetadataKey, int> metadata_value({
|
||||
{ PRECEDENCE, 5 }
|
||||
});
|
||||
|
||||
rule_ptr rule = make_shared<Metadata>(seq({ i_sym(1), i_sym(2) }), metadata_value);
|
||||
AssertThat(
|
||||
sym_transitions(rule),
|
||||
Equals(rule_map<Symbol>({
|
||||
{ Symbol(1), make_shared<Metadata>(i_sym(2), metadata_value)},
|
||||
})));
|
||||
});
|
||||
|
||||
describe("regression tests (somewhat redundant, should maybe be deleted later)", []() {
|
||||
it("handles sequences that start with repeating characters", [&]() {
|
||||
auto rule = seq({
|
||||
choice({
|
||||
repeat(character({ '"' }, false)),
|
||||
blank(),
|
||||
}),
|
||||
character({ '"' }),
|
||||
});
|
||||
|
||||
AssertThat(char_transitions(rule), Equals(rule_map<CharacterSet>({
|
||||
{ CharacterSet({ '"' }).complement(), seq({
|
||||
repeat(character({ '"' }, false)),
|
||||
character({ '"' }), }) },
|
||||
{ CharacterSet({ '"' }), blank() },
|
||||
})));
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
END_TEST
|
||||
|
|
|
|||
|
|
@ -44,7 +44,7 @@ class rule_list : public vector<pair<string, rule_ptr>> {
|
|||
return true;
|
||||
}
|
||||
|
||||
rule_list(const initializer_list<pair<string, rule_ptr>> &list) :
|
||||
rule_list(const initializer_list<pair<string, rule_ptr>> &list) :
|
||||
vector<pair<string, rule_ptr>>(list) {}
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -9,14 +9,20 @@ namespace tree_sitter {
|
|||
|
||||
namespace rules {
|
||||
rule_ptr character(const set<CharacterRange> &ranges) {
|
||||
return make_shared<CharacterSet>(ranges);
|
||||
return character(ranges, true);
|
||||
}
|
||||
|
||||
rule_ptr character(const set<CharacterRange> &ranges, bool sign) {
|
||||
if (sign)
|
||||
return character(ranges);
|
||||
else
|
||||
return CharacterSet(ranges).complement().copy();
|
||||
CharacterSet result;
|
||||
if (sign) {
|
||||
for (auto &range : ranges)
|
||||
result.include(range.min, range.max);
|
||||
} else {
|
||||
result.include_all();
|
||||
for (auto &range : ranges)
|
||||
result.exclude(range.min, range.max);
|
||||
}
|
||||
return result.copy();
|
||||
}
|
||||
|
||||
rule_ptr i_sym(size_t index) {
|
||||
|
|
|
|||
|
|
@ -71,7 +71,7 @@ describe("expanding repeat rules in a grammar", []() {
|
|||
AssertThat(match.rules, Equals(rule_list({
|
||||
{ "rule0", seq({ i_aux_sym(0), i_aux_sym(1) }) },
|
||||
})));
|
||||
|
||||
|
||||
AssertThat(match.aux_rules, Equals(rule_list({
|
||||
{ "rule0_repeat0", choice({
|
||||
seq({
|
||||
|
|
|
|||
|
|
@ -6,7 +6,7 @@ START_TEST
|
|||
using namespace rules;
|
||||
using prepare_grammar::parse_regex;
|
||||
|
||||
describe("parsing regex patterns", []() {
|
||||
describe("parse_regex", []() {
|
||||
struct ValidInputRow {
|
||||
string description;
|
||||
string pattern;
|
||||
|
|
@ -23,7 +23,7 @@ describe("parsing regex patterns", []() {
|
|||
{
|
||||
"'.' characters as wildcards",
|
||||
".",
|
||||
CharacterSet({'\n'}).complement().copy()
|
||||
character({ '\n' }, false)
|
||||
},
|
||||
|
||||
{
|
||||
|
|
@ -170,6 +170,19 @@ describe("parsing regex patterns", []() {
|
|||
blank()
|
||||
})
|
||||
})
|
||||
},
|
||||
|
||||
{
|
||||
"choices containing negated character classes",
|
||||
"/([^/]|(\\\\/))*/",
|
||||
seq({
|
||||
character({ '/' }),
|
||||
repeat(choice({
|
||||
character({ '/' }, false),
|
||||
seq({ character({ '\\' }), character({ '/' }) }),
|
||||
})),
|
||||
character({ '/' }),
|
||||
}),
|
||||
}
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -5,106 +5,327 @@ using namespace rules;
|
|||
|
||||
START_TEST
|
||||
|
||||
describe("character sets", []() {
|
||||
unsigned char max_char = 255;
|
||||
describe("CharacterSet", []() {
|
||||
describe("equality", [&]() {
|
||||
it("returns true for identical character sets", [&]() {
|
||||
CharacterSet set1 = CharacterSet()
|
||||
.include('a', 'd')
|
||||
.include('f', 'm');
|
||||
|
||||
describe("computing the complement", [&]() {
|
||||
it("works for the set containing only the null character", [&]() {
|
||||
CharacterSet set1({ '\0' });
|
||||
auto set2 = set1.complement();
|
||||
AssertThat(set2, Equals(CharacterSet({
|
||||
{ 1, max_char }
|
||||
})));
|
||||
AssertThat(set2.complement(), Equals(set1));
|
||||
CharacterSet set2 = CharacterSet()
|
||||
.include('a', 'd')
|
||||
.include('f', 'm');
|
||||
|
||||
AssertThat(set1, Equals(set2));
|
||||
});
|
||||
|
||||
it("works for single character sets", [&]() {
|
||||
CharacterSet set1({ 'b' });
|
||||
auto set2 = set1.complement();
|
||||
AssertThat(set2, Equals(CharacterSet({
|
||||
{ 0, 'a' },
|
||||
{ 'c', max_char },
|
||||
})));
|
||||
AssertThat(set2.complement(), Equals(set1));
|
||||
it("returns false for character sets that include different ranges", [&]() {
|
||||
CharacterSet set1 = CharacterSet()
|
||||
.include('a', 'd')
|
||||
.include('f', 'm');
|
||||
|
||||
CharacterSet set2 = CharacterSet()
|
||||
.include('a', 'c')
|
||||
.include('f', 'm');
|
||||
|
||||
AssertThat(set1, !Equals(set2));
|
||||
AssertThat(set2, !Equals(set1));
|
||||
});
|
||||
|
||||
it("returns false for character sets that exclude different ranges", [&]() {
|
||||
CharacterSet set1 = CharacterSet()
|
||||
.include_all()
|
||||
.exclude('a', 'd')
|
||||
.exclude('f', 'm');
|
||||
|
||||
CharacterSet set2 = CharacterSet()
|
||||
.include_all()
|
||||
.exclude('a', 'c')
|
||||
.exclude('f', 'm');
|
||||
|
||||
AssertThat(set1, !Equals(set2));
|
||||
AssertThat(set2, !Equals(set1));
|
||||
});
|
||||
|
||||
it("returns false for character sets with different sign", [&]() {
|
||||
CharacterSet set1 = CharacterSet().include_all();
|
||||
CharacterSet set2 = CharacterSet();
|
||||
|
||||
AssertThat(set1, !Equals(set2));
|
||||
AssertThat(set2, !Equals(set1));
|
||||
});
|
||||
});
|
||||
|
||||
describe("computing unions", [&]() {
|
||||
it("works for disjoint sets", [&]() {
|
||||
CharacterSet set({ {'a', 'z'} });
|
||||
set.add_set(CharacterSet({ {'A', 'Z'} }));
|
||||
AssertThat(set, Equals(CharacterSet({ {'a', 'z'}, {'A', 'Z'} })));
|
||||
describe("hashing", [&]() {
|
||||
it("returns the same number for identical character sets", [&]() {
|
||||
CharacterSet set1 = CharacterSet()
|
||||
.include('a', 'd')
|
||||
.include('f', 'm');
|
||||
|
||||
CharacterSet set2 = CharacterSet()
|
||||
.include('a', 'd')
|
||||
.include('f', 'm');
|
||||
|
||||
AssertThat(set1.hash_code(), Equals(set2.hash_code()));
|
||||
});
|
||||
|
||||
it("works for sets with adjacent ranges", [&]() {
|
||||
CharacterSet set({ CharacterRange('a', 'r') });
|
||||
set.add_set(CharacterSet({ CharacterRange('s', 'z') }));
|
||||
AssertThat(set, Equals(CharacterSet({ {'a', 'z'} })));
|
||||
it("returns different numbers for character sets that include different ranges", [&]() {
|
||||
CharacterSet set1 = CharacterSet()
|
||||
.include('a', 'd')
|
||||
.include('f', 'm');
|
||||
|
||||
CharacterSet set2 = CharacterSet()
|
||||
.include('a', 'c')
|
||||
.include('f', 'm');
|
||||
|
||||
AssertThat(set1.hash_code(), !Equals(set2.hash_code()));
|
||||
AssertThat(set2.hash_code(), !Equals(set1.hash_code()));
|
||||
});
|
||||
|
||||
it("becomes the complete set when the complement is added", [&]() {
|
||||
CharacterSet set({ 'c' });
|
||||
auto complement = set.complement();
|
||||
set.add_set(complement);
|
||||
AssertThat(set, Equals(CharacterSet({ {0, max_char} })));
|
||||
it("returns different numbers for character sets that exclude different ranges", [&]() {
|
||||
CharacterSet set1 = CharacterSet()
|
||||
.include_all()
|
||||
.exclude('a', 'd')
|
||||
.exclude('f', 'm');
|
||||
|
||||
CharacterSet set2 = CharacterSet()
|
||||
.include_all()
|
||||
.exclude('a', 'c')
|
||||
.exclude('f', 'm');
|
||||
|
||||
AssertThat(set1.hash_code(), !Equals(set2.hash_code()));
|
||||
AssertThat(set2.hash_code(), !Equals(set1.hash_code()));
|
||||
});
|
||||
|
||||
it("works when the result becomes a continuous range", []() {
|
||||
CharacterSet set({ {'a', 'd'}, {'f', 'z'} });
|
||||
set.add_set(CharacterSet({ {'c', 'g'} }));
|
||||
AssertThat(set, Equals(CharacterSet({ {'a', 'z'} })));
|
||||
});
|
||||
it("returns different numbers for character sets with different sign", [&]() {
|
||||
CharacterSet set1 = CharacterSet().include_all();
|
||||
CharacterSet set2 = CharacterSet();
|
||||
|
||||
it("does nothing for the set of all characters", [&]() {
|
||||
CharacterSet set({ 'a' });
|
||||
set.add_set(set.complement());
|
||||
AssertThat(set, Equals(CharacterSet({ {'\0', max_char} })));
|
||||
AssertThat(set1.hash_code(), !Equals(set2.hash_code()));
|
||||
AssertThat(set2.hash_code(), !Equals(set1.hash_code()));
|
||||
});
|
||||
});
|
||||
|
||||
describe("subtracting sets", []() {
|
||||
describe("::is_empty", [&]() {
|
||||
it("returns true for empty character sets", [&]() {
|
||||
AssertThat(CharacterSet().is_empty(), Equals(true));
|
||||
});
|
||||
|
||||
it("returns false for full character sets", [&]() {
|
||||
AssertThat(CharacterSet().include_all().is_empty(), Equals(false));
|
||||
});
|
||||
|
||||
it("returns false for character sets that include some characters", [&]() {
|
||||
AssertThat(CharacterSet().include('x').is_empty(), Equals(false));
|
||||
});
|
||||
});
|
||||
|
||||
describe("::include", [&]() {
|
||||
describe("when the set has a whitelist of characters", [&]() {
|
||||
it("adds included characters", [&]() {
|
||||
CharacterSet set1 = CharacterSet().include('a', 'd');
|
||||
AssertThat(set1, Equals(CharacterSet()
|
||||
.include('a')
|
||||
.include('b')
|
||||
.include('c')
|
||||
.include('d')));
|
||||
});
|
||||
});
|
||||
|
||||
describe("when the set has a blacklist of characters", [&]() {
|
||||
it("removes excluded characters", [&]() {
|
||||
CharacterSet set1 = CharacterSet()
|
||||
.include_all()
|
||||
.exclude('a', 'g')
|
||||
.include('c', 'e');
|
||||
AssertThat(set1, Equals(CharacterSet()
|
||||
.include_all()
|
||||
.exclude('a')
|
||||
.exclude('b')
|
||||
.exclude('f')
|
||||
.exclude('g')));
|
||||
});
|
||||
|
||||
it("does nothing if the character are already not excluded", [&]() {
|
||||
CharacterSet set1 = CharacterSet()
|
||||
.include_all()
|
||||
.include('a', 'c');
|
||||
AssertThat(set1, Equals(CharacterSet().include_all()));
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe("::exclude", [&]() {
|
||||
describe("when the set has a whitelist of characters", [&]() {
|
||||
it("removes included characters", [&]() {
|
||||
CharacterSet set1 = CharacterSet()
|
||||
.include('a', 'g')
|
||||
.exclude('c', 'e');
|
||||
AssertThat(set1, Equals(CharacterSet()
|
||||
.include('a')
|
||||
.include('b')
|
||||
.include('f')
|
||||
.include('g')));
|
||||
});
|
||||
|
||||
it("does nothing if the character's are already not included", [&]() {
|
||||
CharacterSet set1 = CharacterSet().exclude('a', 'c');
|
||||
AssertThat(set1, Equals(CharacterSet()));
|
||||
});
|
||||
});
|
||||
|
||||
describe("when the set has a blacklist of characters", [&]() {
|
||||
it("removes excluded characters", [&]() {
|
||||
CharacterSet set1 = CharacterSet()
|
||||
.include_all()
|
||||
.exclude('a', 'd');
|
||||
AssertThat(set1, Equals(CharacterSet()
|
||||
.include_all()
|
||||
.exclude('a')
|
||||
.exclude('b')
|
||||
.exclude('c')
|
||||
.exclude('d')));
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe("::remove_set", []() {
|
||||
CharacterSet intersection;
|
||||
|
||||
it("works for disjoint sets", [&]() {
|
||||
CharacterSet set1({ {'a', 'z'} });
|
||||
intersection = set1.remove_set(CharacterSet({ {'A', 'Z'} }));
|
||||
AssertThat(set1, Equals(CharacterSet({ {'a', 'z'} })));
|
||||
AssertThat(intersection, Equals(CharacterSet()));
|
||||
describe("for a set with whitelisted characters", [&]() {
|
||||
describe("when the subtractend has whitelisted characters", [&]() {
|
||||
it("removes the included characters that the other set also includes", [&]() {
|
||||
CharacterSet set1 = CharacterSet().include('a', 'z');
|
||||
set1.remove_set(CharacterSet().include('d', 's'));
|
||||
AssertThat(set1, Equals(CharacterSet()
|
||||
.include('a', 'c')
|
||||
.include('t', 'z')));
|
||||
});
|
||||
|
||||
it("returns the characters that were removed", [&]() {
|
||||
CharacterSet set1 = CharacterSet().include('a', 'z');
|
||||
intersection = set1.remove_set(CharacterSet().include('d', 's'));
|
||||
AssertThat(intersection, Equals(CharacterSet()
|
||||
.include('d', 's')));
|
||||
});
|
||||
|
||||
it("returns the empty set when the sets are disjoint", [&]() {
|
||||
CharacterSet set1 = CharacterSet().include('a', 'z');
|
||||
intersection = set1.remove_set(CharacterSet().include('A', 'Z'));
|
||||
AssertThat(set1, Equals(CharacterSet().include('a', 'z')));
|
||||
AssertThat(intersection, Equals(CharacterSet()));
|
||||
});
|
||||
});
|
||||
|
||||
describe("when the subtractend has blacklisted characters", [&]() {
|
||||
it("removes the included characters that are not excluded by the other set", [&]() {
|
||||
CharacterSet set1 = CharacterSet().include('a', 'f');
|
||||
|
||||
intersection = set1.remove_set(CharacterSet()
|
||||
.include_all()
|
||||
.exclude('d', 'z'));
|
||||
|
||||
AssertThat(set1, Equals(CharacterSet()
|
||||
.include('d', 'f')));
|
||||
AssertThat(intersection, Equals(CharacterSet()
|
||||
.include('a', 'c')));
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
it("works when one set is a proper subset of the other", [&]() {
|
||||
CharacterSet set1({ {'a','z'} });
|
||||
intersection = set1.remove_set(CharacterSet({ {'d', 's'} }));
|
||||
AssertThat(set1, Equals(CharacterSet({ {'a', 'c'}, {'t', 'z'} })));
|
||||
AssertThat(intersection, Equals(CharacterSet({ {'d', 's'} })));
|
||||
describe("for a set with blacklisted characters", [&]() {
|
||||
describe("when the subtractend has whitelisted characters", [&]() {
|
||||
it("adds the subtractend's inclusions to the receiver's exclusions", [&]() {
|
||||
CharacterSet set1 = CharacterSet()
|
||||
.include_all()
|
||||
.exclude('a', 'f');
|
||||
|
||||
intersection = set1.remove_set(CharacterSet()
|
||||
.include('x', 'z'));
|
||||
|
||||
AssertThat(set1, Equals(CharacterSet()
|
||||
.include_all()
|
||||
.exclude('a', 'f')
|
||||
.exclude('x', 'z')));
|
||||
|
||||
AssertThat(intersection, Equals(CharacterSet().include('x', 'z')));
|
||||
});
|
||||
});
|
||||
|
||||
describe("when the subtractend has blacklisted characters", [&]() {
|
||||
it("includes only the characters excluded by the subtractend but not by the receiver", [&]() {
|
||||
CharacterSet set1 = CharacterSet()
|
||||
.include_all()
|
||||
.exclude('a', 'm');
|
||||
|
||||
set1.remove_set(CharacterSet()
|
||||
.include_all()
|
||||
.exclude('d', 'z'));
|
||||
|
||||
AssertThat(set1, Equals(CharacterSet()
|
||||
.include('n', 'z')));
|
||||
});
|
||||
|
||||
it("returns the characters excluded by neither set", [&]() {
|
||||
CharacterSet set1 = CharacterSet()
|
||||
.include_all()
|
||||
.exclude('a', 'm');
|
||||
|
||||
intersection = set1.remove_set(CharacterSet()
|
||||
.include_all()
|
||||
.exclude('d', 'z'));
|
||||
|
||||
AssertThat(intersection, Equals(CharacterSet()
|
||||
.include_all()
|
||||
.exclude('a', 'z')));
|
||||
});
|
||||
|
||||
it("works when the sets are disjoint", [&]() {
|
||||
CharacterSet set1 = CharacterSet()
|
||||
.include_all()
|
||||
.exclude('a', 'm');
|
||||
|
||||
intersection = set1.remove_set(CharacterSet()
|
||||
.include_all()
|
||||
.exclude('d', 'z'));
|
||||
|
||||
AssertThat(set1, Equals(CharacterSet()
|
||||
.include('n', 'z')));
|
||||
|
||||
AssertThat(intersection, Equals(CharacterSet()
|
||||
.include_all()
|
||||
.exclude('a', 'z')));
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe("::included_ranges", [&]() {
|
||||
it("consolidates sequences of 3 or more consecutive characters into ranges", [&]() {
|
||||
CharacterSet set1 = CharacterSet()
|
||||
.include('a', 'c')
|
||||
.include('g')
|
||||
.include('z');
|
||||
|
||||
AssertThat(set1.included_ranges(), Equals(vector<CharacterRange>({
|
||||
CharacterRange('a', 'c'),
|
||||
CharacterRange('g'),
|
||||
CharacterRange('z'),
|
||||
})));
|
||||
});
|
||||
|
||||
it("works for a set that overlaps the right side", [&]() {
|
||||
CharacterSet set1({ {'a','s'} });
|
||||
intersection = set1.remove_set(CharacterSet({ {'m', 'z'} }));
|
||||
AssertThat(set1, Equals(CharacterSet({ {'a', 'l'} })));
|
||||
AssertThat(intersection, Equals(CharacterSet({ {'m', 's'} })));
|
||||
});
|
||||
it("doesn't consolidate sequences of 2 consecutive characters", [&]() {
|
||||
CharacterSet set1 = CharacterSet()
|
||||
.include('a', 'b')
|
||||
.include('g')
|
||||
.include('z');
|
||||
|
||||
it("works for a set that overlaps the left side", [&]() {
|
||||
CharacterSet set2({ {'m','z'} });
|
||||
intersection = set2.remove_set(CharacterSet({ {'a', 's'} }));
|
||||
AssertThat(set2, Equals(CharacterSet({ {'t', 'z'} })));
|
||||
AssertThat(intersection, Equals(CharacterSet({ {'m', 's'} })));
|
||||
});
|
||||
|
||||
it("works for sets with multiple ranges", [&]() {
|
||||
CharacterSet set1({ {'a', 'd'}, {'m', 'z'} });
|
||||
intersection = set1.remove_set(CharacterSet({ {'c', 'o'}, {'s', 'x'} }));
|
||||
AssertThat(set1, Equals(CharacterSet({ {'a', 'b'}, {'p', 'r'}, {'y', 'z'} })));
|
||||
AssertThat(intersection, Equals(CharacterSet({ {'c', 'd'}, {'m', 'o'}, {'s', 'x'} })));
|
||||
});
|
||||
|
||||
it("works when the result is empty", [&]() {
|
||||
CharacterSet set1({ 'd' });
|
||||
intersection = set1.remove_set(CharacterSet({ 'a', 'd', 'x' }));
|
||||
AssertThat(set1, Equals(CharacterSet()));
|
||||
AssertThat(intersection, Equals(CharacterSet({ 'd' })));
|
||||
AssertThat(set1.included_ranges(), Equals(vector<CharacterRange>({
|
||||
CharacterRange('a'),
|
||||
CharacterRange('b'),
|
||||
CharacterRange('g'),
|
||||
CharacterRange('z'),
|
||||
})));
|
||||
});
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -38,8 +38,8 @@ class LexTableBuilder {
|
|||
if (symbol == rules::ERROR())
|
||||
continue;
|
||||
else if (symbol == rules::END_OF_INPUT())
|
||||
result.insert(
|
||||
LexItem(symbol, after_separators(CharacterSet({ 0 }).copy())));
|
||||
result.insert(LexItem(
|
||||
symbol, after_separators(CharacterSet().include(0).copy())));
|
||||
else if (symbol.is_token())
|
||||
result.insert(
|
||||
LexItem(symbol, after_separators(lex_grammar.rule(symbol))));
|
||||
|
|
@ -52,9 +52,11 @@ class LexTableBuilder {
|
|||
if (pair == lex_state_ids.end()) {
|
||||
LexStateId state_id = lex_table.add_state();
|
||||
lex_state_ids[item_set] = state_id;
|
||||
|
||||
add_accept_token_actions(item_set, state_id);
|
||||
add_advance_actions(item_set, state_id);
|
||||
add_token_start(item_set, state_id);
|
||||
|
||||
return state_id;
|
||||
} else {
|
||||
return pair->second;
|
||||
|
|
@ -100,10 +102,10 @@ class LexTableBuilder {
|
|||
}
|
||||
|
||||
CharacterSet separator_set() const {
|
||||
set<rules::CharacterRange> ranges;
|
||||
CharacterSet result;
|
||||
for (char c : lex_grammar.separators)
|
||||
ranges.insert(c);
|
||||
return CharacterSet(ranges);
|
||||
result.include(c);
|
||||
return result;
|
||||
}
|
||||
|
||||
rules::rule_ptr after_separators(rules::rule_ptr rule) {
|
||||
|
|
|
|||
|
|
@ -68,7 +68,7 @@ void merge_char_transition(std::map<rules::CharacterSet, T> *left,
|
|||
left->insert(pairs_to_insert.begin(), pairs_to_insert.end());
|
||||
|
||||
if (!new_char_set.is_empty())
|
||||
left->insert({ new_char_set, new_pair.second });
|
||||
left->insert({ new_char_set, new_value });
|
||||
}
|
||||
|
||||
} // namespace build_tables
|
||||
|
|
|
|||
|
|
@ -245,12 +245,13 @@ class CCodeGenerator {
|
|||
}
|
||||
}
|
||||
|
||||
void condition_for_character_set(const rules::CharacterSet &set) {
|
||||
if (set.ranges.size() == 1) {
|
||||
add(condition_for_character_range(*set.ranges.begin()));
|
||||
void condition_for_character_ranges(
|
||||
const vector<rules::CharacterRange> &ranges) {
|
||||
if (ranges.size() == 1) {
|
||||
add(condition_for_character_range(*ranges.begin()));
|
||||
} else {
|
||||
bool first = true;
|
||||
for (auto &match : set.ranges) {
|
||||
for (auto &match : ranges) {
|
||||
string part = "(" + condition_for_character_range(match) + ")";
|
||||
if (first) {
|
||||
add(part);
|
||||
|
|
@ -263,15 +264,13 @@ class CCodeGenerator {
|
|||
}
|
||||
}
|
||||
|
||||
void condition_for_character_rule(const rules::CharacterSet &rule) {
|
||||
pair<rules::CharacterSet, bool> representation =
|
||||
rule.most_compact_representation();
|
||||
if (representation.second) {
|
||||
condition_for_character_set(representation.first);
|
||||
} else {
|
||||
void condition_for_character_set(const rules::CharacterSet &rule) {
|
||||
if (rule.includes_all) {
|
||||
add("!(");
|
||||
condition_for_character_set(rule.complement());
|
||||
condition_for_character_ranges(rule.excluded_ranges());
|
||||
add(")");
|
||||
} else {
|
||||
condition_for_character_ranges(rule.included_ranges());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -319,7 +318,7 @@ class CCodeGenerator {
|
|||
line("START_TOKEN();");
|
||||
for (auto pair : lex_state.actions)
|
||||
if (!pair.first.is_empty())
|
||||
_if([&]() { condition_for_character_rule(pair.first); },
|
||||
_if([&]() { condition_for_character_set(pair.first); },
|
||||
[&]() { code_for_lex_actions(pair.second, expected_inputs); });
|
||||
code_for_lex_actions(lex_state.default_action, expected_inputs);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -28,7 +28,7 @@ class ExpandTokens : public rules::IdentityRuleFn {
|
|||
rule_ptr apply_to(const String *rule) {
|
||||
vector<rule_ptr> elements;
|
||||
for (char val : rule->value)
|
||||
elements.push_back(rules::CharacterSet({ val }).copy());
|
||||
elements.push_back(rules::CharacterSet().include(val).copy());
|
||||
return rules::Seq::Build(elements);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -115,7 +115,7 @@ class PatternParser {
|
|||
case ']': { return error("unmatched close square bracket"); }
|
||||
case '.': {
|
||||
next();
|
||||
return { CharacterSet({ '\n' }).complement().copy(), nullptr };
|
||||
return { CharacterSet().include_all().exclude('\n').copy(), nullptr };
|
||||
}
|
||||
default: {
|
||||
auto pair = single_char();
|
||||
|
|
@ -127,20 +127,24 @@ class PatternParser {
|
|||
}
|
||||
|
||||
pair<CharacterSet, const GrammarError *> char_set() {
|
||||
CharacterSet result;
|
||||
bool is_affirmative = true;
|
||||
if (peek() == '^') {
|
||||
next();
|
||||
is_affirmative = false;
|
||||
result.include_all();
|
||||
}
|
||||
CharacterSet result;
|
||||
|
||||
while (has_more_input() && (peek() != ']')) {
|
||||
auto pair = single_char();
|
||||
if (pair.second)
|
||||
return { CharacterSet(), pair.second };
|
||||
result.add_set(pair.first);
|
||||
if (is_affirmative)
|
||||
result.add_set(pair.first);
|
||||
else
|
||||
result.remove_set(pair.first);
|
||||
}
|
||||
if (!is_affirmative)
|
||||
result = result.complement();
|
||||
|
||||
return { result, nullptr };
|
||||
}
|
||||
|
||||
|
|
@ -157,10 +161,10 @@ class PatternParser {
|
|||
next();
|
||||
if (peek() == '-') {
|
||||
next();
|
||||
value = CharacterSet({ CharacterRange(first_char, peek()) });
|
||||
value = CharacterSet().include(first_char, peek());
|
||||
next();
|
||||
} else {
|
||||
value = CharacterSet({ first_char });
|
||||
value = CharacterSet().include(first_char);
|
||||
}
|
||||
}
|
||||
return { value, nullptr };
|
||||
|
|
@ -169,19 +173,20 @@ class PatternParser {
|
|||
CharacterSet escaped_char(char value) {
|
||||
switch (value) {
|
||||
case 'a':
|
||||
return CharacterSet({ { 'a', 'z' }, { 'A', 'Z' } });
|
||||
return CharacterSet().include('a', 'z').include('A', 'Z');
|
||||
case 'w':
|
||||
return CharacterSet({ { 'a', 'z' }, { 'A', 'Z' }, { '0', '9' } });
|
||||
return CharacterSet().include('a', 'z').include('A', 'Z').include('0',
|
||||
'9');
|
||||
case 'd':
|
||||
return CharacterSet({ { '0', '9' } });
|
||||
return CharacterSet().include('0', '9');
|
||||
case 't':
|
||||
return CharacterSet({ '\t' });
|
||||
return CharacterSet().include('\t');
|
||||
case 'n':
|
||||
return CharacterSet({ '\n' });
|
||||
return CharacterSet().include('\n');
|
||||
case 'r':
|
||||
return CharacterSet({ '\r' });
|
||||
return CharacterSet().include('\r');
|
||||
default:
|
||||
return CharacterSet({ value });
|
||||
return CharacterSet().include(value);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@
|
|||
namespace tree_sitter {
|
||||
namespace rules {
|
||||
|
||||
using std::ostream;
|
||||
using std::string;
|
||||
|
||||
static const unsigned char MAX_CHAR = -1;
|
||||
|
|
@ -53,5 +54,9 @@ string CharacterRange::to_string() const {
|
|||
return string() + escape_character(min) + "-" + escape_character(max);
|
||||
}
|
||||
|
||||
ostream &operator<<(ostream &stream, const CharacterRange &range) {
|
||||
return stream << range.to_string();
|
||||
}
|
||||
|
||||
} // namespace rules
|
||||
} // namespace tree_sitter
|
||||
|
|
|
|||
|
|
@ -20,6 +20,8 @@ struct CharacterRange {
|
|||
std::string to_string() const;
|
||||
};
|
||||
|
||||
std::ostream &operator<<(std::ostream &stream, const CharacterRange &rule);
|
||||
|
||||
} // namespace rules
|
||||
} // namespace tree_sitter
|
||||
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
#include "compiler/rules/character_set.h"
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include "compiler/rules/visitor.h"
|
||||
|
||||
namespace tree_sitter {
|
||||
|
|
@ -9,32 +10,87 @@ namespace rules {
|
|||
using std::string;
|
||||
using std::hash;
|
||||
using std::set;
|
||||
using std::pair;
|
||||
using std::initializer_list;
|
||||
using std::vector;
|
||||
|
||||
static const unsigned char MAX_CHAR = -1;
|
||||
static void add_range(set<uint32_t> *characters, CharacterRange range) {
|
||||
for (uint32_t c = range.min; c <= range.max; c++)
|
||||
characters->insert(c);
|
||||
}
|
||||
|
||||
CharacterSet::CharacterSet() : ranges({}) {}
|
||||
CharacterSet::CharacterSet(const set<CharacterRange> &ranges)
|
||||
: ranges(ranges) {}
|
||||
CharacterSet::CharacterSet(const initializer_list<CharacterRange> &ranges)
|
||||
: ranges(ranges) {}
|
||||
static void remove_range(set<uint32_t> *characters, CharacterRange range) {
|
||||
for (uint32_t c = range.min; c <= range.max; c++)
|
||||
characters->erase(c);
|
||||
}
|
||||
|
||||
// Removes from `left` every character that also appears in `right`.
// Returns the set of characters that were actually removed, i.e. the ones
// that were present in both sets before the call.
static set<uint32_t> remove_chars(set<uint32_t> *left,
                                  const set<uint32_t> &right) {
  set<uint32_t> removed;
  for (uint32_t candidate : right) {
    auto found = left->find(candidate);
    if (found == left->end())
      continue;  // not in `left`, so there is nothing to remove or report
    left->erase(found);
    removed.insert(candidate);
  }
  return removed;
}
|
||||
|
||||
// Inserts every character of `right` into `left`.
// Returns the set of characters that were newly added, i.e. the ones that
// `left` did not already contain.
static set<uint32_t> add_chars(set<uint32_t> *left,
                               const set<uint32_t> &right) {
  set<uint32_t> added;
  for (uint32_t candidate : right) {
    // insert() reports through `.second` whether the element was new.
    const bool was_new = left->insert(candidate).second;
    if (was_new)
      added.insert(candidate);
  }
  return added;
}
|
||||
|
||||
static vector<CharacterRange> consolidate_ranges(const set<uint32_t> &chars) {
|
||||
vector<CharacterRange> result;
|
||||
for (uint32_t c : chars) {
|
||||
size_t size = result.size();
|
||||
if (size >= 2 && result[size - 2].max == (c - 2)) {
|
||||
result.pop_back();
|
||||
result.back().max = c;
|
||||
} else if (size >= 1) {
|
||||
CharacterRange &last = result.back();
|
||||
if (last.min < last.max && last.max == (c - 1))
|
||||
last.max = c;
|
||||
else
|
||||
result.push_back(c);
|
||||
} else {
|
||||
result.push_back(c);
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// Default constructor: the include-all flag is off and both the included and
// the excluded character collections start out empty.
CharacterSet::CharacterSet()
    : includes_all(false), included_chars(), excluded_chars() {}
|
||||
|
||||
bool CharacterSet::operator==(const Rule &rule) const {
|
||||
const CharacterSet *other = dynamic_cast<const CharacterSet *>(&rule);
|
||||
return other && (ranges == other->ranges);
|
||||
return other && (includes_all == other->includes_all) &&
|
||||
(included_chars == other->included_chars) &&
|
||||
(excluded_chars == other->excluded_chars);
|
||||
}
|
||||
|
||||
bool CharacterSet::operator<(const CharacterSet &other) const {
|
||||
return ranges < other.ranges;
|
||||
if (!includes_all && other.includes_all)
|
||||
return true;
|
||||
if (includes_all && !other.includes_all)
|
||||
return false;
|
||||
if (included_chars < other.included_chars)
|
||||
return true;
|
||||
if (other.included_chars < included_chars)
|
||||
return false;
|
||||
return excluded_chars < other.excluded_chars;
|
||||
}
|
||||
|
||||
size_t CharacterSet::hash_code() const {
|
||||
size_t result = std::hash<size_t>()(ranges.size());
|
||||
for (auto &range : ranges) {
|
||||
result ^= std::hash<unsigned char>()(range.min);
|
||||
result ^= std::hash<unsigned char>()(range.max);
|
||||
}
|
||||
size_t result = hash<bool>()(includes_all);
|
||||
result ^= hash<size_t>()(included_chars.size());
|
||||
for (auto &c : included_chars)
|
||||
result ^= hash<uint32_t>()(c);
|
||||
result ^= hash<size_t>()(excluded_chars.size());
|
||||
for (auto &c : excluded_chars)
|
||||
result ^= hash<uint32_t>()(c);
|
||||
return result;
|
||||
}
|
||||
|
||||
|
|
@ -44,97 +100,88 @@ rule_ptr CharacterSet::copy() const {
|
|||
|
||||
string CharacterSet::to_string() const {
|
||||
string result("(char");
|
||||
for (auto &range : ranges)
|
||||
result += " " + range.to_string();
|
||||
if (includes_all)
|
||||
result += " include_all";
|
||||
if (!included_chars.empty()) {
|
||||
result += " (include";
|
||||
for (auto r : included_ranges())
|
||||
result += string(" ") + r.to_string();
|
||||
result += ")";
|
||||
}
|
||||
if (!excluded_chars.empty()) {
|
||||
result += " (exclude";
|
||||
for (auto r : excluded_ranges())
|
||||
result += string(" ") + r.to_string();
|
||||
result += ")";
|
||||
}
|
||||
return result + ")";
|
||||
}
|
||||
|
||||
CharacterSet CharacterSet::complement() const {
|
||||
CharacterSet result({ { 0, MAX_CHAR } });
|
||||
result.remove_set(*this);
|
||||
return result;
|
||||
CharacterSet &CharacterSet::include_all() {
|
||||
includes_all = true;
|
||||
return *this;
|
||||
}
|
||||
|
||||
std::pair<CharacterSet, bool> CharacterSet::most_compact_representation()
|
||||
const {
|
||||
auto first_range = *ranges.begin();
|
||||
if (first_range.min == 0 && first_range.max > 0) {
|
||||
return { this->complement(), false };
|
||||
} else {
|
||||
return { *this, true };
|
||||
}
|
||||
CharacterSet &CharacterSet::include(uint32_t min, uint32_t max) {
|
||||
if (includes_all)
|
||||
remove_range(&excluded_chars, CharacterRange(min, max));
|
||||
else
|
||||
add_range(&included_chars, CharacterRange(min, max));
|
||||
return *this;
|
||||
}
|
||||
|
||||
void add_range(CharacterSet *self, CharacterRange addition) {
|
||||
set<CharacterRange> new_ranges;
|
||||
for (auto range : self->ranges) {
|
||||
bool is_adjacent = false;
|
||||
if (range.min < addition.min && range.max >= addition.min - 1) {
|
||||
is_adjacent = true;
|
||||
addition.min = range.min;
|
||||
}
|
||||
if (range.max > addition.max && range.min <= addition.max + 1) {
|
||||
is_adjacent = true;
|
||||
addition.max = range.max;
|
||||
}
|
||||
if (!is_adjacent) {
|
||||
new_ranges.insert(range);
|
||||
}
|
||||
}
|
||||
new_ranges.insert(addition);
|
||||
self->ranges = new_ranges;
|
||||
CharacterSet &CharacterSet::exclude(uint32_t min, uint32_t max) {
|
||||
if (includes_all)
|
||||
add_range(&excluded_chars, CharacterRange(min, max));
|
||||
else
|
||||
remove_range(&included_chars, CharacterRange(min, max));
|
||||
return *this;
|
||||
}
|
||||
|
||||
CharacterSet remove_range(CharacterSet *self, CharacterRange range_to_remove) {
|
||||
CharacterSet removed_set;
|
||||
set<CharacterRange> new_ranges;
|
||||
for (auto range : self->ranges) {
|
||||
if (range_to_remove.min <= range.min) {
|
||||
if (range_to_remove.max < range.min) {
|
||||
new_ranges.insert(range);
|
||||
} else if (range_to_remove.max < range.max) {
|
||||
new_ranges.insert(CharacterRange(range_to_remove.max + 1, range.max));
|
||||
add_range(&removed_set, CharacterRange(range.min, range_to_remove.max));
|
||||
} else {
|
||||
add_range(&removed_set, range);
|
||||
}
|
||||
} else if (range_to_remove.min <= range.max) {
|
||||
if (range_to_remove.max < range.max) {
|
||||
new_ranges.insert(CharacterRange(range.min, range_to_remove.min - 1));
|
||||
new_ranges.insert(CharacterRange(range_to_remove.max + 1, range.max));
|
||||
add_range(&removed_set, range_to_remove);
|
||||
} else {
|
||||
new_ranges.insert(CharacterRange(range.min, range_to_remove.min - 1));
|
||||
add_range(&removed_set, CharacterRange(range_to_remove.min, range.max));
|
||||
}
|
||||
} else {
|
||||
new_ranges.insert(range);
|
||||
}
|
||||
}
|
||||
self->ranges = new_ranges;
|
||||
return removed_set;
|
||||
}
|
||||
// Single-character convenience overload: forwards to the two-argument
// include() with `c` as both ends of the range and returns its result.
CharacterSet &CharacterSet::include(uint32_t c) {
  return include(c, c);
}
|
||||
|
||||
bool CharacterSet::is_empty() const { return ranges.empty(); }
|
||||
// Single-character convenience overload: forwards to the two-argument
// exclude() with `c` as both ends of the range and returns its result.
CharacterSet &CharacterSet::exclude(uint32_t c) {
  return exclude(c, c);
}
|
||||
|
||||
bool CharacterSet::is_empty() const {
|
||||
return !includes_all && included_chars.empty();
|
||||
}
|
||||
|
||||
void CharacterSet::add_set(const CharacterSet &other) {
|
||||
for (auto &other_range : other.ranges) {
|
||||
add_range(this, other_range);
|
||||
}
|
||||
for (uint32_t c : other.included_chars)
|
||||
included_chars.insert(c);
|
||||
}
|
||||
|
||||
CharacterSet CharacterSet::remove_set(const CharacterSet &other) {
|
||||
CharacterSet result;
|
||||
for (auto &other_range : other.ranges) {
|
||||
auto removed_set = remove_range(this, other_range);
|
||||
result.add_set(removed_set);
|
||||
if (includes_all) {
|
||||
if (other.includes_all) {
|
||||
result.includes_all = true;
|
||||
result.excluded_chars = excluded_chars;
|
||||
included_chars = add_chars(&result.excluded_chars, other.excluded_chars);
|
||||
excluded_chars = {};
|
||||
includes_all = false;
|
||||
} else {
|
||||
result.included_chars = add_chars(&excluded_chars, other.included_chars);
|
||||
}
|
||||
} else {
|
||||
if (other.includes_all) {
|
||||
result.included_chars = included_chars;
|
||||
included_chars =
|
||||
remove_chars(&result.included_chars, other.excluded_chars);
|
||||
} else {
|
||||
result.included_chars =
|
||||
remove_chars(&included_chars, other.included_chars);
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
CharacterSet CharacterSet::intersect(const CharacterSet &set) const {
|
||||
CharacterSet copy = *this;
|
||||
return copy.remove_set(set);
|
||||
vector<CharacterRange> CharacterSet::included_ranges() const {
|
||||
return consolidate_ranges(included_chars);
|
||||
}
|
||||
|
||||
vector<CharacterRange> CharacterSet::excluded_ranges() const {
|
||||
return consolidate_ranges(excluded_chars);
|
||||
}
|
||||
|
||||
void CharacterSet::accept(Visitor *visitor) const { visitor->visit(this); }
|
||||
|
|
|
|||
|
|
@ -1,10 +1,11 @@
|
|||
#ifndef COMPILER_RULES_CHARACTER_SET_H_
|
||||
#define COMPILER_RULES_CHARACTER_SET_H_
|
||||
|
||||
#include <initializer_list>
|
||||
#include <set>
|
||||
#include <stdint.h>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include "compiler/rules/rule.h"
|
||||
#include "compiler/rules/character_range.h"
|
||||
|
||||
|
|
@ -14,8 +15,12 @@ namespace rules {
|
|||
class CharacterSet : public Rule {
|
||||
public:
|
||||
CharacterSet();
|
||||
explicit CharacterSet(const std::set<CharacterRange> &ranges);
|
||||
explicit CharacterSet(const std::initializer_list<CharacterRange> &ranges);
|
||||
|
||||
CharacterSet &include_all();
|
||||
CharacterSet &include(uint32_t c);
|
||||
CharacterSet &include(uint32_t min, uint32_t max);
|
||||
CharacterSet &exclude(uint32_t c);
|
||||
CharacterSet &exclude(uint32_t min, uint32_t max);
|
||||
|
||||
bool operator==(const Rule &other) const;
|
||||
bool operator<(const CharacterSet &) const;
|
||||
|
|
@ -26,12 +31,14 @@ class CharacterSet : public Rule {
|
|||
|
||||
void add_set(const CharacterSet &other);
|
||||
CharacterSet remove_set(const CharacterSet &other);
|
||||
CharacterSet complement() const;
|
||||
CharacterSet intersect(const CharacterSet &) const;
|
||||
std::pair<CharacterSet, bool> most_compact_representation() const;
|
||||
bool is_empty() const;
|
||||
|
||||
std::set<CharacterRange> ranges;
|
||||
std::vector<CharacterRange> included_ranges() const;
|
||||
std::vector<CharacterRange> excluded_ranges() const;
|
||||
|
||||
bool includes_all;
|
||||
std::set<uint32_t> included_chars;
|
||||
std::set<uint32_t> excluded_chars;
|
||||
};
|
||||
|
||||
} // namespace rules
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue