In compiler, distinguish between anonymous tokens and hidden rules

This commit is contained in:
Max Brunsfeld 2015-09-05 17:05:37 -07:00
parent 4b270c8604
commit 5982b77c97
46 changed files with 41131 additions and 40884 deletions

View file

@ -1,8 +1,7 @@
#include "compiler/compiler_spec_helper.h"
#include "compiler/build_tables/build_parse_table.h"
#include "compiler/parse_table.h"
#include "compiler/lexical_grammar.h"
#include "compiler/syntax_grammar.h"
#include "compiler/prepared_grammar.h"
#include "compiler/rules/built_in_symbols.h"
using namespace rules;
@ -12,15 +11,35 @@ START_TEST
describe("build_parse_table", []() {
SyntaxGrammar parse_grammar{{
{ "rule0", choice({ i_sym(1), i_sym(2) }) },
{ "rule1", i_token(0) },
{ "rule2", i_token(1) },
}, {}, { Symbol(2, SymbolOptionToken) }, {}};
{
"rule0",
choice({ i_sym(1), i_sym(2) }),
RuleEntryTypeNamed
},
{
"rule1",
i_token(0),
RuleEntryTypeNamed
},
{
"rule2",
i_token(1),
RuleEntryTypeNamed
},
}, { Symbol(2, true) }, {}};
LexicalGrammar lex_grammar{{
{ "token0", pattern("[a-c]") },
{ "token1", pattern("[b-d]") },
}, {}, {}};
{
"token0",
pattern("[a-c]"),
RuleEntryTypeNamed
},
{
"token1",
pattern("[b-d]"),
RuleEntryTypeNamed
},
}, {}};
it("first looks for the start rule and its item set closure", [&]() {
auto result = build_parse_table(parse_grammar, lex_grammar);
@ -32,11 +51,11 @@ describe("build_parse_table", []() {
// expanded from the item set closure of the start item
{ Symbol(1), {ParseAction::Shift(2, { 0 })} },
{ Symbol(2), {ParseAction::Shift(2, { 0 })} },
{ Symbol(0, SymbolOptionToken), {ParseAction::Shift(3, { 0 })} },
{ Symbol(1, SymbolOptionToken), {ParseAction::Shift(4, { 0 })} },
{ Symbol(0, true), {ParseAction::Shift(3, { 0 })} },
{ Symbol(1, true), {ParseAction::Shift(4, { 0 })} },
// for the ubiquitous_token 'token2'
{ Symbol(2, SymbolOptionToken), {ParseAction::ShiftExtra()} },
{ Symbol(2, true), {ParseAction::ShiftExtra()} },
})));
});
@ -52,7 +71,7 @@ describe("build_parse_table", []() {
{ END_OF_INPUT(), {ParseAction::Accept()} },
// for the ubiquitous_token 'token2'
{ Symbol(2, SymbolOptionToken), {ParseAction::ShiftExtra()} },
{ Symbol(2, true), {ParseAction::ShiftExtra()} },
})));
});
@ -63,7 +82,7 @@ describe("build_parse_table", []() {
{ END_OF_INPUT(), {ParseAction::Reduce(Symbol(0), 1, 0, AssociativityLeft, 0)} },
// for the ubiquitous_token 'token2'
{ Symbol(2, SymbolOptionToken), {ParseAction::ShiftExtra()} },
{ Symbol(2, true), {ParseAction::ShiftExtra()} },
})));
});
});

View file

@ -1,5 +1,5 @@
#include "compiler/compiler_spec_helper.h"
#include "compiler/syntax_grammar.h"
#include "compiler/prepared_grammar.h"
#include "compiler/build_tables/first_symbols.h"
#include "compiler/rules/metadata.h"
@ -16,7 +16,7 @@ describe("first_symbols", []() {
auto rule = seq({ i_token(0), i_token(1) });
AssertThat(first_symbols(rule, null_grammar), Equals(set<Symbol>({
Symbol(0, SymbolOptionToken),
Symbol(0, true),
})));
});
@ -28,8 +28,8 @@ describe("first_symbols", []() {
i_token(1) });
AssertThat(first_symbols(rule, null_grammar), Equals(set<Symbol>({
Symbol(0, SymbolOptionToken),
Symbol(1, SymbolOptionToken)
Symbol(0, true),
Symbol(1, true)
})));
});
@ -41,16 +41,21 @@ describe("first_symbols", []() {
i_sym(0) });
SyntaxGrammar grammar{{
{ "rule0", seq({
i_token(2),
i_token(3),
i_token(4) }) }
}, {}, {}, {}};
{
"rule0",
seq({
i_token(2),
i_token(3),
i_token(4),
}),
RuleEntryTypeNamed
}
}, {}, {}};
AssertThat(first_symbols(rule, grammar), Equals(set<Symbol>({
Symbol(0),
Symbol(0, SymbolOptionToken),
Symbol(2, SymbolOptionToken),
Symbol(0, true),
Symbol(2, true),
})));
});
@ -60,15 +65,20 @@ describe("first_symbols", []() {
i_token(1) });
SyntaxGrammar grammar{{
{ "rule0", choice({
i_token(0),
blank() }) }
}, {}, {}, {}};
{
"rule0",
choice({
i_token(0),
blank(),
}),
RuleEntryTypeNamed
},
}, {}, {}};
AssertThat(first_symbols(rule, grammar), Equals(set<Symbol>({
Symbol(0),
Symbol(0, SymbolOptionToken),
Symbol(1, SymbolOptionToken),
Symbol(0, true),
Symbol(1, true),
})));
});
});
@ -76,17 +86,21 @@ describe("first_symbols", []() {
describe("when there are left-recursive rules", [&]() {
it("terminates", [&]() {
SyntaxGrammar grammar{{
{ "rule0", choice({
seq({ i_sym(0), i_token(10) }),
i_token(11),
}) },
}, {}, {}, {}};
{
"rule0",
choice({
seq({ i_sym(0), i_token(10) }),
i_token(11),
}),
RuleEntryTypeNamed
},
}, {}, {}};
auto rule = i_sym(0);
AssertThat(first_symbols(rule, grammar), Equals(set<Symbol>({
Symbol(0),
Symbol(11, SymbolOptionToken)
Symbol(11, true)
})));
});
});
@ -95,7 +109,7 @@ describe("first_symbols", []() {
auto rule = make_shared<Metadata>(i_token(3), map<rules::MetadataKey, int>());
AssertThat(first_symbols(rule, null_grammar), Equals(set<Symbol>({
Symbol(3, SymbolOptionToken),
Symbol(3, true),
})));
});
});

View file

@ -1,5 +1,5 @@
#include "compiler/compiler_spec_helper.h"
#include "compiler/syntax_grammar.h"
#include "compiler/prepared_grammar.h"
#include "compiler/build_tables/item_set_closure.h"
#include "compiler/build_tables/item_set_transitions.h"
@ -10,29 +10,39 @@ START_TEST
describe("item_set_closure", []() {
SyntaxGrammar grammar{{
{ "E", seq({
i_sym(1),
i_token(11) }) },
{ "T", seq({
i_token(12),
i_token(13) }) },
}, {}, {}, {}};
{
"E",
seq({
i_sym(1),
i_token(11),
}),
RuleEntryTypeNamed,
},
{
"T",
seq({
i_token(12),
i_token(13),
}),
RuleEntryTypeNamed,
},
}, {}, {}};
it("adds items at the beginnings of referenced rules", [&]() {
ParseItemSet item_set = item_set_closure(
ParseItem(Symbol(0), grammar.rule(Symbol(0)), {}),
set<Symbol>({ Symbol(10, SymbolOptionToken) }),
ParseItem(Symbol(0), grammar.rules[0].rule, {}),
set<Symbol>({ Symbol(10, true) }),
grammar
);
AssertThat(item_set, Equals(ParseItemSet({
{
ParseItem(Symbol(1), grammar.rule(Symbol(1)), {}),
set<Symbol>({ Symbol(11, SymbolOptionToken) }),
ParseItem(Symbol(1), grammar.rules[1].rule, {}),
set<Symbol>({ Symbol(11, true) }),
},
{
ParseItem(Symbol(0), grammar.rule(Symbol(0)), {}),
set<Symbol>({ Symbol(10, SymbolOptionToken) }),
ParseItem(Symbol(0), grammar.rules[0].rule, {}),
set<Symbol>({ Symbol(10, true) }),
},
})));
});

View file

@ -1,6 +1,6 @@
#include "compiler/compiler_spec_helper.h"
#include "compiler/build_tables/item_set_transitions.h"
#include "compiler/syntax_grammar.h"
#include "compiler/prepared_grammar.h"
#include "compiler/helpers/rule_helpers.h"
using namespace rules;
@ -43,29 +43,37 @@ describe("char_transitions(LexItemSet)", []() {
describe("sym_transitions(ParseItemSet, SyntaxGrammar)", [&]() {
SyntaxGrammar grammar{{
{ "A", blank() },
{ "B", i_token(21) },
}, {}, {}, {}};
{
"A",
blank(),
RuleEntryTypeNamed
},
{
"B",
i_token(21),
RuleEntryTypeNamed
},
}, {}, {}};
it("computes the closure of the new item sets", [&]() {
ParseItemSet set1({
{
ParseItem(Symbol(0), seq({ i_token(22), i_sym(1) }), { Symbol(101) }),
set<Symbol>({ Symbol(23, SymbolOptionToken) })
set<Symbol>({ Symbol(23, true) })
},
});
AssertThat(sym_transitions(set1, grammar), Equals(map<Symbol, ParseItemSet>({
{
Symbol(22, SymbolOptionToken),
Symbol(22, true),
ParseItemSet({
{
ParseItem(Symbol(0), i_sym(1), { Symbol(101), Symbol(22) }),
set<Symbol>({ Symbol(23, SymbolOptionToken) }),
set<Symbol>({ Symbol(23, true) }),
},
{
ParseItem(Symbol(1), i_token(21), {}),
set<Symbol>({ Symbol(23, SymbolOptionToken) })
set<Symbol>({ Symbol(23, true) })
},
})
},

View file

@ -2,7 +2,7 @@
#include "compiler/rules/built_in_symbols.h"
#include "compiler/parse_table.h"
#include "compiler/build_tables/lex_conflict_manager.h"
#include "compiler/syntax_grammar.h"
#include "compiler/prepared_grammar.h"
using namespace rules;
using namespace build_tables;
@ -11,16 +11,24 @@ START_TEST
describe("LexConflictManager", []() {
LexicalGrammar lexical_grammar{{
{ "other_token", pattern("[a-b]") },
{ "lookahead_token", pattern("[c-d]") },
}, {}, {}};
{
"other_token",
pattern("[a-b]"),
RuleEntryTypeNamed
},
{
"lookahead_token",
pattern("[c-d]"),
RuleEntryTypeNamed
},
}, {}};
LexConflictManager conflict_manager(lexical_grammar);
bool update;
Symbol sym1(0, SymbolOptionToken);
Symbol sym2(1, SymbolOptionToken);
Symbol sym3(2, SymbolOptionToken);
Symbol sym1(0, true);
Symbol sym2(1, true);
Symbol sym3(2, true);
it("favors non-errors over lexical errors", [&]() {
update = conflict_manager.resolve(LexAction::Advance(2, {0}), LexAction::Error());

View file

@ -2,7 +2,7 @@
#include "compiler/rules/built_in_symbols.h"
#include "compiler/parse_table.h"
#include "compiler/build_tables/parse_conflict_manager.h"
#include "compiler/syntax_grammar.h"
#include "compiler/prepared_grammar.h"
using namespace rules;
using namespace build_tables;
@ -11,17 +11,37 @@ START_TEST
describe("ParseConflictManager", []() {
SyntaxGrammar syntax_grammar{{
{ "in_progress_rule1", i_token(0) },
{ "in_progress_rule2", i_token(0) },
{ "reduced_rule", i_token(0) },
{ "other_rule1", i_token(0) },
{ "other_rule2", i_token(0) },
}, {}, { Symbol(2, SymbolOptionToken) }, {}};
{
"in_progress_rule1",
i_token(0),
RuleEntryTypeNamed,
},
{
"in_progress_rule2",
i_token(0),
RuleEntryTypeNamed,
},
{
"reduced_rule",
i_token(0),
RuleEntryTypeNamed,
},
{
"other_rule1",
i_token(0),
RuleEntryTypeNamed,
},
{
"other_rule2",
i_token(0),
RuleEntryTypeNamed,
},
}, { Symbol(2, true) }, {}};
pair<bool, ConflictType> result;
Symbol sym1(0);
Symbol sym2(1);
Symbol lookahead_sym(1, SymbolOptionToken);
Symbol lookahead_sym(1, true);
ParseConflictManager *conflict_manager;
before_each([&]() {

View file

@ -1,7 +1,7 @@
#include "compiler/compiler_spec_helper.h"
#include "compiler/build_tables/rule_can_be_blank.h"
#include "compiler/rules/metadata.h"
#include "compiler/syntax_grammar.h"
#include "compiler/prepared_grammar.h"
using namespace rules;
using build_tables::rule_can_be_blank;
@ -57,13 +57,23 @@ describe("rule_can_be_blank", [&]() {
describe("checking recursively (by expanding non-terminals)", [&]() {
SyntaxGrammar grammar{{
{ "A", choice({
seq({ i_sym(0), i_token(11) }),
blank() }) },
{ "B", choice({
seq({ i_sym(1), i_token(12) }),
i_token(13) }) },
}, {}, {}, {}};
{
"A",
choice({
seq({ i_sym(0), i_token(11) }),
blank()
}),
RuleEntryTypeNamed,
},
{
"B",
choice({
seq({ i_sym(1), i_token(12) }),
i_token(13)
}),
RuleEntryTypeNamed,
},
}, {}, {}};
it("terminates for left-recursive rules that can be blank", [&]() {
rule = i_sym(0);

View file

@ -48,6 +48,20 @@ class rule_list : public vector<pair<string, rule_ptr>> {
vector<pair<string, rule_ptr>>(list) {}
};
// A vector wrapper whose operator== compares element-by-element against a
// plain std::vector<T>. This exists so spec assertions (AssertThat/Equals)
// can compare against vectors of types (e.g. RuleEntry) whose equality is
// provided by a free operator== rather than a member.
template<typename T>
class eq_vector : public std::vector<T> {
 public:
  // True when both sequences have the same length and each pair of
  // corresponding elements compares equal via T's operator==.
  bool operator==(const std::vector<T> &other) const {
    // std::equal replaces the hand-rolled index loop; the size check is
    // still required because the two-iterator overload assumes equal length.
    return this->size() == other.size() &&
           std::equal(this->begin(), this->end(), other.begin());
  }

  eq_vector(const std::initializer_list<T> &list) : std::vector<T>(list) {}
};
class rule_vector : public vector<rule_ptr> {
public:
bool operator==(const vector<rule_ptr> &other) const {

View file

@ -6,43 +6,41 @@ namespace tree_sitter {
using std::make_shared;
using std::set;
using std::map;
using std::ostream;
using std::string;
using std::to_string;
namespace rules {
rule_ptr character(const set<uint32_t> &ranges) {
return character(ranges, true);
}
rule_ptr character(const set<uint32_t> &ranges) {
return character(ranges, true);
}
rule_ptr character(const set<uint32_t> &chars, bool sign) {
CharacterSet result;
if (sign) {
for (uint32_t c : chars)
result.include(c);
} else {
result.include_all();
for (uint32_t c : chars)
result.exclude(c);
}
return result.copy();
rule_ptr character(const set<uint32_t> &chars, bool sign) {
rules::CharacterSet result;
if (sign) {
for (uint32_t c : chars)
result.include(c);
} else {
result.include_all();
for (uint32_t c : chars)
result.exclude(c);
}
return result.copy();
}
rule_ptr i_sym(size_t index) {
return make_shared<rules::Symbol>(index);
}
rule_ptr i_sym(size_t index) {
return make_shared<rules::Symbol>(index);
}
rule_ptr i_aux_sym(size_t index) {
return make_shared<rules::Symbol>(index, SymbolOptionAuxiliary);
}
rule_ptr i_token(size_t index) {
return make_shared<rules::Symbol>(index, true);
}
rule_ptr i_token(size_t index) {
return make_shared<rules::Symbol>(index, SymbolOptionToken);
}
rule_ptr metadata(rule_ptr rule, map<rules::MetadataKey, int> values) {
return make_shared<rules::Metadata>(rule, values);
}
rule_ptr i_aux_token(size_t index) {
return make_shared<rules::Symbol>(index, SymbolOption(SymbolOptionAuxiliary|SymbolOptionToken));
}
rule_ptr metadata(rule_ptr rule, map<MetadataKey, int> values) {
return make_shared<Metadata>(rule, values);
}
// Structural equality for grammar rule entries, used by the spec helpers'
// Equals() assertions: two entries match when their names agree, their rule
// trees compare equal BY VALUE (the stored rule pointers are dereferenced,
// so distinct shared_ptr instances holding equivalent rules still match),
// and their entry types (named / hidden / anonymous) agree.
bool operator==(const RuleEntry &left, const RuleEntry &right) {
return left.name == right.name && left.rule->operator==(*right.rule) &&
left.type == right.type;
}
}

View file

@ -4,17 +4,16 @@
#include "tree_sitter/compiler.h"
#include "compiler/rules/character_set.h"
#include "compiler/rules/metadata.h"
#include "compiler/prepared_grammar.h"
namespace tree_sitter {
namespace rules {
rule_ptr metadata(rule_ptr, std::map<MetadataKey, int>);
rule_ptr character(const std::set<uint32_t> &);
rule_ptr character(const std::set<uint32_t> &, bool sign);
rule_ptr i_sym(size_t index);
rule_ptr i_aux_sym(size_t index);
rule_ptr i_token(size_t index);
rule_ptr i_aux_token(size_t index);
}
rule_ptr metadata(rule_ptr, std::map<rules::MetadataKey, int>);
rule_ptr character(const std::set<uint32_t> &);
rule_ptr character(const std::set<uint32_t> &, bool sign);
rule_ptr i_sym(size_t index);
rule_ptr i_token(size_t index);
bool operator==(const RuleEntry &left, const RuleEntry &right);
}
#endif

View file

@ -7,6 +7,7 @@
#include <map>
#include <unordered_set>
#include <vector>
#include "compiler/prepared_grammar.h"
using std::cout;
@ -83,4 +84,16 @@ inline std::ostream& operator<<(std::ostream &stream, const std::pair<T1, T2> &p
} // namespace std
namespace tree_sitter {
using std::ostream;
using std::string;
using std::to_string;
// Debug printer for RuleEntry, used when a spec assertion fails: renders the
// entry as "{<name>, <rule>, <type>}". The type enum is printed as its
// underlying integer via to_string.
// NOTE(review): `entry.rule` is streamed directly; whether this prints the
// rule's contents or just a pointer value depends on an operator<< for the
// rule pointer type being in scope — confirm one is provided elsewhere.
inline ostream &operator<<(ostream &stream, const RuleEntry &entry) {
return stream << string("{") << entry.name << string(", ") << entry.rule << string(", ") << to_string(entry.type) << string("}");
}
}
#endif

View file

@ -1,5 +1,5 @@
#include "compiler/compiler_spec_helper.h"
#include "compiler/syntax_grammar.h"
#include "compiler/prepared_grammar.h"
#include "compiler/prepare_grammar/expand_repeats.h"
#include "compiler/helpers/containers.h"
@ -11,131 +11,223 @@ using prepare_grammar::expand_repeats;
describe("expand_repeats", []() {
it("replaces repeat rules with pairs of recursive rules", [&]() {
SyntaxGrammar grammar{{
{ "rule0", repeat(i_token(0)) },
}, {}, {}, {}};
{
"rule0",
repeat(i_token(0)),
RuleEntryTypeNamed,
},
}, {}, {}};
auto match = expand_repeats(grammar);
AssertThat(match.rules, Equals(rule_list({
{ "rule0", choice({ i_aux_sym(0), blank() }) },
})));
AssertThat(match.aux_rules, Equals(rule_list({
{ "rule0_repeat0", seq({
i_token(0),
choice({ i_aux_sym(0), blank() }) }) },
AssertThat(match.rules, Equals(eq_vector<RuleEntry>({
{
"rule0",
choice({ i_sym(1), blank() }),
RuleEntryTypeNamed,
},
{
"rule0_repeat1",
seq({
i_token(0),
choice({ i_sym(1), blank() })
}),
RuleEntryTypeHidden
},
})));
});
it("replaces repeats inside of sequences", [&]() {
SyntaxGrammar grammar{{
{ "rule0", seq({
i_token(10),
repeat(i_token(11)) }) },
}, {}, {}, {}};
{
"rule0",
seq({
i_token(10),
repeat(i_token(11)),
}),
RuleEntryTypeNamed,
},
}, {}, {}};
auto match = expand_repeats(grammar);
AssertThat(match.rules, Equals(rule_list({
{ "rule0", seq({
i_token(10),
choice({ i_aux_sym(0), blank() }) }) },
})));
AssertThat(match.aux_rules, Equals(rule_list({
{ "rule0_repeat0", seq({
i_token(11),
choice({ i_aux_sym(0), blank() }) }) },
AssertThat(match.rules, Equals(eq_vector<RuleEntry>({
{
"rule0",
seq({
i_token(10),
choice({ i_sym(1), blank() })
}),
RuleEntryTypeNamed
},
{
"rule0_repeat1",
seq({
i_token(11),
choice({ i_sym(1), blank() })
}),
RuleEntryTypeHidden
},
})));
});
it("replaces repeats inside of choices", [&]() {
SyntaxGrammar grammar{{
{ "rule0", choice({ i_token(10), repeat(i_token(11)) }) },
}, {}, {}, {}};
{
"rule0",
choice({ i_token(10), repeat(i_token(11)) }),
RuleEntryTypeNamed
},
}, {}, {}};
auto match = expand_repeats(grammar);
AssertThat(match.rules, Equals(rule_list({
{ "rule0", choice({ i_token(10), i_aux_sym(0), blank() }) },
})));
AssertThat(match.aux_rules, Equals(rule_list({
{ "rule0_repeat0", seq({
i_token(11),
choice({ i_aux_sym(0), blank() }) }) },
AssertThat(match.rules, Equals(eq_vector<RuleEntry>({
{
"rule0",
choice({ i_token(10), i_sym(1), blank() }),
RuleEntryTypeNamed
},
{
"rule0_repeat1",
seq({
i_token(11),
choice({ i_sym(1), blank() }),
}),
RuleEntryTypeHidden
},
})));
});
it("does not create redundant auxiliary rules", [&]() {
SyntaxGrammar grammar{{
{ "rule0", choice({
seq({ i_token(1), repeat(i_token(4)) }),
seq({ i_token(2), repeat(i_token(4)) }) }) },
{ "rule1", seq({ i_token(3), repeat(i_token(4)) }) },
}, {}, {}, {}};
{
"rule0",
choice({
seq({ i_token(1), repeat(i_token(4)) }),
seq({ i_token(2), repeat(i_token(4)) }),
}),
RuleEntryTypeNamed
},
{
"rule1",
seq({ i_token(3), repeat(i_token(4)) }),
RuleEntryTypeNamed
},
}, {}, {}};
auto match = expand_repeats(grammar);
AssertThat(match.rules, Equals(rule_list({
{ "rule0", choice({
seq({ i_token(1), choice({ i_aux_sym(0), blank() }) }),
seq({ i_token(2), choice({ i_aux_sym(0), blank() }) }) }) },
{ "rule1", seq({ i_token(3), choice({ i_aux_sym(0), blank() }) }) },
})));
AssertThat(match.aux_rules, Equals(rule_list({
{ "rule0_repeat0", seq({
i_token(4),
choice({ i_aux_sym(0), blank() }) }) },
AssertThat(match.rules, Equals(eq_vector<RuleEntry>({
{
"rule0",
choice({
seq({ i_token(1), choice({ i_sym(2), blank() }) }),
seq({ i_token(2), choice({ i_sym(2), blank() }) }),
}),
RuleEntryTypeNamed
},
{
"rule1",
seq({ i_token(3), choice({ i_sym(2), blank() }) }),
RuleEntryTypeNamed
},
{
"rule0_repeat1",
seq({
i_token(4),
choice({ i_sym(2), blank() }),
}),
RuleEntryTypeHidden
},
})));
});
it("can replace multiple repeats in the same rule", [&]() {
SyntaxGrammar grammar{{
{ "rule0", seq({
repeat(i_token(10)),
repeat(i_token(11)) }) },
}, {}, {}, {}};
{
"rule0",
seq({
repeat(i_token(10)),
repeat(i_token(11)),
}),
RuleEntryTypeNamed
},
}, {}, {}};
auto match = expand_repeats(grammar);
AssertThat(match.rules, Equals(rule_list({
{ "rule0", seq({
choice({ i_aux_sym(0), blank() }),
choice({ i_aux_sym(1), blank() }) }) },
})));
AssertThat(match.aux_rules, Equals(rule_list({
{ "rule0_repeat0", seq({
i_token(10),
choice({ i_aux_sym(0), blank() }) }) },
{ "rule0_repeat1", seq({
i_token(11),
choice({ i_aux_sym(1), blank() }) }) },
AssertThat(match.rules, Equals(eq_vector<RuleEntry>({
{
"rule0",
seq({
choice({ i_sym(1), blank() }),
choice({ i_sym(2), blank() }),
}),
RuleEntryTypeNamed
},
{
"rule0_repeat1",
seq({
i_token(10),
choice({ i_sym(1), blank() }),
}),
RuleEntryTypeHidden
},
{
"rule0_repeat2",
seq({
i_token(11),
choice({ i_sym(2), blank() }),
}),
RuleEntryTypeHidden
},
})));
});
it("can replace repeats in multiple rules", [&]() {
SyntaxGrammar grammar{{
{ "rule0", repeat(i_token(10)) },
{ "rule1", repeat(i_token(11)) },
}, {}, {}, {}};
{
"rule0",
repeat(i_token(10)),
RuleEntryTypeNamed,
},
{
"rule1",
repeat(i_token(11)),
RuleEntryTypeNamed,
},
}, {}, {}};
auto match = expand_repeats(grammar);
AssertThat(match.rules, Equals(rule_list({
{ "rule0", choice({ i_aux_sym(0), blank() }) },
{ "rule1", choice({ i_aux_sym(1), blank() }) },
})));
AssertThat(match.aux_rules, Equals(rule_list({
{ "rule0_repeat0", seq({
i_token(10),
choice({ i_aux_sym(0), blank() }) }) },
{ "rule1_repeat0", seq({
i_token(11),
choice({ i_aux_sym(1), blank() }) }) },
AssertThat(match.rules, Equals(eq_vector<RuleEntry>({
{
"rule0",
choice({ i_sym(2), blank() }),
RuleEntryTypeNamed
},
{
"rule1",
choice({ i_sym(3), blank() }),
RuleEntryTypeNamed
},
{
"rule0_repeat1",
seq({
i_token(10),
choice({ i_sym(2), blank() }),
}),
RuleEntryTypeHidden
},
{
"rule1_repeat1",
seq({
i_token(11),
choice({ i_sym(3), blank() })
}),
RuleEntryTypeHidden
},
})));
});
});

View file

@ -1,5 +1,5 @@
#include "compiler/compiler_spec_helper.h"
#include "compiler/lexical_grammar.h"
#include "compiler/prepared_grammar.h"
#include "compiler/helpers/containers.h"
#include "compiler/prepare_grammar/expand_tokens.h"
@ -12,36 +12,64 @@ describe("expand_tokens", []() {
describe("string rules", [&]() {
it("replaces strings with sequences of character sets", [&]() {
LexicalGrammar grammar{{
{ "rule_A", seq({
i_sym(10),
str("xyz"),
i_sym(11) }) },
}, {}, {}};
{
"rule_A",
seq({
i_sym(10),
str("xyz"),
i_sym(11),
}),
RuleEntryTypeNamed
},
}, {}};
auto result = expand_tokens(grammar);
AssertThat(result.second, Equals((const GrammarError *)nullptr));
AssertThat(result.first.rules, Equals(rule_list({
{ "rule_A", seq({
i_sym(10),
token(prec(1, seq({ character({ 'x' }), character({ 'y' }), character({ 'z' }) }))),
i_sym(11) }) },
AssertThat(result.first.rules, Equals(eq_vector<RuleEntry>({
{
"rule_A",
seq({
i_sym(10),
metadata(seq({
character({ 'x' }),
character({ 'y' }),
character({ 'z' }),
}), {
{PRECEDENCE, 1},
{IS_TOKEN, 1},
}),
i_sym(11),
}),
RuleEntryTypeNamed
},
})));
});
it("handles strings containing non-ASCII UTF8 characters", [&]() {
LexicalGrammar grammar{{
// α β
{ "rule_A", str("\u03B1 \u03B2") },
}, {}, {}};
{
"rule_A",
str("\u03B1 \u03B2"), // α β
RuleEntryTypeNamed
},
}, {}};
auto result = expand_tokens(grammar);
AssertThat(result.first.rules, Equals(rule_list({
{ "rule_A", token(prec(1, seq({
character({ 945 }),
character({ ' ' }),
character({ 946 }) }))) }
AssertThat(result.first.rules, Equals(eq_vector<RuleEntry>({
{
"rule_A",
metadata(seq({
character({ 945 }),
character({ ' ' }),
character({ 946 }),
}), {
{PRECEDENCE, 1},
{IS_TOKEN, 1},
}),
RuleEntryTypeNamed
}
})));
});
});
@ -49,43 +77,65 @@ describe("expand_tokens", []() {
describe("regexp rules", [&]() {
it("replaces regexps with the equivalent rule tree", [&]() {
LexicalGrammar grammar{{
{ "rule_A", seq({
i_sym(10),
pattern("x*"),
i_sym(11) }) },
}, {}, {}};
{
"rule_A",
seq({
i_sym(10),
pattern("x*"),
i_sym(11),
}),
RuleEntryTypeNamed
},
}, {}};
auto result = expand_tokens(grammar);
AssertThat(result.second, Equals((const GrammarError *)nullptr));
AssertThat(result.first.rules, Equals(rule_list({
{ "rule_A", seq({
i_sym(10),
repeat(character({ 'x' })),
i_sym(11) }) },
AssertThat(result.first.rules, Equals(eq_vector<RuleEntry>({
{
"rule_A",
seq({
i_sym(10),
repeat(character({ 'x' })),
i_sym(11),
}),
RuleEntryTypeNamed
},
})));
});
it("handles regexps containing non-ASCII UTF8 characters", [&]() {
LexicalGrammar grammar{{
// [^α-δ]
{ "rule_A", pattern("[^\u03B1-\u03B4]*") },
}, {}, {}};
{
"rule_A",
pattern("[^\u03B1-\u03B4]*"), // [^α-δ]
RuleEntryTypeNamed
},
}, {}};
auto result = expand_tokens(grammar);
AssertThat(result.first.rules, Equals(rule_list({
{ "rule_A", repeat(character({ 945, 946, 947, 948 }, false)) }
AssertThat(result.first.rules, Equals(eq_vector<RuleEntry>({
{
"rule_A",
repeat(character({ 945, 946, 947, 948 }, false)),
RuleEntryTypeNamed
}
})));
});
it("returns an error when the grammar contains an invalid regex", [&]() {
LexicalGrammar grammar{{
{ "rule_A", seq({
pattern("("),
str("xyz"),
pattern("[") }) },
}, {}, {}};
{
"rule_A",
seq({
pattern("("),
str("xyz"),
pattern("["),
}),
RuleEntryTypeNamed
},
}, {}};
auto result = expand_tokens(grammar);

View file

@ -1,6 +1,5 @@
#include "compiler/compiler_spec_helper.h"
#include "compiler/lexical_grammar.h"
#include "compiler/syntax_grammar.h"
#include "compiler/prepared_grammar.h"
#include "compiler/prepare_grammar/interned_grammar.h"
#include "compiler/prepare_grammar/extract_tokens.h"
#include "compiler/helpers/containers.h"
@ -12,271 +11,301 @@ using prepare_grammar::extract_tokens;
using prepare_grammar::InternedGrammar;
describe("extract_tokens", []() {
it("moves string rules into the lexical grammar", [&]() {
it("moves strings, patterns, and sub-rules marked as tokens into the lexical grammar", [&]() {
auto result = extract_tokens(InternedGrammar{{
{ "rule_A", seq({ str("ab"), i_sym(0) }) }
{
"rule_A",
repeat(seq({
str("ab"),
pattern("cd*"),
choice({
i_sym(1),
i_sym(2),
token(repeat(choice({ str("ef"), str("gh") }))),
}),
})),
},
{
"rule_B",
pattern("ij+"),
},
{
"rule_C",
choice({ str("kl"), blank() }),
},
{
"rule_D",
repeat(i_sym(3))
}
}, {}, {}});
AssertThat(get<0>(result).rules, Equals(rule_list({
{ "rule_A", seq({ i_aux_token(0), i_sym(0) }) }
SyntaxGrammar &syntax_grammar = get<0>(result);
LexicalGrammar &lexical_grammar = get<1>(result);
const GrammarError *error = get<2>(result);
AssertThat(error, Equals<const GrammarError *>(nullptr));
AssertThat(syntax_grammar.rules, Equals(eq_vector<RuleEntry>({
{
"rule_A",
repeat(seq({
// This string is now the first token in the lexical grammar.
i_token(0),
// This pattern is now the second rule in the lexical grammar.
i_token(1),
choice({
// Rule 1, which this symbol pointed to, has been moved to the
// lexical grammar.
i_token(3),
// This symbol's index has been decremented, because a previous rule
// was moved to the lexical grammar.
i_sym(1),
// This token rule is now the third rule in the lexical grammar.
i_token(2),
}),
})),
RuleEntryTypeNamed,
},
{
"rule_C",
choice({ i_token(4), blank() }),
RuleEntryTypeNamed,
},
{
"rule_D",
repeat(i_sym(2)),
RuleEntryTypeNamed,
}
})));
AssertThat(get<0>(result).aux_rules, IsEmpty())
AssertThat(get<1>(result).rules, IsEmpty())
AssertThat(get<1>(result).aux_rules, Equals(rule_list({
{ "'ab'", str("ab") },
AssertThat(lexical_grammar.rules, Equals(eq_vector<RuleEntry>({
// Strings become anonymous rules.
{
"ab",
str("ab"),
RuleEntryTypeAnonymous,
},
// Patterns become hidden rules.
{
"/cd*/",
pattern("cd*"),
RuleEntryTypeHidden,
},
// Rules marked as tokens become hidden rules.
{
"/(ef|gh)*/",
repeat(choice({ str("ef"), str("gh") })),
RuleEntryTypeHidden,
},
// This named rule was moved wholesale to the lexical grammar.
{
"rule_B",
pattern("ij+"),
RuleEntryTypeNamed,
},
// Strings become anonymous rules.
{
"kl",
str("kl"),
RuleEntryTypeAnonymous,
},
})));
});
it("moves pattern rules into the lexical grammar", [&]() {
auto result = extract_tokens(InternedGrammar{{
{ "rule_A", seq({ pattern("a+"), i_sym(0) }) }
}, {}, {}});
AssertThat(get<0>(result).rules, Equals(rule_list({
{ "rule_A", seq({ i_aux_token(0), i_sym(0) }) }
})));
AssertThat(get<0>(result).aux_rules, IsEmpty())
AssertThat(get<1>(result).rules, IsEmpty())
AssertThat(get<1>(result).aux_rules, Equals(rule_list({
{ "/a+/", pattern("a+") },
})));
});
it("moves other rules marked as tokens into the lexical grammar", [&]() {
auto result = extract_tokens(InternedGrammar{{
{ "rule_A", seq({
token(seq({ pattern("."), choice({ str("a"), str("b") }) })),
i_sym(0) }) }
}, {}, {}});
AssertThat(get<0>(result).rules, Equals(rule_list({
{ "rule_A", seq({ i_aux_token(0), i_sym(0) }) }
})));
AssertThat(get<0>(result).aux_rules, IsEmpty())
AssertThat(get<1>(result).rules, IsEmpty())
AssertThat(get<1>(result).aux_rules, Equals(rule_list({
{ "(seq /./ (choice 'a' 'b'))", token(seq({ pattern("."), choice({ str("a"), str("b") }) })) },
})));
});
it("does not move blank rules", [&]() {
auto result = extract_tokens(InternedGrammar{{
{ "rule_A", choice({ i_sym(0), blank() }) },
}, {}, {}});
AssertThat(get<0>(result).rules, Equals(rule_list({
{ "rule_A", choice({ i_sym(0), blank() }) },
})));
AssertThat(get<0>(result).aux_rules, IsEmpty())
AssertThat(get<1>(result).rules, IsEmpty())
AssertThat(get<1>(result).aux_rules, IsEmpty())
});
it("does not create duplicate tokens in the lexical grammar", [&]() {
auto result = extract_tokens(InternedGrammar{{
{ "rule_A", seq({ str("ab"), i_sym(0), str("ab") }) },
{
"rule_A",
seq({
str("ab"),
i_sym(0),
str("ab"),
})
},
}, {}, {}});
AssertThat(get<0>(result).rules, Equals(rule_list({
{ "rule_A", seq({ i_aux_token(0), i_sym(0), i_aux_token(0) }) }
})));
AssertThat(get<0>(result).aux_rules, IsEmpty())
SyntaxGrammar &syntax_grammar = get<0>(result);
LexicalGrammar &lexical_grammar = get<1>(result);
AssertThat(get<1>(result).rules, IsEmpty())
AssertThat(get<1>(result).aux_rules, Equals(rule_list({
{ "'ab'", str("ab") },
AssertThat(syntax_grammar.rules, Equals(eq_vector<RuleEntry>({
{
"rule_A",
seq({ i_token(0), i_sym(0), i_token(0) }),
RuleEntryTypeNamed
}
})));
AssertThat(lexical_grammar.rules, Equals(eq_vector<RuleEntry>({
{
"ab",
str("ab"),
RuleEntryTypeAnonymous
},
})))
});
it("updates the grammar's expected conflict symbols", [&]() {
auto result = extract_tokens(InternedGrammar{
it("does not move entire rules into the lexical grammar if their content is used elsewhere in the grammar", [&]() {
auto result = extract_tokens(InternedGrammar{{
{
{ "rule_A", str("ok") },
{ "rule_B", repeat(i_sym(0)) },
{ "rule_C", repeat(seq({ i_sym(0), i_sym(0) })) },
"rule_A",
seq({ i_sym(1), str("ab") })
},
{ str(" ") },
{ { Symbol(1), Symbol(2) } }
});
{
"rule_B",
str("cd")
},
{
"rule_C",
seq({ str("ef"), str("cd") })
},
}, {}, {}});
AssertThat(get<0>(result).rules.size(), Equals<size_t>(2));
AssertThat(get<1>(result).rules.size(), Equals<size_t>(1));
AssertThat(get<0>(result).expected_conflicts, Equals(set<set<Symbol>>({
SyntaxGrammar &syntax_grammar = get<0>(result);
LexicalGrammar &lexical_grammar = get<1>(result);
AssertThat(syntax_grammar.rules, Equals(eq_vector<RuleEntry>({
{
"rule_A",
seq({ i_sym(1), i_token(0) }),
RuleEntryTypeNamed
},
{
"rule_B",
i_token(1),
RuleEntryTypeNamed
},
{
"rule_C",
seq({ i_token(2), i_token(1) }),
RuleEntryTypeNamed
},
})));
AssertThat(lexical_grammar.rules, Equals(eq_vector<RuleEntry>({
{
"ab",
str("ab"),
RuleEntryTypeAnonymous
},
{
"cd",
str("cd"),
RuleEntryTypeAnonymous
},
{
"ef",
str("ef"),
RuleEntryTypeAnonymous
},
})));
});
it("renumbers the grammar's expected conflict symbols based on any moved rules", [&]() {
auto result = extract_tokens(InternedGrammar{{
{
"rule_A",
str("ok")
},
{
"rule_B",
repeat(i_sym(0))
},
{
"rule_C",
repeat(seq({ i_sym(0), i_sym(0) }))
},
}, { str(" ") }, { { Symbol(1), Symbol(2) } }});
SyntaxGrammar &syntax_grammar = get<0>(result);
AssertThat(syntax_grammar.rules.size(), Equals<size_t>(2));
AssertThat(syntax_grammar.expected_conflicts, Equals(set<set<Symbol>>({
{ Symbol(0), Symbol(1) },
})));
});
describe("when an entire grammar rule is a token", [&]() {
it("moves the rule the lexical grammar and updates referencing symbols", [&]() {
auto result = extract_tokens(InternedGrammar{{
{ "rule_A", i_sym(1) },
{ "rule_B", pattern("a|b") },
{ "rule_C", token(seq({ str("a"), str("b") })) },
}, {}, {}});
AssertThat(get<0>(result).rules, Equals(rule_list({
{ "rule_A", i_token(0) }
})));
AssertThat(get<0>(result).aux_rules, IsEmpty());
AssertThat(get<1>(result).rules, Equals(rule_list({
{ "rule_B", pattern("a|b") },
{ "rule_C", token(seq({ str("a"), str("b") })) },
})));
// TODO put back
// AssertThat(get<1>(result).aux_rules, IsEmpty());
});
it("updates symbols whose indices need to change due to deleted rules", [&]() {
auto result = extract_tokens(InternedGrammar{{
{ "rule_A", str("ab") },
{ "rule_B", i_sym(0) },
{ "rule_C", i_sym(1) },
}, {}, {}});
AssertThat(get<0>(result).rules, Equals(rule_list({
{ "rule_B", i_token(0) },
{ "rule_C", i_sym(0) },
})));
AssertThat(get<0>(result).aux_rules, IsEmpty());
AssertThat(get<1>(result).rules, Equals(rule_list({
{ "rule_A", str("ab") },
})));
// TODO put back
// AssertThat(get<1>(result).aux_rules, IsEmpty());
});
it("does not move the rule if its content is used elsewhere in the grammar", [&]() {
auto result = extract_tokens(InternedGrammar{{
{ "rule_A", seq({ i_sym(1), str("ab") }) },
{ "rule_B", str("cd") },
{ "rule_C", seq({ str("ef"), str("cd") }) },
}, {}, {}});
AssertThat(get<0>(result).rules, Equals(rule_list({
{ "rule_A", seq({ i_sym(1), i_aux_token(0) }) },
{ "rule_B", i_aux_token(1) },
{ "rule_C", seq({ i_aux_token(2), i_aux_token(1) }) },
})));
AssertThat(get<0>(result).aux_rules, IsEmpty());
AssertThat(get<1>(result).rules, IsEmpty())
AssertThat(get<1>(result).aux_rules, Equals(rule_list({
{ "'ab'", str("ab") },
{ "'cd'", str("cd") },
{ "'ef'", str("ef") },
})));
});
});
describe("handling ubiquitous tokens", [&]() {
describe("ubiquitous tokens that are not symbols", [&]() {
it("adds them to the lexical grammar's separators", [&]() {
auto result = extract_tokens(InternedGrammar{{
{ "rule_A", str("x") },
}, {
pattern("\\s+"),
str("y"),
}, {}});
it("adds inline ubiquitous tokens to the lexical grammar's separators", [&]() {
auto result = extract_tokens(InternedGrammar{{
{ "rule_A", str("x") },
}, {
pattern("\\s+"),
str("y"),
}, {}});
AssertThat(get<2>(result), Equals<const GrammarError *>(nullptr));
AssertThat(get<2>(result), Equals<const GrammarError *>(nullptr));
AssertThat(get<1>(result).separators, Equals(rule_vector({
pattern("\\s+"),
str("y"),
})));
AssertThat(get<1>(result).separators, Equals(rule_vector({
pattern("\\s+"),
str("y"),
})));
AssertThat(get<0>(result).ubiquitous_tokens, IsEmpty());
});
AssertThat(get<0>(result).ubiquitous_tokens, IsEmpty());
});
describe("ubiquitous tokens that point to moved rules", [&]() {
it("updates them according to the new symbol numbers", [&]() {
auto result = extract_tokens(InternedGrammar{ {
{ "rule_A", seq({ str("w"), i_sym(1) }) },
{ "rule_B", str("x") },
{ "rule_C", str("y") },
}, {
i_sym(2),
}, {}});
it("updates ubiquitous symbols according to the new symbol numbers", [&]() {
auto result = extract_tokens(InternedGrammar{ {
{ "rule_A", seq({ str("w"), str("x"), i_sym(1) }) },
{ "rule_B", str("y") },
{ "rule_C", str("z") },
}, {
i_sym(2),
}, {}});
AssertThat(get<2>(result), Equals<const GrammarError *>(nullptr));
AssertThat(get<2>(result), Equals<const GrammarError *>(nullptr));
AssertThat(get<0>(result).ubiquitous_tokens, Equals(set<Symbol>({
{ Symbol(1, SymbolOptionToken) },
})));
AssertThat(get<0>(result).ubiquitous_tokens, Equals(set<Symbol>({
{ Symbol(3, true) },
})));
AssertThat(get<1>(result).separators, IsEmpty());
});
AssertThat(get<1>(result).separators, IsEmpty());
});
describe("ubiquitous tokens that are visible", [&]() {
it("preserves them in the syntactic grammar", [&]() {
auto result = extract_tokens(InternedGrammar{{
{ "rule_A", str("ab") },
{ "rule_B", str("bc") },
}, { i_sym(1) }, {}});
it("returns an error if any ubiquitous tokens are non-token symbols", [&]() {
auto result = extract_tokens(InternedGrammar{{
{
"rule_A",
seq({ str("x"), i_sym(1) }),
},
{
"rule_B",
seq({ str("y"), str("z") })
},
}, { i_sym(1) }, {}});
AssertThat(get<2>(result), Equals<const GrammarError *>(nullptr));
AssertThat(get<0>(result).ubiquitous_tokens, Equals(set<Symbol>({
Symbol(1, SymbolOptionToken)
})));
AssertThat(get<1>(result).separators, IsEmpty());
});
AssertThat(get<2>(result), !Equals<const GrammarError *>(nullptr));
AssertThat(get<2>(result), EqualsPointer(
new GrammarError(GrammarErrorTypeInvalidUbiquitousToken,
"Not a token: rule_B")));
});
describe("ubiquitous tokens that are used in other grammar rules", [&]() {
it("preserves them in the syntactic grammar", [&]() {
auto result = extract_tokens(InternedGrammar{{
{ "rule_A", seq({ i_sym(1), str("ab") }) },
{ "_rule_B", str("bc") },
}, { i_sym(1) }, {}});
it("returns an error if any ubiquitous tokens are non-token rules", [&]() {
auto result = extract_tokens(InternedGrammar{{
{
"rule_A",
str("x")
},
{
"rule_B",
str("y")
},
}, { choice({ i_sym(1), blank() }) }, {}});
AssertThat(get<2>(result), Equals<const GrammarError *>(nullptr));
AssertThat(get<0>(result).ubiquitous_tokens, Equals(set<Symbol>({
Symbol(0, SymbolOptionToken),
})));
AssertThat(get<1>(result).separators, IsEmpty());
});
});
describe("ubiquitous tokens that are non-token symbols", [&]() {
it("returns an error", [&]() {
auto result = extract_tokens(InternedGrammar{{
{ "rule_A", seq({ str("x"), i_sym(1) }), },
{ "rule_B", seq({ str("y"), str("z") }) },
}, { i_sym(1) }, {}});
AssertThat(get<2>(result), !Equals<const GrammarError *>(nullptr));
AssertThat(get<2>(result), EqualsPointer(
new GrammarError(GrammarErrorTypeInvalidUbiquitousToken,
"Not a token: rule_B")));
});
});
describe("ubiquitous tokens that are not symbols", [&]() {
it("returns an error", [&]() {
auto result = extract_tokens(InternedGrammar{{
{ "rule_A", str("x") },
{ "rule_B", str("y") },
}, { choice({ i_sym(1), blank() }) }, {}});
AssertThat(get<2>(result), !Equals<const GrammarError *>(nullptr));
AssertThat(get<2>(result), EqualsPointer(
new GrammarError(GrammarErrorTypeInvalidUbiquitousToken,
"Not a token: (choice (sym 1) (blank))")));
});
AssertThat(get<2>(result), !Equals<const GrammarError *>(nullptr));
AssertThat(get<2>(result), EqualsPointer(
new GrammarError(GrammarErrorTypeInvalidUbiquitousToken,
"Not a token: (choice (sym 1) (blank))")));
});
});
});