Merge branch 'flatten-rules-into-productions'

This branch had diverged considerably, so merging it required changing a lot
of code.

Conflicts:
	project.gyp
	spec/compiler/build_tables/action_takes_precedence_spec.cc
	spec/compiler/build_tables/build_conflict_spec.cc
	spec/compiler/build_tables/build_parse_table_spec.cc
	spec/compiler/build_tables/first_symbols_spec.cc
	spec/compiler/build_tables/item_set_closure_spec.cc
	spec/compiler/build_tables/item_set_transitions_spec.cc
	spec/compiler/build_tables/rule_can_be_blank_spec.cc
	spec/compiler/helpers/containers.h
	spec/compiler/prepare_grammar/expand_repeats_spec.cc
	spec/compiler/prepare_grammar/extract_tokens_spec.cc
	src/compiler/build_tables/action_takes_precedence.h
	src/compiler/build_tables/build_parse_table.cc
	src/compiler/build_tables/first_symbols.cc
	src/compiler/build_tables/first_symbols.h
	src/compiler/build_tables/item_set_closure.cc
	src/compiler/build_tables/item_set_transitions.cc
	src/compiler/build_tables/parse_item.cc
	src/compiler/build_tables/parse_item.h
	src/compiler/build_tables/rule_can_be_blank.cc
	src/compiler/build_tables/rule_can_be_blank.h
	src/compiler/prepare_grammar/expand_repeats.cc
	src/compiler/prepare_grammar/extract_tokens.cc
	src/compiler/prepare_grammar/extract_tokens.h
	src/compiler/prepare_grammar/prepare_grammar.cc
	src/compiler/rules/built_in_symbols.cc
	src/compiler/rules/built_in_symbols.h
	src/compiler/syntax_grammar.cc
	src/compiler/syntax_grammar.h
This commit is contained in:
Max Brunsfeld 2015-10-01 17:10:39 -07:00
commit ebc52f109d
71 changed files with 30354 additions and 33188 deletions

View file

@ -13,7 +13,6 @@
'src/compiler/build_tables/build_lex_table.cc',
'src/compiler/build_tables/build_parse_table.cc',
'src/compiler/build_tables/build_tables.cc',
'src/compiler/build_tables/first_symbols.cc',
'src/compiler/build_tables/get_completion_status.cc',
'src/compiler/build_tables/get_metadata.cc',
'src/compiler/build_tables/item.cc',
@ -32,13 +31,17 @@
'src/compiler/parse_table.cc',
'src/compiler/prepare_grammar/expand_repeats.cc',
'src/compiler/prepare_grammar/expand_tokens.cc',
'src/compiler/prepare_grammar/extract_choices.cc',
'src/compiler/prepare_grammar/extract_tokens.cc',
'src/compiler/prepare_grammar/flatten_grammar.cc',
'src/compiler/prepare_grammar/intern_symbols.cc',
'src/compiler/prepare_grammar/is_token.cc',
'src/compiler/prepare_grammar/parse_regex.cc',
'src/compiler/prepare_grammar/prepare_grammar.cc',
'src/compiler/prepare_grammar/token_description.cc',
'src/compiler/rule.cc',
'src/compiler/syntax_grammar.cc',
'src/compiler/variable.cc',
'src/compiler/rules/blank.cc',
'src/compiler/rules/built_in_symbols.cc',
'src/compiler/rules/character_range.cc',

View file

@ -1,117 +0,0 @@
#include "compiler/compiler_spec_helper.h"
#include "compiler/prepared_grammar.h"
#include "compiler/build_tables/first_symbols.h"
#include "compiler/rules/metadata.h"
using namespace build_tables;
using namespace rules;
START_TEST
describe("first_symbols", []() {
SyntaxGrammar null_grammar;
describe("for a sequence AB", [&]() {
it("ignores B when A cannot be blank", [&]() {
auto rule = seq({ i_token(0), i_token(1) });
AssertThat(first_symbols(rule, null_grammar), Equals(set<Symbol>({
Symbol(0, true),
})));
});
it("includes first_symbols(B) when A can be blank", [&]() {
auto rule = seq({
choice({
i_token(0),
blank() }),
i_token(1) });
AssertThat(first_symbols(rule, null_grammar), Equals(set<Symbol>({
Symbol(0, true),
Symbol(1, true)
})));
});
it("includes first_symbols(A's right hand side) when A is a non-terminal", [&]() {
auto rule = choice({
seq({
i_token(0),
i_token(1) }),
i_sym(0) });
SyntaxGrammar grammar{{
{
"rule0",
seq({
i_token(2),
i_token(3),
i_token(4),
}),
RuleEntryTypeNamed
}
}, {}, {}};
AssertThat(first_symbols(rule, grammar), Equals(set<Symbol>({
Symbol(0),
Symbol(0, true),
Symbol(2, true),
})));
});
it("includes first_symbols(B) when A is a non-terminal and its expansion can be blank", [&]() {
auto rule = seq({
i_sym(0),
i_token(1) });
SyntaxGrammar grammar{{
{
"rule0",
choice({
i_token(0),
blank(),
}),
RuleEntryTypeNamed
},
}, {}, {}};
AssertThat(first_symbols(rule, grammar), Equals(set<Symbol>({
Symbol(0),
Symbol(0, true),
Symbol(1, true),
})));
});
});
describe("when there are left-recursive rules", [&]() {
it("terminates", [&]() {
SyntaxGrammar grammar{{
{
"rule0",
choice({
seq({ i_sym(0), i_token(10) }),
i_token(11),
}),
RuleEntryTypeNamed
},
}, {}, {}};
auto rule = i_sym(0);
AssertThat(first_symbols(rule, grammar), Equals(set<Symbol>({
Symbol(0),
Symbol(11, true)
})));
});
});
it("ignores metadata rules", [&]() {
auto rule = make_shared<Metadata>(i_token(3), map<rules::MetadataKey, int>());
AssertThat(first_symbols(rule, null_grammar), Equals(set<Symbol>({
Symbol(3, true),
})));
});
});
END_TEST

View file

@ -1,7 +1,8 @@
#include "compiler/compiler_spec_helper.h"
#include "compiler/prepared_grammar.h"
#include "compiler/syntax_grammar.h"
#include "compiler/build_tables/item_set_closure.h"
#include "compiler/build_tables/item_set_transitions.h"
#include "compiler/rules/built_in_symbols.h"
using namespace build_tables;
using namespace rules;
@ -10,39 +11,55 @@ START_TEST
describe("item_set_closure", []() {
SyntaxGrammar grammar{{
{
"E",
seq({
i_sym(1),
i_token(11),
SyntaxVariable("rule0", VariableTypeNamed, {
Production({
{Symbol(1), 0, AssociativityNone, 100},
{Symbol(11, true), 0, AssociativityNone, 101},
}),
RuleEntryTypeNamed,
},
{
"T",
seq({
i_token(12),
i_token(13),
}),
SyntaxVariable("rule1", VariableTypeNamed, {
Production({
{Symbol(12, true), 0, AssociativityNone, 102},
{Symbol(13, true), 0, AssociativityNone, 103},
}),
RuleEntryTypeNamed,
},
Production({
{Symbol(2), 0, AssociativityNone, 104},
})
}),
SyntaxVariable("rule2", VariableTypeNamed, {
Production({
{Symbol(14, true), 0, AssociativityNone, 105},
{Symbol(15, true), 0, AssociativityNone, 106},
})
}),
}, {}, {}};
it("adds items at the beginnings of referenced rules", [&]() {
ParseItemSet item_set = item_set_closure(
ParseItem(Symbol(0), grammar.rules[0].rule, {}),
set<Symbol>({ Symbol(10, true) }),
grammar
);
ParseItemSet item_set({
{
ParseItem(Symbol(0), 0, 0, 100),
set<Symbol>({ Symbol(10, true) }),
}
});
item_set_closure(&item_set, grammar);
AssertThat(item_set, Equals(ParseItemSet({
{
ParseItem(Symbol(1), grammar.rules[1].rule, {}),
set<Symbol>({ Symbol(11, true) }),
ParseItem(Symbol(0), 0, 0, 100),
set<Symbol>({ Symbol(10, true) })
},
{
ParseItem(Symbol(0), grammar.rules[0].rule, {}),
set<Symbol>({ Symbol(10, true) }),
ParseItem(Symbol(1), 0, 0, 102),
set<Symbol>({ Symbol(11, true) })
},
{
ParseItem(Symbol(1), 1, 0, 104),
set<Symbol>({ Symbol(11, true) })
},
{
ParseItem(Symbol(2), 0, 0, 105),
set<Symbol>({ Symbol(11, true) })
},
})));
});

View file

@ -1,6 +1,6 @@
#include "compiler/compiler_spec_helper.h"
#include "compiler/build_tables/item_set_transitions.h"
#include "compiler/prepared_grammar.h"
#include "compiler/syntax_grammar.h"
#include "compiler/helpers/rule_helpers.h"
using namespace rules;
@ -17,63 +17,67 @@ describe("char_transitions(LexItemSet)", []() {
});
AssertThat(char_transitions(set1), Equals(map<CharacterSet, LexItemSet>({
{
CharacterSet().include('a', 'd'),
LexItemSet({
LexItem(Symbol(1), blank()),
})
},
{
CharacterSet().include('e', 'f'),
LexItemSet({
LexItem(Symbol(1), blank()),
LexItem(Symbol(2), blank()),
})
},
{
{
CharacterSet().include('a', 'd'),
LexItemSet({
LexItem(Symbol(1), blank()),
})
},
{
CharacterSet().include('e', 'f'),
LexItemSet({
LexItem(Symbol(1), blank()),
LexItem(Symbol(2), blank()),
})
},
{
CharacterSet().include('g', 'x'),
LexItemSet({
LexItem(Symbol(2), blank()),
})
},
LexItemSet({
LexItem(Symbol(2), blank()),
})
},
})));
});
});
});
describe("sym_transitions(ParseItemSet, SyntaxGrammar)", [&]() {
SyntaxGrammar grammar{{
{
"A",
blank(),
RuleEntryTypeNamed
},
{
"B",
i_token(21),
RuleEntryTypeNamed
},
}, {}, {}};
describe("sym_transitions(ParseItemSet, InitialSyntaxGrammar)", [&]() {
it("computes the closure of the new item sets", [&]() {
SyntaxGrammar grammar{{
SyntaxVariable("A", VariableTypeNamed, {
Production({
{Symbol(11, true), 0, AssociativityNone, 101},
{Symbol(12, true), 0, AssociativityNone, 102},
{Symbol(13, true), 0, AssociativityNone, 103},
{Symbol(1), 0, AssociativityNone, 104},
{Symbol(14, true), 0, AssociativityNone, 105},
})
}),
SyntaxVariable("B", VariableTypeNamed, {
Production({
{Symbol(15, true), 0, AssociativityNone, 106},
})
})
}, {}, {}};
ParseItemSet set1({
{
ParseItem(Symbol(0), seq({ i_token(22), i_sym(1) }), { Symbol(101) }),
set<Symbol>({ Symbol(23, true) })
},
ParseItem(Symbol(0), 0, 2, 103),
set<Symbol>({ Symbol(16, true) })
}
});
AssertThat(sym_transitions(set1, grammar), Equals(map<Symbol, ParseItemSet>({
{
Symbol(22, true),
Symbol(13, true),
ParseItemSet({
{
ParseItem(Symbol(0), i_sym(1), { Symbol(101), Symbol(22) }),
set<Symbol>({ Symbol(23, true) }),
ParseItem(Symbol(0), 0, 3, 104),
set<Symbol>({ Symbol(16, true) })
},
{
ParseItem(Symbol(1), i_token(21), {}),
set<Symbol>({ Symbol(23, true) })
ParseItem(Symbol(1), 0, 0, 106),
set<Symbol>({ Symbol(14, true) })
},
})
},

View file

@ -2,7 +2,6 @@
#include "compiler/rules/built_in_symbols.h"
#include "compiler/parse_table.h"
#include "compiler/build_tables/lex_conflict_manager.h"
#include "compiler/prepared_grammar.h"
using namespace rules;
using namespace build_tables;
@ -11,16 +10,8 @@ START_TEST
describe("LexConflictManager", []() {
LexicalGrammar lexical_grammar{{
{
"other_token",
pattern("[a-b]"),
RuleEntryTypeNamed
},
{
"lookahead_token",
pattern("[c-d]"),
RuleEntryTypeNamed
},
Variable("other_token", VariableTypeNamed, pattern("[a-b]")),
Variable("lookahead_token", VariableTypeNamed, pattern("[c-d]"))
}, {}};
LexConflictManager conflict_manager(lexical_grammar);

View file

@ -2,7 +2,6 @@
#include "compiler/rules/built_in_symbols.h"
#include "compiler/parse_table.h"
#include "compiler/build_tables/parse_conflict_manager.h"
#include "compiler/prepared_grammar.h"
using namespace rules;
using namespace build_tables;
@ -11,31 +10,11 @@ START_TEST
describe("ParseConflictManager", []() {
SyntaxGrammar syntax_grammar{{
{
"in_progress_rule1",
i_token(0),
RuleEntryTypeNamed,
},
{
"in_progress_rule2",
i_token(0),
RuleEntryTypeNamed,
},
{
"reduced_rule",
i_token(0),
RuleEntryTypeNamed,
},
{
"other_rule1",
i_token(0),
RuleEntryTypeNamed,
},
{
"other_rule2",
i_token(0),
RuleEntryTypeNamed,
},
SyntaxVariable("in_progress_rule1", VariableTypeNamed, { Production() }),
SyntaxVariable("in_progress_rule2", VariableTypeNamed, { Production() }),
SyntaxVariable("reduced_rule", VariableTypeNamed, { Production() }),
SyntaxVariable("other_rule1", VariableTypeNamed, { Production() }),
SyntaxVariable("other_rule2", VariableTypeNamed, { Production() }),
}, { Symbol(2, true) }, {}};
pair<bool, ConflictType> result;

View file

@ -1,7 +1,6 @@
#include "compiler/compiler_spec_helper.h"
#include "compiler/build_tables/rule_can_be_blank.h"
#include "compiler/rules/metadata.h"
#include "compiler/prepared_grammar.h"
using namespace rules;
using build_tables::rule_can_be_blank;
@ -54,37 +53,6 @@ describe("rule_can_be_blank", [&]() {
rule = make_shared<rules::Metadata>(sym("one"), map<rules::MetadataKey, int>());
AssertThat(rule_can_be_blank(rule), IsFalse());
});
describe("checking recursively (by expanding non-terminals)", [&]() {
SyntaxGrammar grammar{{
{
"A",
choice({
seq({ i_sym(0), i_token(11) }),
blank()
}),
RuleEntryTypeNamed,
},
{
"B",
choice({
seq({ i_sym(1), i_token(12) }),
i_token(13)
}),
RuleEntryTypeNamed,
},
}, {}, {}};
it("terminates for left-recursive rules that can be blank", [&]() {
rule = i_sym(0);
AssertThat(rule_can_be_blank(rule, grammar), IsTrue());
});
it("terminates for left-recursive rules that can't be blank", [&]() {
rule = i_sym(1);
AssertThat(rule_can_be_blank(rule, grammar), IsFalse());
});
});
});
END_TEST

View file

@ -29,8 +29,9 @@ describe("compiling the example grammars", []() {
string code = result.first;
const GrammarError *error = result.second;
AssertThat(error, Equals((GrammarError *)nullptr));
if (error)
AssertThat(error->message, Equals(""));
ofstream file(example_parser_dir + language + ".c");
file << get<0>(result);
file.close();

View file

@ -39,7 +39,7 @@ namespace tree_sitter {
return make_shared<rules::Metadata>(rule, values);
}
bool operator==(const RuleEntry &left, const RuleEntry &right) {
bool operator==(const Variable &left, const Variable &right) {
return left.name == right.name && left.rule->operator==(*right.rule) &&
left.type == right.type;
}

View file

@ -4,7 +4,7 @@
#include "tree_sitter/compiler.h"
#include "compiler/rules/character_set.h"
#include "compiler/rules/metadata.h"
#include "compiler/prepared_grammar.h"
#include "compiler/variable.h"
namespace tree_sitter {
rule_ptr metadata(rule_ptr, std::map<rules::MetadataKey, int>);
@ -13,7 +13,7 @@ namespace tree_sitter {
rule_ptr i_sym(size_t index);
rule_ptr i_token(size_t index);
bool operator==(const RuleEntry &left, const RuleEntry &right);
bool operator==(const Variable &left, const Variable &right);
}
#endif

View file

@ -2,6 +2,7 @@
#include "compiler/compiler_spec_helper.h"
#include "tree_sitter/compiler.h"
#include "compiler/parse_table.h"
#include "compiler/syntax_grammar.h"
#include "compiler/build_tables/parse_item.h"
#include "compiler/build_tables/lex_item.h"
#include "compiler/build_tables/get_metadata.h"
@ -42,8 +43,12 @@ ostream &operator<<(ostream &stream, const rule_ptr &rule) {
return stream;
}
ostream &operator<<(ostream &stream, const RuleEntry &entry) {
return stream << string("{") << entry.name << string(", ") << entry.rule << string(", ") << to_string(entry.type) << string("}");
ostream &operator<<(ostream &stream, const Variable &variable) {
return stream << string("{") << variable.name << string(", ") << variable.rule << string(", ") << to_string(variable.type) << string("}");
}
ostream &operator<<(ostream &stream, const SyntaxVariable &variable) {
return stream << string("{") << variable.name << string(", ") << variable.productions << string(", ") << to_string(variable.type) << string("}");
}
std::ostream &operator<<(std::ostream &stream, const LexAction &action) {
@ -100,6 +105,10 @@ ostream &operator<<(ostream &stream, const ParseState &state) {
return stream;
}
ostream &operator<<(ostream &stream, const ProductionStep &step) {
return stream << string("(production_step symbol:") << step.symbol << string(" precedence:") << to_string(step.precedence) << ")";
}
namespace build_tables {
ostream &operator<<(ostream &stream, const build_tables::LexItem &item) {
@ -107,8 +116,11 @@ ostream &operator<<(ostream &stream, const build_tables::LexItem &item) {
<< string(")");
}
ostream &operator<<(ostream &stream, const build_tables::ParseItem &item) {
return stream << string("(item ") << item.lhs << string(" ") << *item.rule
ostream &operator<<(ostream &stream, const ParseItem &item) {
return stream << string("(item variable:") << to_string(item.variable_index)
<< string(" production:") << to_string(item.production_index)
<< string(" step:") << to_string(item.step_index)
<< string(" remaining_rule:") << to_string(item.rule_id)
<< string(")");
}

View file

@ -37,8 +37,8 @@ inline std::ostream& operator<<(std::ostream &stream, const std::set<T> &set) {
return stream << ")";
}
template<typename T>
inline std::ostream& operator<<(std::ostream &stream, const std::unordered_set<T> &set) {
template<typename T, typename H, typename E>
inline std::ostream& operator<<(std::ostream &stream, const std::unordered_set<T, H, E> &set) {
stream << std::string("(set: ");
bool started = false;
for (auto item : set) {
@ -89,19 +89,23 @@ namespace tree_sitter {
using std::ostream;
using std::string;
using std::to_string;
struct RuleEntry;
struct Variable;
struct SyntaxVariable;
class LexAction;
class ParseAction;
class ParseState;
struct ProductionStep;
ostream &operator<<(ostream &, const Grammar &);
ostream &operator<<(ostream &, const GrammarError &);
ostream &operator<<(ostream &, const Rule &);
ostream &operator<<(ostream &, const rule_ptr &);
ostream &operator<<(ostream &, const RuleEntry &);
std::ostream &operator<<(ostream &stream, const LexAction &);
std::ostream &operator<<(ostream &stream, const ParseAction &);
std::ostream &operator<<(ostream &stream, const ParseState &);
ostream &operator<<(ostream &, const Variable &);
ostream &operator<<(ostream &, const SyntaxVariable &);
ostream &operator<<(ostream &, const LexAction &);
ostream &operator<<(ostream &, const ParseAction &);
ostream &operator<<(ostream &, const ParseState &);
ostream &operator<<(ostream &, const ProductionStep &);
namespace build_tables {
@ -109,9 +113,9 @@ struct MetadataRange;
class LexItem;
class ParseItem;
ostream &operator<<(ostream &stream, const MetadataRange &);
ostream &operator<<(ostream &stream, const LexItem &);
ostream &operator<<(ostream &stream, const ParseItem &);
ostream &operator<<(ostream &, const MetadataRange &);
ostream &operator<<(ostream &, const LexItem &);
ostream &operator<<(ostream &, const ParseItem &);
} // namespace build_tables
} // namespace tree_sitter

View file

@ -1,232 +1,152 @@
#include "compiler/compiler_spec_helper.h"
#include "compiler/prepared_grammar.h"
#include "compiler/prepare_grammar/initial_syntax_grammar.h"
#include "compiler/prepare_grammar/expand_repeats.h"
START_TEST
using namespace rules;
using prepare_grammar::InitialSyntaxGrammar;
using prepare_grammar::expand_repeats;
describe("expand_repeats", []() {
it("replaces repeat rules with pairs of recursive rules", [&]() {
SyntaxGrammar grammar{{
{
"rule0",
repeat(i_token(0)),
RuleEntryTypeNamed,
},
InitialSyntaxGrammar grammar{{
Variable("rule0", VariableTypeNamed, repeat(i_token(0))),
}, {}, {}};
auto match = expand_repeats(grammar);
auto result = expand_repeats(grammar);
AssertThat(match.rules, Equals(vector<RuleEntry>({
{
"rule0",
choice({ i_sym(1), blank() }),
RuleEntryTypeNamed,
},
{
"rule0_repeat1",
seq({
i_token(0),
choice({ i_sym(1), blank() })
}),
RuleEntryTypeAuxiliary
},
AssertThat(result.variables, Equals(vector<Variable>({
Variable("rule0", VariableTypeNamed, choice({ i_sym(1), blank() })),
Variable("rule0_repeat1", VariableTypeAuxiliary, seq({
i_token(0),
choice({ i_sym(1), blank() })
})),
})));
});
it("replaces repeats inside of sequences", [&]() {
SyntaxGrammar grammar{{
{
"rule0",
seq({
i_token(10),
repeat(i_token(11)),
}),
RuleEntryTypeNamed,
},
InitialSyntaxGrammar grammar{{
Variable("rule0", VariableTypeNamed, seq({
i_token(10),
repeat(i_token(11)),
})),
}, {}, {}};
auto match = expand_repeats(grammar);
auto result = expand_repeats(grammar);
AssertThat(match.rules, Equals(vector<RuleEntry>({
{
"rule0",
seq({
i_token(10),
choice({ i_sym(1), blank() })
}),
RuleEntryTypeNamed
},
{
"rule0_repeat1",
seq({
i_token(11),
choice({ i_sym(1), blank() })
}),
RuleEntryTypeAuxiliary
},
AssertThat(result.variables, Equals(vector<Variable>({
Variable("rule0", VariableTypeNamed, seq({
i_token(10),
choice({ i_sym(1), blank() })
})),
Variable("rule0_repeat1", VariableTypeAuxiliary, seq({
i_token(11),
choice({ i_sym(1), blank() })
})),
})));
});
it("replaces repeats inside of choices", [&]() {
SyntaxGrammar grammar{{
{
"rule0",
choice({ i_token(10), repeat(i_token(11)) }),
RuleEntryTypeNamed
},
InitialSyntaxGrammar grammar{{
Variable("rule0", VariableTypeNamed, choice({
i_token(10),
repeat(i_token(11))
})),
}, {}, {}};
auto match = expand_repeats(grammar);
auto result = expand_repeats(grammar);
AssertThat(match.rules, Equals(vector<RuleEntry>({
{
"rule0",
choice({ i_token(10), i_sym(1), blank() }),
RuleEntryTypeNamed
},
{
"rule0_repeat1",
seq({
i_token(11),
choice({ i_sym(1), blank() }),
}),
RuleEntryTypeAuxiliary
},
AssertThat(result.variables, Equals(vector<Variable>({
Variable("rule0", VariableTypeNamed, choice({ i_token(10), i_sym(1), blank() })),
Variable("rule0_repeat1", VariableTypeAuxiliary, seq({
i_token(11),
choice({ i_sym(1), blank() }),
})),
})));
});
it("does not create redundant auxiliary rules", [&]() {
SyntaxGrammar grammar{{
{
"rule0",
choice({
seq({ i_token(1), repeat(i_token(4)) }),
seq({ i_token(2), repeat(i_token(4)) }),
}),
RuleEntryTypeNamed
},
{
"rule1",
seq({ i_token(3), repeat(i_token(4)) }),
RuleEntryTypeNamed
},
InitialSyntaxGrammar grammar{{
Variable("rule0", VariableTypeNamed, choice({
seq({ i_token(1), repeat(i_token(4)) }),
seq({ i_token(2), repeat(i_token(4)) }),
})),
Variable("rule1", VariableTypeNamed, seq({
i_token(3),
repeat(i_token(4))
})),
}, {}, {}};
auto match = expand_repeats(grammar);
auto result = expand_repeats(grammar);
AssertThat(match.rules, Equals(vector<RuleEntry>({
{
"rule0",
choice({
seq({ i_token(1), choice({ i_sym(2), blank() }) }),
seq({ i_token(2), choice({ i_sym(2), blank() }) }),
}),
RuleEntryTypeNamed
},
{
"rule1",
seq({ i_token(3), choice({ i_sym(2), blank() }) }),
RuleEntryTypeNamed
},
{
"rule0_repeat1",
seq({
i_token(4),
choice({ i_sym(2), blank() }),
}),
RuleEntryTypeAuxiliary
},
AssertThat(result.variables, Equals(vector<Variable>({
Variable("rule0", VariableTypeNamed, choice({
seq({ i_token(1), choice({ i_sym(2), blank() }) }),
seq({ i_token(2), choice({ i_sym(2), blank() }) }),
})),
Variable("rule1", VariableTypeNamed, seq({
i_token(3),
choice({ i_sym(2), blank() })
})),
Variable("rule0_repeat1", VariableTypeAuxiliary, seq({
i_token(4),
choice({ i_sym(2), blank() }),
})),
})));
});
it("can replace multiple repeats in the same rule", [&]() {
SyntaxGrammar grammar{{
{
"rule0",
seq({
repeat(i_token(10)),
repeat(i_token(11)),
}),
RuleEntryTypeNamed
},
InitialSyntaxGrammar grammar{{
Variable("rule0", VariableTypeNamed, seq({
repeat(i_token(10)),
repeat(i_token(11)),
})),
}, {}, {}};
auto match = expand_repeats(grammar);
auto result = expand_repeats(grammar);
AssertThat(match.rules, Equals(vector<RuleEntry>({
{
"rule0",
seq({
choice({ i_sym(1), blank() }),
choice({ i_sym(2), blank() }),
}),
RuleEntryTypeNamed
},
{
"rule0_repeat1",
seq({
i_token(10),
choice({ i_sym(1), blank() }),
}),
RuleEntryTypeAuxiliary
},
{
"rule0_repeat2",
seq({
i_token(11),
choice({ i_sym(2), blank() }),
}),
RuleEntryTypeAuxiliary
},
AssertThat(result.variables, Equals(vector<Variable>({
Variable("rule0", VariableTypeNamed, seq({
choice({ i_sym(1), blank() }),
choice({ i_sym(2), blank() }),
})),
Variable("rule0_repeat1", VariableTypeAuxiliary, seq({
i_token(10),
choice({ i_sym(1), blank() }),
})),
Variable("rule0_repeat2", VariableTypeAuxiliary, seq({
i_token(11),
choice({ i_sym(2), blank() }),
})),
})));
});
it("can replace repeats in multiple rules", [&]() {
SyntaxGrammar grammar{{
{
"rule0",
repeat(i_token(10)),
RuleEntryTypeNamed,
},
{
"rule1",
repeat(i_token(11)),
RuleEntryTypeNamed,
},
InitialSyntaxGrammar grammar{{
Variable("rule0", VariableTypeNamed, repeat(i_token(10))),
Variable("rule1", VariableTypeNamed, repeat(i_token(11))),
}, {}, {}};
auto match = expand_repeats(grammar);
auto result = expand_repeats(grammar);
AssertThat(match.rules, Equals(vector<RuleEntry>({
{
"rule0",
AssertThat(result.variables, Equals(vector<Variable>({
Variable("rule0", VariableTypeNamed, choice({
i_sym(2),
blank(),
})),
Variable("rule1", VariableTypeNamed, choice({
i_sym(3),
blank(),
})),
Variable("rule0_repeat1", VariableTypeAuxiliary, seq({
i_token(10),
choice({ i_sym(2), blank() }),
RuleEntryTypeNamed
},
{
"rule1",
choice({ i_sym(3), blank() }),
RuleEntryTypeNamed
},
{
"rule0_repeat1",
seq({
i_token(10),
choice({ i_sym(2), blank() }),
}),
RuleEntryTypeAuxiliary
},
{
"rule1_repeat1",
seq({
i_token(11),
choice({ i_sym(3), blank() })
}),
RuleEntryTypeAuxiliary
},
})),
Variable("rule1_repeat1", VariableTypeAuxiliary, seq({
i_token(11),
choice({ i_sym(3), blank() })
})),
})));
});
});

View file

@ -1,5 +1,5 @@
#include "compiler/compiler_spec_helper.h"
#include "compiler/prepared_grammar.h"
#include "compiler/lexical_grammar.h"
#include "compiler/prepare_grammar/expand_tokens.h"
START_TEST
@ -11,64 +11,48 @@ describe("expand_tokens", []() {
describe("string rules", [&]() {
it("replaces strings with sequences of character sets", [&]() {
LexicalGrammar grammar{{
{
"rule_A",
seq({
i_sym(10),
str("xyz"),
i_sym(11),
}),
RuleEntryTypeNamed
},
Variable("rule_A", VariableTypeNamed, seq({
i_sym(10),
str("xyz"),
i_sym(11),
})),
}, {}};
auto result = expand_tokens(grammar);
AssertThat(result.second, Equals((const GrammarError *)nullptr));
AssertThat(result.first.rules, Equals(vector<RuleEntry>({
{
"rule_A",
seq({
i_sym(10),
metadata(seq({
character({ 'x' }),
character({ 'y' }),
character({ 'z' }),
}), {
{PRECEDENCE, 1},
{IS_TOKEN, 1},
}),
i_sym(11),
AssertThat(result.first.variables, Equals(vector<Variable>({
Variable("rule_A", VariableTypeNamed, seq({
i_sym(10),
metadata(seq({
character({ 'x' }),
character({ 'y' }),
character({ 'z' }),
}), {
{PRECEDENCE, 1},
{IS_TOKEN, 1},
}),
RuleEntryTypeNamed
},
i_sym(11),
})),
})));
});
it("handles strings containing non-ASCII UTF8 characters", [&]() {
LexicalGrammar grammar{{
{
"rule_A",
str("\u03B1 \u03B2"), // α β
RuleEntryTypeNamed
},
Variable("rule_A", VariableTypeNamed, str("\u03B1 \u03B2")),
}, {}};
auto result = expand_tokens(grammar);
AssertThat(result.first.rules, Equals(vector<RuleEntry>({
{
"rule_A",
metadata(seq({
character({ 945 }),
character({ ' ' }),
character({ 946 }),
}), {
{PRECEDENCE, 1},
{IS_TOKEN, 1},
}),
RuleEntryTypeNamed
}
AssertThat(result.first.variables, Equals(vector<Variable>({
Variable("rule_A", VariableTypeNamed, metadata(seq({
character({ 945 }),
character({ ' ' }),
character({ 946 }),
}), {
{PRECEDENCE, 1},
{IS_TOKEN, 1},
})),
})));
});
});
@ -76,64 +60,44 @@ describe("expand_tokens", []() {
describe("regexp rules", [&]() {
it("replaces regexps with the equivalent rule tree", [&]() {
LexicalGrammar grammar{{
{
"rule_A",
seq({
i_sym(10),
pattern("x*"),
i_sym(11),
}),
RuleEntryTypeNamed
},
Variable("rule_A", VariableTypeNamed, seq({
i_sym(10),
pattern("x*"),
i_sym(11),
})),
}, {}};
auto result = expand_tokens(grammar);
AssertThat(result.second, Equals((const GrammarError *)nullptr));
AssertThat(result.first.rules, Equals(vector<RuleEntry>({
{
"rule_A",
seq({
i_sym(10),
repeat(character({ 'x' })),
i_sym(11),
}),
RuleEntryTypeNamed
},
AssertThat(result.first.variables, Equals(vector<Variable>({
Variable("rule_A", VariableTypeNamed, seq({
i_sym(10),
repeat(character({ 'x' })),
i_sym(11),
})),
})));
});
it("handles regexps containing non-ASCII UTF8 characters", [&]() {
LexicalGrammar grammar{{
{
"rule_A",
pattern("[^\u03B1-\u03B4]*"), // [^α-δ]
RuleEntryTypeNamed
},
Variable("rule_A", VariableTypeNamed, pattern("[^\u03B1-\u03B4]*")),
}, {}};
auto result = expand_tokens(grammar);
AssertThat(result.first.rules, Equals(vector<RuleEntry>({
{
"rule_A",
repeat(character({ 945, 946, 947, 948 }, false)),
RuleEntryTypeNamed
}
AssertThat(result.first.variables, Equals(vector<Variable>({
Variable("rule_A", VariableTypeNamed, repeat(character({ 945, 946, 947, 948 }, false))),
})));
});
it("returns an error when the grammar contains an invalid regex", [&]() {
LexicalGrammar grammar{{
{
"rule_A",
seq({
pattern("("),
str("xyz"),
pattern("["),
}),
RuleEntryTypeNamed
},
Variable("rule_A", VariableTypeNamed, seq({
pattern("("),
str("xyz"),
pattern("["),
}))
}, {}};
auto result = expand_tokens(grammar);

View file

@ -0,0 +1,74 @@
#include "compiler/compiler_spec_helper.h"
#include "compiler/prepare_grammar/extract_choices.h"
START_TEST
using namespace rules;
using prepare_grammar::extract_choices;
class rule_vector : public vector<rule_ptr> {
public:
bool operator==(const vector<rule_ptr> &other) const {
if (this->size() != other.size()) return false;
for (size_t i = 0; i < this->size(); i++) {
auto rule = this->operator[](i);
auto other_rule = other[i];
if (!rule->operator==(*rule))
return false;
}
return true;
}
rule_vector(const initializer_list<rule_ptr> &list) :
vector<rule_ptr>(list) {}
};
describe("extract_choices", []() {
it("expands rules containing choices into multiple rules", [&]() {
auto rule = seq({
sym("a"),
choice({ sym("b"), sym("c"), sym("d") }),
sym("e")
});
AssertThat(extract_choices(rule), Equals(rule_vector({
seq({ sym("a"), sym("b"), sym("e") }),
seq({ sym("a"), sym("c"), sym("e") }),
seq({ sym("a"), sym("d"), sym("e") }),
})));
});
it("handles metadata rules", [&]() {
auto rule = prec(5, choice({ sym("b"), sym("c"), sym("d") }));
AssertThat(extract_choices(rule), Equals(rule_vector({
prec(5, sym("b")),
prec(5, sym("c")),
prec(5, sym("d")),
})));
});
it("handles nested choices", [&]() {
auto rule = choice({
seq({ choice({ sym("a"), sym("b") }), sym("c") }),
sym("d")
});
AssertThat(extract_choices(rule), Equals(rule_vector({
seq({ sym("a"), sym("c") }),
seq({ sym("b"), sym("c") }),
sym("d"),
})));
});
it("handles repeats", [&]() {
auto rule = repeat(choice({ sym("a"), sym("b") }));
AssertThat(extract_choices(rule), Equals(rule_vector({
repeat(sym("a")),
repeat(sym("b")),
})));
});
});
END_TEST

View file

@ -1,6 +1,7 @@
#include "compiler/compiler_spec_helper.h"
#include "compiler/prepared_grammar.h"
#include "compiler/lexical_grammar.h"
#include "compiler/prepare_grammar/interned_grammar.h"
#include "compiler/prepare_grammar/initial_syntax_grammar.h"
#include "compiler/prepare_grammar/extract_tokens.h"
START_TEST
@ -8,238 +9,133 @@ START_TEST
using namespace rules;
using prepare_grammar::extract_tokens;
using prepare_grammar::InternedGrammar;
using prepare_grammar::InitialSyntaxGrammar;
describe("extract_tokens", []() {
it("moves strings, patterns, and sub-rules marked as tokens into the lexical grammar", [&]() {
auto result = extract_tokens(InternedGrammar{{
{
"rule_A",
repeat(seq({
str("ab"),
pattern("cd*"),
choice({
i_sym(1),
i_sym(2),
token(repeat(choice({ str("ef"), str("gh") }))),
}),
})),
RuleEntryTypeNamed,
},
{
"rule_B",
pattern("ij+"),
RuleEntryTypeNamed,
},
{
"rule_C",
choice({ str("kl"), blank() }),
RuleEntryTypeNamed,
},
{
"rule_D",
repeat(i_sym(3)),
RuleEntryTypeNamed,
}
Variable("rule_A", VariableTypeNamed, repeat(seq({
str("ab"),
pattern("cd*"),
choice({
i_sym(1),
i_sym(2),
token(repeat(choice({ str("ef"), str("gh") }))),
}),
}))),
Variable("rule_B", VariableTypeNamed, pattern("ij+")),
Variable("rule_C", VariableTypeNamed, choice({ str("kl"), blank() })),
Variable("rule_D", VariableTypeNamed, repeat(i_sym(3)))
}, {}, {}});
SyntaxGrammar &syntax_grammar = get<0>(result);
InitialSyntaxGrammar &syntax_grammar = get<0>(result);
LexicalGrammar &lexical_grammar = get<1>(result);
const GrammarError *error = get<2>(result);
AssertThat(error, Equals<const GrammarError *>(nullptr));
AssertThat(syntax_grammar.rules, Equals(vector<RuleEntry>({
{
"rule_A",
repeat(seq({
AssertThat(syntax_grammar.variables, Equals(vector<Variable>({
Variable("rule_A", VariableTypeNamed, repeat(seq({
// This string is now the first token in the lexical grammar.
i_token(0),
// This string is now the first token in the lexical grammar.
i_token(0),
// This pattern is now the second rule in the lexical grammar.
i_token(1),
// This pattern is now the second rule in the lexical grammar.
i_token(1),
choice({
// Rule 1, which this symbol pointed to, has been moved to the
// lexical grammar.
i_token(3),
choice({
// Rule 1, which this symbol pointed to, has been moved to the
// lexical grammar.
i_token(3),
// This symbol's index has been decremented, because a previous rule
// was moved to the lexical grammar.
i_sym(1),
// This symbol's index has been decremented, because a previous rule
// was moved to the lexical grammar.
i_sym(1),
// This token rule is now the third rule in the lexical grammar.
i_token(2),
}),
})),
RuleEntryTypeNamed,
},
{
"rule_C",
choice({ i_token(4), blank() }),
RuleEntryTypeNamed,
},
{
"rule_D",
repeat(i_sym(2)),
RuleEntryTypeNamed,
}
// This token rule is now the third rule in the lexical grammar.
i_token(2),
}),
}))),
Variable("rule_C", VariableTypeNamed, choice({ i_token(4), blank() })),
Variable("rule_D", VariableTypeNamed, repeat(i_sym(2))),
})));
AssertThat(lexical_grammar.rules, Equals(vector<RuleEntry>({
AssertThat(lexical_grammar.variables, Equals(vector<Variable>({
// Strings become anonymous rules.
{
"ab",
str("ab"),
RuleEntryTypeAnonymous,
},
Variable("ab", VariableTypeAnonymous, str("ab")),
// Patterns become hidden rules.
{
"/cd*/",
pattern("cd*"),
RuleEntryTypeAuxiliary,
},
Variable("/cd*/", VariableTypeAuxiliary, pattern("cd*")),
// Rules marked as tokens become hidden rules.
{
"/(ef|gh)*/",
repeat(choice({ str("ef"), str("gh") })),
RuleEntryTypeAuxiliary,
},
Variable("/(ef|gh)*/", VariableTypeAuxiliary, repeat(choice({
str("ef"),
str("gh")
}))),
// This named rule was moved wholesale to the lexical grammar.
{
"rule_B",
pattern("ij+"),
RuleEntryTypeNamed,
},
Variable("rule_B", VariableTypeNamed, pattern("ij+")),
// Strings become anonymous rules.
{
"kl",
str("kl"),
RuleEntryTypeAnonymous,
},
Variable("kl", VariableTypeAnonymous, str("kl")),
})));
});
it("does not create duplicate tokens in the lexical grammar", [&]() {
auto result = extract_tokens(InternedGrammar{{
{
"rule_A",
seq({
str("ab"),
i_sym(0),
str("ab"),
}),
RuleEntryTypeNamed,
},
Variable("rule_A", VariableTypeNamed, seq({
str("ab"),
i_sym(0),
str("ab"),
})),
}, {}, {}});
SyntaxGrammar &syntax_grammar = get<0>(result);
InitialSyntaxGrammar &syntax_grammar = get<0>(result);
LexicalGrammar &lexical_grammar = get<1>(result);
AssertThat(syntax_grammar.rules, Equals(vector<RuleEntry>({
{
"rule_A",
seq({ i_token(0), i_sym(0), i_token(0) }),
RuleEntryTypeNamed
}
AssertThat(syntax_grammar.variables, Equals(vector<Variable>({
Variable("rule_A", VariableTypeNamed, seq({ i_token(0), i_sym(0), i_token(0) })),
})));
AssertThat(lexical_grammar.rules, Equals(vector<RuleEntry>({
{
"ab",
str("ab"),
RuleEntryTypeAnonymous
},
AssertThat(lexical_grammar.variables, Equals(vector<Variable>({
Variable("ab", VariableTypeAnonymous, str("ab")),
})))
});
it("does not move entire rules into the lexical grammar if their content is used elsewhere in the grammar", [&]() {
auto result = extract_tokens(InternedGrammar{{
{
"rule_A",
seq({ i_sym(1), str("ab") }),
RuleEntryTypeNamed,
},
{
"rule_B",
str("cd"),
RuleEntryTypeNamed,
},
{
"rule_C",
seq({ str("ef"), str("cd") }),
RuleEntryTypeNamed,
},
Variable("rule_A", VariableTypeNamed, seq({ i_sym(1), str("ab") })),
Variable("rule_B", VariableTypeNamed, str("cd")),
Variable("rule_C", VariableTypeNamed, seq({ str("ef"), str("cd") })),
}, {}, {}});
SyntaxGrammar &syntax_grammar = get<0>(result);
InitialSyntaxGrammar &syntax_grammar = get<0>(result);
LexicalGrammar &lexical_grammar = get<1>(result);
AssertThat(syntax_grammar.rules, Equals(vector<RuleEntry>({
{
"rule_A",
seq({ i_sym(1), i_token(0) }),
RuleEntryTypeNamed
},
{
"rule_B",
i_token(1),
RuleEntryTypeNamed
},
{
"rule_C",
seq({ i_token(2), i_token(1) }),
RuleEntryTypeNamed
},
AssertThat(syntax_grammar.variables, Equals(vector<Variable>({
Variable("rule_A", VariableTypeNamed, seq({ i_sym(1), i_token(0) })),
Variable("rule_B", VariableTypeNamed, i_token(1)),
Variable("rule_C", VariableTypeNamed, seq({ i_token(2), i_token(1) })),
})));
AssertThat(lexical_grammar.rules, Equals(vector<RuleEntry>({
{
"ab",
str("ab"),
RuleEntryTypeAnonymous
},
{
"cd",
str("cd"),
RuleEntryTypeAnonymous
},
{
"ef",
str("ef"),
RuleEntryTypeAnonymous
},
AssertThat(lexical_grammar.variables, Equals(vector<Variable>({
Variable("ab", VariableTypeAnonymous, str("ab")),
Variable("cd", VariableTypeAnonymous, str("cd")),
Variable("ef", VariableTypeAnonymous, str("ef")),
})));
});
it("renumbers the grammar's expected conflict symbols based on any moved rules", [&]() {
auto result = extract_tokens(InternedGrammar{{
{
"rule_A",
str("ok"),
RuleEntryTypeNamed,
},
{
"rule_B",
repeat(i_sym(0)),
RuleEntryTypeNamed,
},
{
"rule_C",
repeat(seq({ i_sym(0), i_sym(0) })),
RuleEntryTypeNamed,
},
Variable("rule_A", VariableTypeNamed, str("ok")),
Variable("rule_B", VariableTypeNamed, repeat(i_sym(0))),
Variable("rule_C", VariableTypeNamed, repeat(seq({ i_sym(0), i_sym(0) }))),
}, { str(" ") }, { { Symbol(1), Symbol(2) } }});
SyntaxGrammar &syntax_grammar = get<0>(result);
InitialSyntaxGrammar &syntax_grammar = get<0>(result);
AssertThat(syntax_grammar.rules.size(), Equals<size_t>(2));
AssertThat(syntax_grammar.variables.size(), Equals<size_t>(2));
AssertThat(syntax_grammar.expected_conflicts, Equals(set<set<Symbol>>({
{ Symbol(0), Symbol(1) },
})));
@ -248,11 +144,7 @@ describe("extract_tokens", []() {
describe("handling ubiquitous tokens", [&]() {
it("adds inline ubiquitous tokens to the lexical grammar's separators", [&]() {
auto result = extract_tokens(InternedGrammar{{
{
"rule_A",
str("x"),
RuleEntryTypeNamed,
},
Variable("rule_A", VariableTypeNamed, str("x")),
}, {
str("y"),
pattern("\\s+"),
@ -268,22 +160,10 @@ describe("extract_tokens", []() {
});
it("updates ubiquitous symbols according to the new symbol numbers", [&]() {
auto result = extract_tokens(InternedGrammar{ {
{
"rule_A",
seq({ str("w"), str("x"), i_sym(1) }),
RuleEntryTypeNamed
},
{
"rule_B",
str("y"),
RuleEntryTypeNamed
},
{
"rule_C",
str("z"),
RuleEntryTypeNamed
},
auto result = extract_tokens(InternedGrammar{{
Variable("rule_A", VariableTypeNamed, seq({ str("w"), str("x"), i_sym(1) })),
Variable("rule_B", VariableTypeNamed, str("y")),
Variable("rule_C", VariableTypeNamed, str("z")),
}, {
i_sym(2),
}, {}});
@ -299,16 +179,8 @@ describe("extract_tokens", []() {
it("returns an error if any ubiquitous tokens are non-token symbols", [&]() {
auto result = extract_tokens(InternedGrammar{{
{
"rule_A",
seq({ str("x"), i_sym(1) }),
RuleEntryTypeNamed,
},
{
"rule_B",
seq({ str("y"), str("z") }),
RuleEntryTypeNamed,
},
Variable("rule_A", VariableTypeNamed, seq({ str("x"), i_sym(1) })),
Variable("rule_B", VariableTypeNamed, seq({ str("y"), str("z") })),
}, { i_sym(1) }, {}});
AssertThat(get<2>(result), !Equals<const GrammarError *>(nullptr));
@ -319,16 +191,8 @@ describe("extract_tokens", []() {
it("returns an error if any ubiquitous tokens are non-token rules", [&]() {
auto result = extract_tokens(InternedGrammar{{
{
"rule_A",
str("x"),
RuleEntryTypeNamed,
},
{
"rule_B",
str("y"),
RuleEntryTypeNamed,
},
Variable("rule_A", VariableTypeNamed, str("x")),
Variable("rule_B", VariableTypeNamed, str("y")),
}, { choice({ i_sym(1), blank() }) }, {}});
AssertThat(get<2>(result), !Equals<const GrammarError *>(nullptr));

View file

@ -0,0 +1,179 @@
#include "compiler/compiler_spec_helper.h"
#include "compiler/prepare_grammar/flatten_grammar.h"
#include "compiler/prepare_grammar/initial_syntax_grammar.h"
#include "compiler/syntax_grammar.h"
#include "compiler/rules/built_in_symbols.h"
// Map `fn` over every element of `input`, returning the transformed values
// in a new vector (order preserved). The element type of the result is
// deduced from the callable's return type.
template<typename T, typename Func>
std::vector<typename std::result_of<Func(T)>::type>
collect(const std::vector<T> &input, Func fn) {
  std::vector<typename std::result_of<Func(T)>::type> output;
  output.reserve(input.size());
  for (const T &element : input)
    output.push_back(fn(element));
  return output;
}
START_TEST
using namespace rules;
using prepare_grammar::flatten_grammar;
using prepare_grammar::InitialSyntaxGrammar;
// Spec for prepare_grammar::flatten_grammar, which converts each variable's
// nested rule tree (seq / choice / prec) into a flat list of Productions —
// sequences of ProductionSteps, each carrying a symbol, a precedence, an
// associativity, and a rule_id.
describe("flatten_grammar", []() {
  InitialSyntaxGrammar input_grammar{{
    // Choices within rules are extracted, resulting in multiple productions.
    Variable("variable0", VariableTypeNamed, seq({
      i_sym(1),
      choice({ i_sym(2), i_sym(3) }),
      i_sym(4),
    })),

    // When multiple precedence values are nested, the inner precedence wins.
    Variable("variable1", VariableTypeNamed, seq({
      i_sym(1),
      prec(101, seq({
        i_sym(2),
        choice({
          prec(102, seq({
            i_sym(3),
            i_sym(4)
          }), AssociativityRight),
          i_sym(5),
        }),
        i_sym(6),
      })),
      i_sym(7),
    })),

    // When a precedence is applied to the end of a rule, its value is assigned
    // to the last step of the corresponding production.
    Variable("variable2", VariableTypeHidden, seq({
      prec(102, seq({
        i_sym(1),
        i_sym(2),
      })),
      prec(103, seq({
        i_sym(3),
        i_sym(4),
      })),
    }))
  }, {}, {}};

  SyntaxGrammar grammar = flatten_grammar(input_grammar);

  // The four helpers below project each production of a variable onto one
  // field of its steps, producing one sequence per production so the specs
  // can compare whole tables of values at once.
  auto get_symbol_sequences = [&](vector<Production> productions) {
    return collect(productions, [](Production p) {
      return collect(p, [](ProductionStep e) {
        return e.symbol;
      });
    });
  };

  auto get_precedence_sequences = [&](vector<Production> productions) {
    return collect(productions, [](Production p) {
      return collect(p, [](ProductionStep e) {
        return e.precedence;
      });
    });
  };

  auto get_associativity_sequences = [&](vector<Production> productions) {
    return collect(productions, [](Production p) {
      return collect(p, [](ProductionStep e) {
        return e.associativity;
      });
    });
  };

  auto get_rule_id_sequences = [&](vector<Production> productions) {
    return collect(productions, [](Production p) {
      return collect(p, [](ProductionStep e) {
        return e.rule_id;
      });
    });
  };

  it("preserves the names and types of the grammar's variables", [&]() {
    AssertThat(grammar.variables[0].name, Equals("variable0"));
    AssertThat(grammar.variables[1].name, Equals("variable1"));
    AssertThat(grammar.variables[2].name, Equals("variable2"));

    AssertThat(grammar.variables[0].type, Equals(VariableTypeNamed));
    AssertThat(grammar.variables[1].type, Equals(VariableTypeNamed));
    AssertThat(grammar.variables[2].type, Equals(VariableTypeHidden));
  });

  it("turns each variable's rule with a vector of possible symbol sequences", [&]() {
    // variable0's two-way choice yields two productions.
    AssertThat(
      get_symbol_sequences(grammar.variables[0].productions),
      Equals(vector<vector<Symbol>>({
        { Symbol(1), Symbol(2), Symbol(4) },
        { Symbol(1), Symbol(3), Symbol(4) }
      })));

    AssertThat(
      get_symbol_sequences(grammar.variables[1].productions),
      Equals(vector<vector<Symbol>>({
        { Symbol(1), Symbol(2), Symbol(3), Symbol(4), Symbol(6), Symbol(7) },
        { Symbol(1), Symbol(2), Symbol(5), Symbol(6), Symbol(7) }
      })));

    // variable2 has no choices, so it flattens to a single production.
    AssertThat(
      get_symbol_sequences(grammar.variables[2].productions),
      Equals(vector<vector<Symbol>>({
        { Symbol(1), Symbol(2), Symbol(3), Symbol(4) },
      })));
  });

  it("associates each symbol with the precedence binding it to its previous neighbor", [&]() {
    // No precedence annotations in variable0, so every step is 0.
    AssertThat(
      get_precedence_sequences(grammar.variables[0].productions),
      Equals(vector<vector<int>>({
        { 0, 0, 0 },
        { 0, 0, 0 }
      })));

    // Inner prec(102) overrides the surrounding prec(101) for the step it
    // covers; steps outside any prec(...) stay 0.
    AssertThat(
      get_precedence_sequences(grammar.variables[1].productions),
      Equals(vector<vector<int>>({
        { 0, 0, 101, 102, 101, 0 },
        { 0, 0, 101, 101, 0 }
      })));

    AssertThat(
      get_precedence_sequences(grammar.variables[2].productions),
      Equals(vector<vector<int>>({
        { 0, 102, 0, 103 },
      })));
  });

  it("associates each symbol with the correct associativity annotation", [&]() {
    Associativity none = AssociativityNone;

    AssertThat(
      get_associativity_sequences(grammar.variables[1].productions),
      Equals(vector<vector<Associativity>>({
        { none, none, AssociativityLeft, AssociativityRight, AssociativityLeft, none },
        { none, none, AssociativityLeft, AssociativityLeft, none }
      })));
  });

  it("associates each unique remaining subsequence of symbols and precedences with a rule_id", [&]() {
    // Variable 0: only the last symbol is the same for both productions.
    auto variable0_step_ids = get_rule_id_sequences(grammar.variables[0].productions);
    AssertThat(variable0_step_ids[0][0], !Equals(variable0_step_ids[1][0]));
    AssertThat(variable0_step_ids[0][1], !Equals(variable0_step_ids[1][1]));
    AssertThat(variable0_step_ids[0][2], Equals(variable0_step_ids[1][2]));

    // Variable 1: the last *two* symbols are the same for both productions.
    auto variable1_step_ids = get_rule_id_sequences(grammar.variables[1].productions);
    AssertThat(variable1_step_ids[0][0], !Equals(variable1_step_ids[1][0]));
    AssertThat(variable1_step_ids[0][1], !Equals(variable1_step_ids[1][1]));
    AssertThat(variable1_step_ids[0][4], Equals(variable1_step_ids[1][3]));
    AssertThat(variable1_step_ids[0][5], Equals(variable1_step_ids[1][4]));
  });
});
END_TEST

View file

@ -19,22 +19,10 @@ describe("intern_symbols", []() {
auto result = intern_symbols(grammar);
AssertThat(result.second, Equals((GrammarError *)nullptr));
AssertThat(result.first.rules, Equals(vector<RuleEntry>({
{
"x",
choice({ i_sym(1), i_sym(2) }),
RuleEntryTypeNamed
},
{
"y",
i_sym(2),
RuleEntryTypeNamed,
},
{
"_z",
str("stuff"),
RuleEntryTypeHidden
},
AssertThat(result.first.variables, Equals(vector<Variable>({
Variable("x", VariableTypeNamed, choice({ i_sym(1), i_sym(2) })),
Variable("y", VariableTypeNamed, i_sym(2)),
Variable("_z", VariableTypeHidden, str("stuff")),
})));
});

View file

@ -300,10 +300,7 @@ extern const Grammar javascript = Grammar({
infix_op(">", "_expression", PREC_REL) }) },
{ "type_op", choice({
prec(PREC_REL, seq({
choice({ sym("_expression"), sym("identifier") }),
str("in"),
sym("_expression") })),
infix_op("in", "_expression", PREC_REL),
infix_op("instanceof", "_expression", PREC_REL),
prefix_op("typeof", "_expression", PREC_TYPE) }) },

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -35,7 +35,7 @@ describe("Languages", [&]() {
describe(("The " + pair.first + " parser").c_str(), [&]() {
before_each([&]() {
ts_document_set_language(doc, pair.second);
// ts_document_set_debugger(doc, log_debugger_make(true));
// ts_document_set_debugger(doc, log_debugger_make(false));
});
for (auto &entry : test_entries_for_language(pair.first)) {

View file

@ -11,7 +11,7 @@
#include "compiler/build_tables/get_metadata.h"
#include "compiler/build_tables/lex_item.h"
#include "compiler/parse_table.h"
#include "compiler/prepared_grammar.h"
#include "compiler/lexical_grammar.h"
#include "compiler/rules/built_in_symbols.h"
#include "compiler/rules/choice.h"
#include "compiler/rules/metadata.h"
@ -66,7 +66,7 @@ class LexTableBuilder {
else if (symbol.is_token)
result.insert(LexItem(
symbol, after_separators(lex_grammar.rules[symbol.index].rule)));
symbol, after_separators(lex_grammar.variables[symbol.index].rule)));
}
return result;
}

View file

@ -6,7 +6,7 @@
namespace tree_sitter {
class LexicalGrammar;
struct LexicalGrammar;
class ParseTable;
namespace build_tables {

View file

@ -12,7 +12,8 @@
#include "compiler/build_tables/parse_item.h"
#include "compiler/build_tables/get_completion_status.h"
#include "compiler/build_tables/get_metadata.h"
#include "compiler/prepared_grammar.h"
#include "compiler/lexical_grammar.h"
#include "compiler/syntax_grammar.h"
#include "compiler/rules/symbol.h"
#include "compiler/rules/built_in_symbols.h"
@ -35,7 +36,6 @@ class ParseTableBuilder {
const LexicalGrammar lexical_grammar;
ParseConflictManager conflict_manager;
unordered_map<const ParseItemSet, ParseStateId> parse_state_ids;
vector<vector<Symbol>> productions;
vector<pair<ParseItemSet, ParseStateId>> item_sets_to_process;
ParseTable parse_table;
std::set<string> conflicts;
@ -48,11 +48,10 @@ class ParseTableBuilder {
conflict_manager(grammar) {}
pair<ParseTable, const GrammarError *> build() {
auto start_symbol = grammar.rules.empty() ? make_shared<Symbol>(0, true)
: make_shared<Symbol>(0);
ParseItem start_item(rules::START(), start_symbol, {});
add_parse_state(
item_set_closure(start_item, { rules::END_OF_INPUT() }, grammar));
ParseItem start_item(rules::START(), 0, 0, -2);
ParseItemSet start_item_set({ { start_item, { rules::END_OF_INPUT() } } });
item_set_closure(&start_item_set, grammar);
add_parse_state(start_item_set);
while (!item_sets_to_process.empty()) {
auto pair = item_sets_to_process.back();
@ -105,20 +104,41 @@ class ParseTableBuilder {
}
}
// Result of checking whether a parse item has consumed its entire
// production. When is_done is true, precedence and associativity describe
// the production's final step (and are 0 / AssociativityNone for an empty
// production).
struct CompletionStatus {
  bool is_done;
  int precedence;
  Associativity associativity;
};
// Determine whether `item` has consumed every step of its production.
// If it has, also report the precedence and associativity of the last
// step consumed, which callers use when building the resulting Reduce
// action. For an empty production (step_index == 0 == size), the
// defaults { 0, AssociativityNone } are reported.
CompletionStatus get_completion_status(const ParseItem &item) {
  CompletionStatus result{ false, 0, AssociativityNone };

  // Look up the concrete production this item is positioned within.
  const Production &production =
    grammar.productions(item.lhs())[item.production_index];

  if (item.step_index == production.size()) {
    result.is_done = true;
    if (item.step_index > 0) {
      const ProductionStep &step = production[item.step_index - 1];
      result.precedence = step.precedence;
      result.associativity = step.associativity;
    }
  }

  return result;
}
void add_reduce_actions(const ParseItemSet &item_set, ParseStateId state_id) {
for (const auto &pair : item_set) {
const ParseItem &item = pair.first;
const set<Symbol> &lookahead_symbols = pair.second;
CompletionStatus completion_status = get_completion_status(item.rule);
CompletionStatus completion_status = get_completion_status(item);
if (completion_status.is_done) {
ParseAction action =
(item.lhs == rules::START())
(item.lhs() == rules::START())
? ParseAction::Accept()
: ParseAction::Reduce(item.lhs, item.consumed_symbols.size(),
: ParseAction::Reduce(Symbol(item.variable_index), item.step_index,
completion_status.precedence,
completion_status.associativity,
get_production_id(item.consumed_symbols));
item.production_index);
for (const auto &lookahead_sym : lookahead_symbols)
add_action(state_id, lookahead_sym, action, item_set);
@ -157,40 +177,42 @@ class ParseTableBuilder {
}
}
ParseAction *add_action(ParseStateId state_id, Symbol lookahead_sym,
const ParseAction &action,
ParseAction *add_action(ParseStateId state_id, Symbol lookahead,
const ParseAction &new_action,
const ParseItemSet &item_set) {
auto &current_actions = parse_table.states[state_id].actions;
auto current_entry = current_actions.find(lookahead_sym);
const auto &current_actions = parse_table.states[state_id].actions;
const auto &current_entry = current_actions.find(lookahead);
if (current_entry == current_actions.end())
return &parse_table.set_action(state_id, lookahead_sym, action);
return &parse_table.set_action(state_id, lookahead, new_action);
const ParseAction current_action = current_entry->second[0];
const ParseAction old_action = current_entry->second[0];
auto resolution =
conflict_manager.resolve(action, current_action, lookahead_sym);
conflict_manager.resolve(new_action, old_action, lookahead);
switch (resolution.second) {
case ConflictTypeNone:
if (resolution.first)
return &parse_table.set_action(state_id, lookahead_sym, action);
return &parse_table.set_action(state_id, lookahead, new_action);
break;
case ConflictTypeResolved:
if (action.type == ParseActionTypeReduce)
parse_table.fragile_production_ids.insert(action.production_id);
if (current_action.type == ParseActionTypeReduce)
parse_table.fragile_production_ids.insert(current_action.production_id);
case ConflictTypeResolved: {
if (resolution.first)
return &parse_table.set_action(state_id, lookahead_sym, action);
return &parse_table.set_action(state_id, lookahead, new_action);
if (old_action.type == ParseActionTypeReduce)
parse_table.fragile_production_ids.insert(production_id(old_action));
if (new_action.type == ParseActionTypeReduce)
parse_table.fragile_production_ids.insert(production_id(new_action));
break;
}
case ConflictTypeUnresolved: {
set<Symbol> goal_symbols = item_set_goal_symbols(item_set);
if (has_expected_conflict(goal_symbols))
return &parse_table.add_action(state_id, lookahead_sym, action);
auto old_goal_syms = goal_symbols(item_set, old_action, lookahead);
auto new_goal_syms = goal_symbols(item_set, new_action, lookahead);
if (has_expected_conflict(old_goal_syms, new_goal_syms))
return &parse_table.add_action(state_id, lookahead, new_action);
else
conflicts.insert(conflict_description(action, current_action,
lookahead_sym, goal_symbols));
conflicts.insert(conflict_description(
lookahead, old_action, old_goal_syms, new_action, new_goal_syms));
break;
}
}
@ -198,9 +220,14 @@ class ParseTableBuilder {
return nullptr;
}
bool has_expected_conflict(const set<Symbol> &symbols) {
pair<Symbol, int> production_id(const ParseAction &action) {
return { action.symbol, action.production_id };
}
bool has_expected_conflict(set<Symbol> symbols1, const set<Symbol> &symbols2) {
symbols1.insert(symbols2.begin(), symbols2.end());
for (const auto &conflicting_symbols : grammar.expected_conflicts)
if (symbols == conflicting_symbols)
if (symbols1 == conflicting_symbols)
return true;
return false;
}
@ -209,46 +236,55 @@ class ParseTableBuilder {
set<int> result;
for (const auto &pair : item_set) {
const ParseItem &item = pair.first;
if (!item.consumed_symbols.empty()) {
auto precedence_range = get_metadata(item.rule, rules::PRECEDENCE);
result.insert(precedence_range.min);
result.insert(precedence_range.max);
const Production &production =
grammar.productions(item.lhs())[item.production_index];
if (item.step_index > 0) {
if (item.step_index < production.size())
result.insert(production[item.step_index].precedence);
else
result.insert(production[item.step_index - 1].precedence);
}
}
return result;
}
set<Symbol> item_set_goal_symbols(const ParseItemSet &item_set) {
set<Symbol> goal_symbols(const ParseItemSet &item_set,
const ParseAction &action,
const Symbol &lookahead_sym) {
set<Symbol> result;
for (const auto &pair : item_set) {
const ParseItem &item = pair.first;
if (!item.consumed_symbols.empty())
result.insert(item.lhs);
switch (action.type) {
case ParseActionTypeShift: {
for (const auto &pair : item_set) {
const ParseItem &item = pair.first;
const Production &production =
grammar.productions(item.lhs())[item.production_index];
if (item.step_index < production.size() &&
production[item.step_index].symbol == lookahead_sym)
result.insert(item.lhs());
}
break;
}
case ParseActionTypeReduce:
result.insert(action.symbol);
break;
default:
break;
}
return result;
}
string conflict_description(const ParseAction &new_action,
string conflict_description(const Symbol &lookahead,
const ParseAction &old_action,
const rules::Symbol &symbol,
const set<Symbol> &goal_symbols) const {
string symbols_string;
bool started = false;
for (const auto &symbol : goal_symbols) {
if (started)
symbols_string += ", ";
symbols_string += symbol_name(symbol);
started = true;
}
return "Within: " + symbols_string +
"\n"
"Lookahead: " +
symbol_name(symbol) + "\n" +
const set<Symbol> &old_goal_symbols,
const ParseAction &new_action,
const set<Symbol> &new_goal_symbols) const {
return "Lookahead: " + symbol_name(lookahead) + "\n" +
"Possible Actions:\n"
"* " +
action_description(old_action) + "\n" + "* " +
action_description(new_action);
action_description(old_action, old_goal_symbols) + "\n" + "* " +
action_description(new_action, new_goal_symbols);
}
string symbol_name(const rules::Symbol &symbol) const {
@ -260,20 +296,31 @@ class ParseTableBuilder {
else
return "";
} else if (symbol.is_token) {
return lexical_grammar.rules[symbol.index].name;
return lexical_grammar.variables[symbol.index].name;
} else {
return grammar.rules[symbol.index].name;
return grammar.variables[symbol.index].name;
}
}
string action_description(const ParseAction &action) const {
string action_description(const ParseAction &action,
const set<Symbol> &goal_symbols) const {
string symbols_string;
bool started = false;
for (const auto &symbol : goal_symbols) {
if (started)
symbols_string += ", ";
symbols_string += symbol_name(symbol);
started = true;
}
string result;
switch (action.type) {
case ParseActionTypeReduce: {
result = "Reduce";
for (const rules::Symbol &symbol : productions[action.production_id])
result += " " + symbol_name(symbol);
for (const ProductionStep &step :
grammar.productions(action.symbol)[action.production_id])
result += " " + symbol_name(step.symbol);
result += " -> " + symbol_name(action.symbol);
break;
}
@ -297,17 +344,6 @@ class ParseTableBuilder {
return result;
}
size_t get_production_id(const vector<rules::Symbol> &symbols) {
auto begin = productions.begin();
auto end = productions.end();
auto iter = find(begin, end, symbols);
if (iter == end) {
productions.push_back(symbols);
return productions.size() - 1;
}
return iter - begin;
}
};
pair<ParseTable, const GrammarError *> build_parse_table(

View file

@ -8,8 +8,8 @@
namespace tree_sitter {
class SyntaxGrammar;
class LexicalGrammar;
struct SyntaxGrammar;
struct LexicalGrammar;
namespace build_tables {

View file

@ -2,7 +2,8 @@
#include <tuple>
#include "compiler/build_tables/build_lex_table.h"
#include "compiler/build_tables/build_parse_table.h"
#include "compiler/prepared_grammar.h"
#include "compiler/syntax_grammar.h"
#include "compiler/lexical_grammar.h"
namespace tree_sitter {
namespace build_tables {

View file

@ -10,8 +10,8 @@
namespace tree_sitter {
class SyntaxGrammar;
class LexicalGrammar;
struct SyntaxGrammar;
struct LexicalGrammar;
namespace build_tables {

View file

@ -1,67 +0,0 @@
#include "compiler/build_tables/first_symbols.h"
#include "compiler/build_tables/rule_can_be_blank.h"
#include "compiler/prepared_grammar.h"
#include "compiler/rules/choice.h"
#include "compiler/rules/metadata.h"
#include "compiler/rules/seq.h"
#include "compiler/rules/symbol.h"
#include "compiler/rules/visitor.h"
#include "tree_sitter/compiler.h"
namespace tree_sitter {
namespace build_tables {
using std::set;
using rules::Symbol;
class FirstSymbols : public rules::RuleFn<set<Symbol>> {
const SyntaxGrammar *grammar;
set<Symbol> visited_symbols;
public:
explicit FirstSymbols(const SyntaxGrammar *grammar) : grammar(grammar) {}
private:
set<Symbol> apply_to(const Symbol *rule) {
auto insertion_result = visited_symbols.insert(*rule);
if (!insertion_result.second)
return set<Symbol>();
set<Symbol> result({ *rule });
if (!rule->is_token) {
set<Symbol> &&symbols = apply(grammar->rules[rule->index].rule);
result.insert(symbols.begin(), symbols.end());
}
return result;
}
set<Symbol> apply_to(const rules::Metadata *rule) {
return apply(rule->rule);
}
set<Symbol> apply_to(const rules::Choice *rule) {
set<Symbol> result;
for (const auto &element : rule->elements) {
auto &&element_symbols = apply(element);
result.insert(element_symbols.begin(), element_symbols.end());
}
return result;
}
set<Symbol> apply_to(const rules::Seq *rule) {
auto &&result = apply(rule->left);
if (rule_can_be_blank(rule->left, *grammar)) {
auto &&right_symbols = apply(rule->right);
result.insert(right_symbols.begin(), right_symbols.end());
}
return result;
}
};
set<Symbol> first_symbols(const rule_ptr &rule, const SyntaxGrammar &grammar) {
return FirstSymbols(&grammar).apply(rule);
}
} // namespace build_tables
} // namespace tree_sitter

View file

@ -1,24 +0,0 @@
#ifndef COMPILER_BUILD_TABLES_FIRST_SYMBOLS_H_
#define COMPILER_BUILD_TABLES_FIRST_SYMBOLS_H_
#include <set>
#include "compiler/rules/symbol.h"
#include "tree_sitter/compiler.h"
namespace tree_sitter {
class SyntaxGrammar;
namespace build_tables {
/*
* Returns the set of symbols that can appear at the beginning of a sentential
* form derivable from a given rule in a given grammar.
*/
std::set<rules::Symbol> first_symbols(const rule_ptr &rule,
const SyntaxGrammar &grammar);
} // namespace build_tables
} // namespace tree_sitter
#endif // COMPILER_BUILD_TABLES_FIRST_SYMBOLS_H_

View file

@ -3,11 +3,10 @@
#include <vector>
#include <utility>
#include "tree_sitter/compiler.h"
#include "compiler/build_tables/first_symbols.h"
#include "compiler/build_tables/rule_transitions.h"
#include "compiler/build_tables/rule_can_be_blank.h"
#include "compiler/build_tables/item.h"
#include "compiler/prepared_grammar.h"
#include "compiler/syntax_grammar.h"
namespace tree_sitter {
namespace build_tables {
@ -17,45 +16,63 @@ using std::vector;
using std::pair;
using rules::Symbol;
const ParseItemSet item_set_closure(const ParseItem &starting_item,
const set<Symbol> &starting_lookahead_symbols,
const SyntaxGrammar &grammar) {
ParseItemSet result;
void item_set_closure(ParseItemSet *item_set, const SyntaxGrammar &grammar) {
vector<pair<ParseItem, set<Symbol>>> items_to_process;
items_to_process.push_back({ starting_item, starting_lookahead_symbols });
items_to_process.insert(items_to_process.end(), item_set->begin(),
item_set->end());
item_set->clear();
while (!items_to_process.empty()) {
ParseItem item = items_to_process.back().first;
set<Symbol> new_lookahead_symbols = items_to_process.back().second;
items_to_process.pop_back();
set<Symbol> &lookahead_symbols = result[item];
set<Symbol> &lookahead_symbols = item_set->operator[](item);
size_t previous_size = lookahead_symbols.size();
lookahead_symbols.insert(new_lookahead_symbols.begin(),
new_lookahead_symbols.end());
if (lookahead_symbols.size() == previous_size)
continue;
for (const auto &pair : sym_transitions(item.rule)) {
const Symbol &symbol = pair.first;
const rule_ptr &next_rule = pair.second;
const Production &item_production =
grammar.productions(item.lhs())[item.production_index];
if (symbol.is_token || symbol.is_built_in())
continue;
if (item.step_index == item_production.size())
continue;
set<Symbol> next_lookahead_symbols = first_symbols(next_rule, grammar);
if (rule_can_be_blank(next_rule, grammar))
next_lookahead_symbols.insert(lookahead_symbols.begin(),
lookahead_symbols.end());
Symbol symbol = item_production[item.step_index].symbol;
items_to_process.push_back(
{ ParseItem(symbol, grammar.rules[symbol.index].rule, {}),
next_lookahead_symbols });
if (symbol.is_token || symbol.is_built_in())
continue;
set<Symbol> next_lookahead_symbols;
unsigned int next_step = item.step_index + 1;
if (next_step == item_production.size()) {
next_lookahead_symbols = lookahead_symbols;
} else {
vector<Symbol> symbols_to_process({ item_production[next_step].symbol });
while (!symbols_to_process.empty()) {
Symbol following_symbol = symbols_to_process.back();
symbols_to_process.pop_back();
if (!next_lookahead_symbols.insert(following_symbol).second)
continue;
for (const auto &production : grammar.productions(following_symbol))
if (!production.empty())
symbols_to_process.push_back(production[0].symbol);
}
}
size_t i = 0;
for (const Production &production : grammar.productions(symbol)) {
if (!production.empty())
items_to_process.push_back(
{ ParseItem(symbol, i, 0, production[0].rule_id),
next_lookahead_symbols });
i++;
}
}
return result;
}
} // namespace build_tables

View file

@ -1,19 +1,16 @@
#ifndef COMPILER_BUILD_TABLES_ITEM_SET_CLOSURE_H_
#define COMPILER_BUILD_TABLES_ITEM_SET_CLOSURE_H_
#include <set>
#include "compiler/build_tables/parse_item.h"
#include "compiler/rules/symbol.h"
namespace tree_sitter {
class SyntaxGrammar;
struct SyntaxGrammar;
namespace build_tables {
const ParseItemSet item_set_closure(const ParseItem &,
const std::set<rules::Symbol> &,
const SyntaxGrammar &);
void item_set_closure(ParseItemSet *, const SyntaxGrammar &);
} // namespace build_tables
} // namespace tree_sitter

View file

@ -4,7 +4,7 @@
#include "compiler/build_tables/merge_transitions.h"
#include "compiler/build_tables/parse_item.h"
#include "compiler/build_tables/rule_transitions.h"
#include "compiler/prepared_grammar.h"
#include "compiler/syntax_grammar.h"
#include "compiler/rules/symbol.h"
namespace tree_sitter {
@ -22,20 +22,23 @@ map<Symbol, ParseItemSet> sym_transitions(const ParseItemSet &item_set,
for (const auto &pair : item_set) {
const ParseItem &item = pair.first;
const set<Symbol> &lookahead_symbols = pair.second;
for (auto &transition : sym_transitions(item.rule)) {
vector<Symbol> consumed_symbols(item.consumed_symbols);
consumed_symbols.push_back(transition.first);
ParseItem new_item(item.lhs, transition.second, consumed_symbols);
merge_sym_transition<ParseItemSet>(
&result, { transition.first,
item_set_closure(new_item, lookahead_symbols, grammar) },
[](ParseItemSet *left, const ParseItemSet *right) {
for (auto &pair : *right)
left->operator[](pair.first)
.insert(pair.second.begin(), pair.second.end());
});
}
const Production &production =
grammar.productions(item.lhs())[item.production_index];
if (item.step_index == production.size())
continue;
const Symbol &symbol = production[item.step_index].symbol;
unsigned int step = item.step_index + 1;
int rule_id = step < production.size() ? production[step].rule_id : 0;
ParseItem new_item(item.lhs(), item.production_index, step, rule_id);
result[symbol][new_item].insert(lookahead_symbols.begin(),
lookahead_symbols.end());
}
for (auto &pair : result)
item_set_closure(&pair.second, grammar);
return result;
}

View file

@ -7,7 +7,7 @@
namespace tree_sitter {
class SyntaxGrammar;
struct SyntaxGrammar;
namespace rules {
class CharacterSet;

View file

@ -2,7 +2,7 @@
#define COMPILER_BUILD_TABLES_LEX_CONFLICT_MANAGER_H_
#include "tree_sitter/compiler.h"
#include "compiler/prepared_grammar.h"
#include "compiler/lexical_grammar.h"
namespace tree_sitter {

View file

@ -3,7 +3,7 @@
#include <utility>
#include "tree_sitter/compiler.h"
#include "compiler/prepared_grammar.h"
#include "compiler/syntax_grammar.h"
#include "compiler/build_tables/parse_item.h"
namespace tree_sitter {

View file

@ -1,34 +1,42 @@
#include "compiler/build_tables/parse_item.h"
#include <string>
#include "compiler/syntax_grammar.h"
#include "tree_sitter/compiler.h"
namespace tree_sitter {
namespace build_tables {
using std::string;
using std::vector;
using std::to_string;
using std::ostream;
using rules::Symbol;
ParseItem::ParseItem(const rules::Symbol &lhs, const rule_ptr rule,
const vector<rules::Symbol> &consumed_symbols)
: Item(lhs, rule), consumed_symbols(consumed_symbols) {}
ParseItem::ParseItem(const Symbol &lhs, unsigned int production_index,
unsigned int step_index, int rule_id)
: variable_index(lhs.index),
production_index(production_index),
step_index(step_index),
rule_id(rule_id) {}
bool ParseItem::operator==(const ParseItem &other) const {
return (lhs == other.lhs) &&
(consumed_symbols.size() == other.consumed_symbols.size()) &&
(rule == other.rule || rule->operator==(*other.rule));
return (variable_index == other.variable_index) &&
(rule_id == other.rule_id) && (step_index == other.step_index);
}
bool ParseItem::operator<(const ParseItem &other) const {
if (lhs < other.lhs)
if (variable_index < other.variable_index)
return true;
if (other.lhs < lhs)
if (variable_index > other.variable_index)
return false;
if (consumed_symbols.size() < other.consumed_symbols.size())
if (step_index < other.step_index)
return true;
if (other.consumed_symbols.size() < consumed_symbols.size())
if (step_index > other.step_index)
return false;
return rule < other.rule;
return rule_id < other.rule_id;
}
Symbol ParseItem::lhs() const {
return Symbol(variable_index);
}
} // namespace build_tables

View file

@ -10,13 +10,17 @@
namespace tree_sitter {
namespace build_tables {
class ParseItem : public Item {
class ParseItem {
public:
ParseItem(const rules::Symbol &lhs, rule_ptr rule,
const std::vector<rules::Symbol> &consumed_symbols);
ParseItem(const rules::Symbol &, unsigned int, unsigned int, int);
bool operator==(const ParseItem &other) const;
bool operator<(const ParseItem &other) const;
std::vector<rules::Symbol> consumed_symbols;
rules::Symbol lhs() const;
int variable_index;
unsigned int production_index;
unsigned int step_index;
int rule_id;
};
typedef std::map<ParseItem, std::set<rules::Symbol>> ParseItemSet;
@ -29,9 +33,8 @@ namespace std {
template <>
struct hash<tree_sitter::build_tables::ParseItem> {
size_t operator()(const tree_sitter::build_tables::ParseItem &item) const {
return hash<tree_sitter::rules::Symbol>()(item.lhs) ^
hash<tree_sitter::rule_ptr>()(item.rule) ^
hash<size_t>()(item.consumed_symbols.size());
return hash<unsigned int>()(item.variable_index) ^
hash<int>()(item.rule_id) ^ hash<unsigned int>()(item.step_index);
}
};

View file

@ -1,7 +1,5 @@
#include "compiler/build_tables/rule_can_be_blank.h"
#include <set>
#include "tree_sitter/compiler.h"
#include "compiler/prepared_grammar.h"
#include "compiler/rules/symbol.h"
#include "compiler/rules/visitor.h"
#include "compiler/rules/seq.h"
@ -12,8 +10,6 @@
namespace tree_sitter {
namespace build_tables {
using std::set;
class CanBeBlank : public rules::RuleFn<bool> {
protected:
bool apply_to(const rules::Blank *) {
@ -40,35 +36,9 @@ class CanBeBlank : public rules::RuleFn<bool> {
}
};
class CanBeBlankRecursive : public CanBeBlank {
const SyntaxGrammar *grammar;
set<rules::Symbol> visited_symbols;
using CanBeBlank::visit;
public:
explicit CanBeBlankRecursive(const SyntaxGrammar *grammar)
: grammar(grammar) {}
private:
using CanBeBlank::apply_to;
bool apply_to(const rules::Symbol *rule) {
if (visited_symbols.find(*rule) == visited_symbols.end()) {
visited_symbols.insert(*rule);
return !rule->is_token && apply(grammar->rules[rule->index].rule);
} else {
return false;
}
}
};
bool rule_can_be_blank(const rule_ptr &rule) {
return CanBeBlank().apply(rule);
}
bool rule_can_be_blank(const rule_ptr &rule, const SyntaxGrammar &grammar) {
return CanBeBlankRecursive(&grammar).apply(rule);
}
} // namespace build_tables
} // namespace tree_sitter

View file

@ -4,13 +4,9 @@
#include "tree_sitter/compiler.h"
namespace tree_sitter {
class SyntaxGrammar;
namespace build_tables {
bool rule_can_be_blank(const rule_ptr &rule);
bool rule_can_be_blank(const rule_ptr &rule, const SyntaxGrammar &grammar);
} // namespace build_tables
} // namespace tree_sitter

View file

@ -2,7 +2,8 @@
#include "compiler/prepare_grammar/prepare_grammar.h"
#include "compiler/build_tables/build_tables.h"
#include "compiler/generate_code/c_code.h"
#include "compiler/prepared_grammar.h"
#include "compiler/syntax_grammar.h"
#include "compiler/lexical_grammar.h"
namespace tree_sitter {

View file

@ -7,7 +7,8 @@
#include "compiler/generate_code/c_code.h"
#include "compiler/lex_table.h"
#include "compiler/parse_table.h"
#include "compiler/prepared_grammar.h"
#include "compiler/syntax_grammar.h"
#include "compiler/lexical_grammar.h"
#include "compiler/rules/built_in_symbols.h"
#include "compiler/util/string_helpers.h"
@ -15,19 +16,15 @@ namespace tree_sitter {
namespace generate_code {
using std::function;
using std::map;
using std::pair;
using std::set;
using std::string;
using std::to_string;
using std::vector;
using util::escape_char;
static RuleEntry ERROR_ENTRY{
"error", rule_ptr(), RuleEntryTypeNamed,
};
static RuleEntry EOF_ENTRY{
"end", rule_ptr(), RuleEntryTypeAuxiliary,
};
static Variable ERROR_ENTRY("error", VariableTypeNamed, rule_ptr());
static Variable EOF_ENTRY("end", VariableTypeNamed, rule_ptr());
static const map<char, string> REPLACEMENTS({
{ '~', "TILDE" },
@ -149,15 +146,15 @@ class CCodeGenerator {
for (const auto &symbol : parse_table.symbols) {
line("[" + symbol_id(symbol) + "] = ");
switch (entry_for_symbol(symbol).type) {
case RuleEntryTypeNamed:
switch (symbol_type(symbol)) {
case VariableTypeNamed:
add("TSNodeTypeNamed,");
break;
case RuleEntryTypeAnonymous:
case VariableTypeAnonymous:
add("TSNodeTypeAnonymous,");
break;
case RuleEntryTypeHidden:
case RuleEntryTypeAuxiliary:
case VariableTypeHidden:
case VariableTypeAuxiliary:
add("TSNodeTypeHidden,");
break;
}
@ -338,15 +335,18 @@ class CCodeGenerator {
}
string symbol_id(const rules::Symbol &symbol) {
RuleEntry entry = entry_for_symbol(symbol);
string name = sanitize_name(entry.name);
if (symbol.is_built_in())
return "ts_builtin_sym_" + name;
if (symbol == rules::ERROR())
return "ts_builtin_sym_error";
if (symbol == rules::END_OF_INPUT())
return "ts_builtin_sym_end";
switch (entry.type) {
case RuleEntryTypeAuxiliary:
auto entry = entry_for_symbol(symbol);
string name = sanitize_name(entry.first);
switch (entry.second) {
case VariableTypeAuxiliary:
return "aux_sym_" + name;
case RuleEntryTypeAnonymous:
case VariableTypeAnonymous:
return "anon_sym_" + name;
default:
return "sym_" + name;
@ -358,26 +358,30 @@ class CCodeGenerator {
return "ERROR";
if (symbol == rules::END_OF_INPUT())
return "END";
return entry_for_symbol(symbol).name;
return entry_for_symbol(symbol).first;
}
const RuleEntry &entry_for_symbol(const rules::Symbol &symbol) {
VariableType symbol_type(const rules::Symbol &symbol) {
if (symbol == rules::ERROR())
return ERROR_ENTRY;
return VariableTypeNamed;
if (symbol == rules::END_OF_INPUT())
return EOF_ENTRY;
if (symbol.is_token)
return lexical_grammar.rules[symbol.index];
else
return syntax_grammar.rules[symbol.index];
return VariableTypeHidden;
return entry_for_symbol(symbol).second;
}
string rule_name(const rules::Symbol &symbol) {
return entry_for_symbol(symbol).name;
pair<string, VariableType> entry_for_symbol(const rules::Symbol &symbol) {
if (symbol.is_token) {
const Variable &variable = lexical_grammar.variables[symbol.index];
return { variable.name, variable.type };
} else {
const SyntaxVariable &variable = syntax_grammar.variables[symbol.index];
return { variable.name, variable.type };
}
}
bool reduce_action_is_fragile(const ParseAction &action) const {
return parse_table.fragile_production_ids.find(action.production_id) !=
return parse_table.fragile_production_ids.find(
{ action.symbol, action.production_id }) !=
parse_table.fragile_production_ids.end();
}

View file

@ -5,10 +5,10 @@
namespace tree_sitter {
struct LexicalGrammar;
struct SyntaxGrammar;
class LexTable;
class LexicalGrammar;
class ParseTable;
class SyntaxGrammar;
namespace generate_code {

View file

@ -0,0 +1,19 @@
#ifndef COMPILER_LEXICAL_GRAMMAR_H_
#define COMPILER_LEXICAL_GRAMMAR_H_
#include <vector>
#include <string>
#include <set>
#include "tree_sitter/compiler.h"
#include "compiler/variable.h"
namespace tree_sitter {
struct LexicalGrammar {
std::vector<Variable> variables;
std::vector<rule_ptr> separators;
};
} // namespace tree_sitter
#endif // COMPILER_LEXICAL_GRAMMAR_H_

View file

@ -60,7 +60,7 @@ ParseAction ParseAction::ReduceExtra(Symbol symbol) {
ParseAction ParseAction::Reduce(Symbol symbol, size_t consumed_symbol_count,
int precedence, Associativity associativity,
int production_id) {
unsigned int production_id) {
return ParseAction(ParseActionTypeReduce, 0, symbol, consumed_symbol_count,
{ precedence }, associativity, production_id);
}

View file

@ -35,7 +35,8 @@ class ParseAction {
static ParseAction Shift(ParseStateId state_index,
std::set<int> precedence_values);
static ParseAction Reduce(rules::Symbol symbol, size_t consumed_symbol_count,
int precedence, Associativity, int production_id);
int precedence, Associativity,
unsigned int production_id);
static ParseAction ShiftExtra();
static ParseAction ReduceExtra(rules::Symbol symbol);
bool operator==(const ParseAction &) const;
@ -87,7 +88,7 @@ class ParseTable {
std::vector<ParseState> states;
std::set<rules::Symbol> symbols;
std::set<int> fragile_production_ids;
std::set<std::pair<rules::Symbol, unsigned int>> fragile_production_ids;
};
} // namespace tree_sitter

View file

@ -2,7 +2,7 @@
#include <vector>
#include <string>
#include <utility>
#include "compiler/prepared_grammar.h"
#include "compiler/prepare_grammar/initial_syntax_grammar.h"
#include "compiler/rules/visitor.h"
#include "compiler/rules/seq.h"
#include "compiler/rules/symbol.h"
@ -42,12 +42,10 @@ class ExpandRepeats : public rules::IdentityRuleFn {
rule_name + string("_repeat") + to_string(++repeat_count);
Symbol repeat_symbol(offset + index);
existing_repeats.push_back({ rule->copy(), repeat_symbol });
aux_rules.push_back({
helper_rule_name,
aux_rules.push_back(Variable(
helper_rule_name, VariableTypeAuxiliary,
Seq::build({ inner_rule, Choice::build({ repeat_symbol.copy(),
make_shared<Blank>() }) }),
RuleEntryTypeAuxiliary,
});
make_shared<Blank>() }) })));
return repeat_symbol.copy();
}
@ -64,21 +62,21 @@ class ExpandRepeats : public rules::IdentityRuleFn {
return apply(rule);
}
vector<RuleEntry> aux_rules;
vector<Variable> aux_rules;
};
SyntaxGrammar expand_repeats(const SyntaxGrammar &grammar) {
SyntaxGrammar result;
result.rules = grammar.rules;
InitialSyntaxGrammar expand_repeats(const InitialSyntaxGrammar &grammar) {
InitialSyntaxGrammar result;
result.variables = grammar.variables;
result.ubiquitous_tokens = grammar.ubiquitous_tokens;
result.expected_conflicts = grammar.expected_conflicts;
ExpandRepeats expander(result.rules.size());
for (auto &rule_entry : result.rules)
rule_entry.rule = expander.expand(rule_entry.rule, rule_entry.name);
ExpandRepeats expander(result.variables.size());
for (auto &variable : result.variables)
variable.rule = expander.expand(variable.rule, variable.name);
result.rules.insert(result.rules.end(), expander.aux_rules.begin(),
expander.aux_rules.end());
result.variables.insert(result.variables.end(), expander.aux_rules.begin(),
expander.aux_rules.end());
return result;
}

View file

@ -4,12 +4,11 @@
#include "tree_sitter/compiler.h"
namespace tree_sitter {
class SyntaxGrammar;
namespace prepare_grammar {
SyntaxGrammar expand_repeats(const SyntaxGrammar &);
struct InitialSyntaxGrammar;
InitialSyntaxGrammar expand_repeats(const InitialSyntaxGrammar &);
} // namespace prepare_grammar
} // namespace tree_sitter

View file

@ -3,7 +3,7 @@
#include <string>
#include <utility>
#include <map>
#include "compiler/prepared_grammar.h"
#include "compiler/lexical_grammar.h"
#include "compiler/rules/visitor.h"
#include "compiler/rules/pattern.h"
#include "compiler/rules/string.h"
@ -68,11 +68,11 @@ pair<LexicalGrammar, const GrammarError *> expand_tokens(
LexicalGrammar result;
ExpandTokens expander;
for (auto &entry : grammar.rules) {
auto rule = expander.apply(entry.rule);
for (const Variable &variable : grammar.variables) {
auto rule = expander.apply(variable.rule);
if (expander.error)
return { result, expander.error };
result.rules.push_back({ entry.name, rule, entry.type });
result.variables.push_back(Variable(variable.name, variable.type, rule));
}
for (auto &sep : grammar.separators) {

View file

@ -6,7 +6,7 @@
namespace tree_sitter {
class LexicalGrammar;
struct LexicalGrammar;
namespace prepare_grammar {

View file

@ -0,0 +1,57 @@
#include "compiler/prepare_grammar/extract_choices.h"
#include <vector>
#include <memory>
#include "compiler/rules/visitor.h"
#include "compiler/rules/seq.h"
#include "compiler/rules/choice.h"
#include "compiler/rules/metadata.h"
#include "compiler/rules/repeat.h"
namespace tree_sitter {
namespace prepare_grammar {
using std::make_shared;
using std::vector;
class ExtractChoices : public rules::RuleFn<vector<rule_ptr>> {
vector<rule_ptr> default_apply(const Rule *rule) {
return vector<rule_ptr>({ rule->copy() });
}
vector<rule_ptr> apply_to(const rules::Seq *rule) {
vector<rule_ptr> result;
for (auto left_entry : apply(rule->left))
for (auto right_entry : apply(rule->right))
result.push_back(rules::Seq::build({ left_entry, right_entry }));
return result;
}
vector<rule_ptr> apply_to(const rules::Metadata *rule) {
vector<rule_ptr> result;
for (auto entry : apply(rule->rule))
result.push_back(make_shared<rules::Metadata>(entry, rule->value));
return result;
}
vector<rule_ptr> apply_to(const rules::Choice *rule) {
vector<rule_ptr> result;
for (auto element : rule->elements)
for (auto entry : apply(element))
result.push_back(entry);
return result;
}
vector<rule_ptr> apply_to(const rules::Repeat *rule) {
vector<rule_ptr> result;
for (auto element : apply(rule->content))
result.push_back(make_shared<rules::Repeat>(element));
return result;
}
};
std::vector<rule_ptr> extract_choices(const rule_ptr &rule) {
return ExtractChoices().apply(rule);
}
} // namespace prepare_grammar
} // namespace tree_sitter

View file

@ -0,0 +1,15 @@
#ifndef COMPILER_PREPARE_GRAMMAR_EXTRACT_CHOICES_H_
#define COMPILER_PREPARE_GRAMMAR_EXTRACT_CHOICES_H_
#include <vector>
#include "tree_sitter/compiler.h"
namespace tree_sitter {
namespace prepare_grammar {
std::vector<rule_ptr> extract_choices(const rule_ptr &);
} // namespace prepare_grammar
} // namespace tree_sitter
#endif // COMPILER_PREPARE_GRAMMAR_EXTRACT_CHOICES_H_

View file

@ -5,7 +5,8 @@
#include <string>
#include <tuple>
#include "tree_sitter/compiler.h"
#include "compiler/prepared_grammar.h"
#include "compiler/lexical_grammar.h"
#include "compiler/prepare_grammar/initial_syntax_grammar.h"
#include "compiler/rules/visitor.h"
#include "compiler/rules/symbol.h"
#include "compiler/rules/string.h"
@ -56,7 +57,7 @@ class SymbolReplacer : public rules::IdentityRuleFn {
class TokenExtractor : public rules::IdentityRuleFn {
using rules::IdentityRuleFn::apply_to;
rule_ptr apply_to_token(const Rule *input, RuleEntryType entry_type) {
rule_ptr apply_to_token(const Rule *input, VariableType entry_type) {
for (size_t i = 0; i < tokens.size(); i++)
if (tokens[i].rule->operator==(*input)) {
token_usage_counts[i]++;
@ -65,31 +66,29 @@ class TokenExtractor : public rules::IdentityRuleFn {
rule_ptr rule = input->copy();
size_t index = tokens.size();
tokens.push_back({
token_description(rule), rule, entry_type,
});
tokens.push_back(Variable(token_description(rule), entry_type, rule));
token_usage_counts.push_back(1);
return make_shared<Symbol>(index, true);
}
rule_ptr apply_to(const rules::String *rule) {
return apply_to_token(rule, RuleEntryTypeAnonymous);
return apply_to_token(rule, VariableTypeAnonymous);
}
rule_ptr apply_to(const rules::Pattern *rule) {
return apply_to_token(rule, RuleEntryTypeAuxiliary);
return apply_to_token(rule, VariableTypeAuxiliary);
}
rule_ptr apply_to(const rules::Metadata *rule) {
if (rule->value_for(rules::IS_TOKEN) > 0)
return apply_to_token(rule->rule.get(), RuleEntryTypeAuxiliary);
return apply_to_token(rule->rule.get(), VariableTypeAuxiliary);
else
return rules::IdentityRuleFn::apply_to(rule);
}
public:
vector<size_t> token_usage_counts;
vector<RuleEntry> tokens;
vector<Variable> tokens;
};
static const GrammarError *ubiq_token_err(const string &message) {
@ -97,9 +96,9 @@ static const GrammarError *ubiq_token_err(const string &message) {
"Not a token: " + message);
}
tuple<SyntaxGrammar, LexicalGrammar, const GrammarError *> extract_tokens(
tuple<InitialSyntaxGrammar, LexicalGrammar, const GrammarError *> extract_tokens(
const InternedGrammar &grammar) {
SyntaxGrammar syntax_grammar;
InitialSyntaxGrammar syntax_grammar;
LexicalGrammar lexical_grammar;
SymbolReplacer symbol_replacer;
TokenExtractor extractor;
@ -107,31 +106,30 @@ tuple<SyntaxGrammar, LexicalGrammar, const GrammarError *> extract_tokens(
/*
* First, extract all of the grammar's tokens into the lexical grammar.
*/
vector<RuleEntry> processed_rules;
for (const RuleEntry &entry : grammar.rules)
processed_rules.push_back({
entry.name, extractor.apply(entry.rule), entry.type,
});
lexical_grammar.rules = extractor.tokens;
vector<Variable> processed_variables;
for (const Variable &variable : grammar.variables)
processed_variables.push_back(
Variable(variable.name, variable.type, extractor.apply(variable.rule)));
lexical_grammar.variables = extractor.tokens;
/*
* If a rule's entire content was extracted as a token and that token didn't
* appear within any other rule, then remove that rule from the syntax
* If a variable's entire rule was extracted as a token and that token didn't
* appear within any other rule, then remove that variable from the syntax
* grammar, giving its name to the token in the lexical grammar. Any symbols
* that pointed to that rule will need to be updated to point to the rule in
* the lexical grammar. Symbols that pointed to later rules will need to have
* their indices decremented.
* that pointed to that variable will need to be updated to point to the
* variable in the lexical grammar. Symbols that pointed to later variables
* will need to have their indices decremented.
*/
size_t i = 0;
for (const RuleEntry &entry : processed_rules) {
auto symbol = dynamic_pointer_cast<const Symbol>(entry.rule);
for (const Variable &variable : processed_variables) {
auto symbol = dynamic_pointer_cast<const Symbol>(variable.rule);
if (symbol.get() && symbol->is_token && !symbol->is_built_in() &&
extractor.token_usage_counts[symbol->index] == 1) {
lexical_grammar.rules[symbol->index].type = entry.type;
lexical_grammar.rules[symbol->index].name = entry.name;
lexical_grammar.variables[symbol->index].type = variable.type;
lexical_grammar.variables[symbol->index].name = variable.name;
symbol_replacer.replacements.insert({ Symbol(i), *symbol });
} else {
syntax_grammar.rules.push_back(entry);
syntax_grammar.variables.push_back(variable);
}
i++;
}
@ -139,14 +137,14 @@ tuple<SyntaxGrammar, LexicalGrammar, const GrammarError *> extract_tokens(
/*
* Perform any replacements of symbols needed based on the previous step.
*/
for (RuleEntry &entry : syntax_grammar.rules)
entry.rule = symbol_replacer.apply(entry.rule);
for (Variable &variable : syntax_grammar.variables)
variable.rule = symbol_replacer.apply(variable.rule);
for (auto &symbol_set : grammar.expected_conflicts) {
set<Symbol> new_symbol_set;
for (const Symbol &symbol : symbol_set)
new_symbol_set.insert(symbol_replacer.replace_symbol(symbol));
syntax_grammar.expected_conflicts.insert(new_symbol_set);
for (const ConflictSet &conflict_set : grammar.expected_conflicts) {
ConflictSet new_conflict_set;
for (const Symbol &symbol : conflict_set)
new_conflict_set.insert(symbol_replacer.replace_symbol(symbol));
syntax_grammar.expected_conflicts.insert(new_conflict_set);
}
/*
@ -171,7 +169,7 @@ tuple<SyntaxGrammar, LexicalGrammar, const GrammarError *> extract_tokens(
if (!new_symbol.is_token)
return make_tuple(
syntax_grammar, lexical_grammar,
ubiq_token_err(syntax_grammar.rules[new_symbol.index].name));
ubiq_token_err(syntax_grammar.variables[new_symbol.index].name));
syntax_grammar.ubiquitous_tokens.insert(new_symbol);
}

View file

@ -3,18 +3,15 @@
#include <tuple>
#include "tree_sitter/compiler.h"
#include "compiler/lexical_grammar.h"
#include "compiler/prepare_grammar/initial_syntax_grammar.h"
#include "compiler/prepare_grammar/interned_grammar.h"
namespace tree_sitter {
class Grammar;
class SyntaxGrammar;
class LexicalGrammar;
namespace prepare_grammar {
std::tuple<SyntaxGrammar, LexicalGrammar, const GrammarError *> extract_tokens(
const InternedGrammar &);
std::tuple<InitialSyntaxGrammar, LexicalGrammar, const GrammarError *>
extract_tokens(const InternedGrammar &);
} // namespace prepare_grammar
} // namespace tree_sitter

View file

@ -0,0 +1,154 @@
#include "compiler/prepare_grammar/flatten_grammar.h"
#include "compiler/prepare_grammar/extract_choices.h"
#include "compiler/prepare_grammar/initial_syntax_grammar.h"
#include "compiler/rules/visitor.h"
#include "compiler/rules/seq.h"
#include "compiler/rules/symbol.h"
#include "compiler/rules/metadata.h"
#include "compiler/rules/built_in_symbols.h"
#include <string>
#include <algorithm>
namespace tree_sitter {
namespace prepare_grammar {
using std::find;
using std::string;
using std::vector;
class FlattenRule : public rules::RuleFn<void> {
public:
bool has_pending_precedence;
int pending_precedence;
vector<int> precedence_stack;
bool has_pending_associativity;
Associativity pending_associativity;
vector<Associativity> associativity_stack;
Production production;
FlattenRule()
: has_pending_precedence(false),
pending_precedence(0),
has_pending_associativity(false),
pending_associativity(AssociativityNone) {}
void apply_to(const rules::Symbol *sym) {
production.push_back(
ProductionStep(*sym, current_precedence(), current_associativity()));
if (has_pending_precedence) {
precedence_stack.push_back(pending_precedence);
has_pending_precedence = false;
}
if (has_pending_associativity) {
associativity_stack.push_back(pending_associativity);
has_pending_associativity = false;
}
}
void apply_to(const rules::Metadata *metadata) {
int precedence = metadata->value_for(rules::PRECEDENCE);
int associativity = metadata->value_for(rules::ASSOCIATIVITY);
if (precedence != 0) {
pending_precedence = precedence;
has_pending_precedence = true;
}
if (associativity != 0) {
pending_associativity = static_cast<Associativity>(associativity);
has_pending_associativity = true;
}
apply(metadata->rule);
if (precedence != 0)
precedence_stack.pop_back();
if (associativity != 0)
associativity_stack.pop_back();
}
void apply_to(const rules::Seq *seq) {
apply(seq->left);
apply(seq->right);
}
private:
int current_precedence() {
if (precedence_stack.empty())
return 0;
else
return precedence_stack.back();
}
Associativity current_associativity() {
if (associativity_stack.empty())
return AssociativityNone;
else
return associativity_stack.back();
}
};
Production flatten_rule(const rule_ptr &rule) {
FlattenRule flattener;
flattener.apply(rule);
return flattener.production;
}
struct ProductionSlice {
vector<ProductionStep>::const_iterator start;
vector<ProductionStep>::const_iterator end;
bool operator==(const ProductionSlice &other) const {
if (end - start != other.end - other.start)
return false;
for (auto iter1 = start, iter2 = other.start; iter1 != end; ++iter1, ++iter2)
if (!(iter1->symbol == iter2->symbol &&
iter1->precedence == iter2->precedence &&
iter1->associativity == iter2->associativity))
return false;
return true;
}
};
void assign_rule_ids(Production *production,
vector<ProductionSlice> *unique_slices) {
auto end = production->end();
for (auto iter = production->begin(); iter != end; ++iter) {
ProductionSlice slice{ iter, end };
auto existing_id =
find(unique_slices->cbegin(), unique_slices->cend(), slice);
if (existing_id == unique_slices->end()) {
unique_slices->push_back(slice);
iter->rule_id = unique_slices->size();
} else {
iter->rule_id = existing_id - unique_slices->cbegin() + 1;
}
}
}
SyntaxGrammar flatten_grammar(const InitialSyntaxGrammar &grammar) {
SyntaxGrammar result;
result.expected_conflicts = grammar.expected_conflicts;
result.ubiquitous_tokens = grammar.ubiquitous_tokens;
for (const Variable &variable : grammar.variables) {
vector<Production> productions;
for (const rule_ptr &rule_component : extract_choices(variable.rule))
productions.push_back(flatten_rule(rule_component));
result.variables.push_back(
SyntaxVariable(variable.name, variable.type, productions));
}
vector<ProductionSlice> unique_slices;
for (SyntaxVariable &variable : result.variables)
for (Production &production : variable.productions)
assign_rule_ids(&production, &unique_slices);
return result;
}
} // namespace prepare_grammar
} // namespace tree_sitter

View file

@ -0,0 +1,13 @@
#include <string>
#include "tree_sitter/compiler.h"
#include "compiler/syntax_grammar.h"
namespace tree_sitter {
namespace prepare_grammar {
struct InitialSyntaxGrammar;
SyntaxGrammar flatten_grammar(const InitialSyntaxGrammar &);
} // namespace prepare_grammar
} // namespace tree_sitter

View file

@ -0,0 +1,24 @@
#ifndef COMPILER_INITIAL_SYNTAX_GRAMMAR_H_
#define COMPILER_INITIAL_SYNTAX_GRAMMAR_H_
#include <vector>
#include <string>
#include <set>
#include "tree_sitter/compiler.h"
#include "compiler/rules/symbol.h"
#include "compiler/variable.h"
#include "compiler/syntax_grammar.h"
namespace tree_sitter {
namespace prepare_grammar {
struct InitialSyntaxGrammar {
std::vector<Variable> variables;
std::set<rules::Symbol> ubiquitous_tokens;
std::set<ConflictSet> expected_conflicts;
};
} // namespace prepare_grammar
} // namespace tree_sitter
#endif // COMPILER_INITIAL_SYNTAX_GRAMMAR_H_

View file

@ -56,10 +56,9 @@ pair<InternedGrammar, const GrammarError *> intern_symbols(const Grammar &gramma
if (!interner.missing_rule_name.empty())
return { result, missing_rule_error(interner.missing_rule_name) };
result.rules.push_back({
pair.first, new_rule,
pair.first[0] == '_' ? RuleEntryTypeHidden : RuleEntryTypeNamed,
});
result.variables.push_back(Variable(
pair.first, pair.first[0] == '_' ? VariableTypeHidden : VariableTypeNamed,
new_rule));
}
for (auto &rule : grammar.ubiquitous_tokens()) {

View file

@ -7,9 +7,6 @@
#include "compiler/prepare_grammar/interned_grammar.h"
namespace tree_sitter {
class Grammar;
namespace prepare_grammar {
std::pair<InternedGrammar, const GrammarError *> intern_symbols(const Grammar &);

View file

@ -5,15 +5,16 @@
#include <vector>
#include "tree_sitter/compiler.h"
#include "compiler/rules/symbol.h"
#include "compiler/prepared_grammar.h"
#include "compiler/syntax_grammar.h"
#include "compiler/variable.h"
namespace tree_sitter {
namespace prepare_grammar {
struct InternedGrammar {
std::vector<RuleEntry> rules;
std::vector<Variable> variables;
std::vector<rule_ptr> ubiquitous_tokens;
std::set<std::set<rules::Symbol>> expected_conflicts;
std::set<ConflictSet> expected_conflicts;
};
} // namespace prepare_grammar

View file

@ -1,10 +1,12 @@
#include "compiler/prepare_grammar/prepare_grammar.h"
#include <tuple>
#include "compiler/prepare_grammar/expand_repeats.h"
#include "compiler/prepare_grammar/expand_tokens.h"
#include "compiler/prepare_grammar/extract_tokens.h"
#include "compiler/prepare_grammar/intern_symbols.h"
#include "compiler/prepared_grammar.h"
#include "compiler/prepare_grammar/flatten_grammar.h"
#include "compiler/lexical_grammar.h"
#include "compiler/prepare_grammar/initial_syntax_grammar.h"
#include "compiler/syntax_grammar.h"
namespace tree_sitter {
namespace prepare_grammar {
@ -28,7 +30,7 @@ tuple<SyntaxGrammar, LexicalGrammar, const GrammarError *> prepare_grammar(
return make_tuple(SyntaxGrammar(), LexicalGrammar(), error);
// Replace `Repeat` rules with pairs of recursive rules
SyntaxGrammar syntax_grammar = expand_repeats(get<0>(extract_result));
InitialSyntaxGrammar syntax_grammar = expand_repeats(get<0>(extract_result));
// Expand `String` and `Pattern` rules into full rule trees
auto expand_tokens_result = expand_tokens(get<1>(extract_result));
@ -37,7 +39,7 @@ tuple<SyntaxGrammar, LexicalGrammar, const GrammarError *> prepare_grammar(
if (error)
return make_tuple(SyntaxGrammar(), LexicalGrammar(), error);
return make_tuple(syntax_grammar, lex_grammar, nullptr);
return make_tuple(flatten_grammar(syntax_grammar), lex_grammar, nullptr);
}
} // namespace prepare_grammar

View file

@ -2,7 +2,8 @@
#define COMPILER_PREPARE_GRAMMAR_PREPARE_GRAMMAR_H_
#include <tuple>
#include "compiler/prepared_grammar.h"
#include "compiler/syntax_grammar.h"
#include "compiler/lexical_grammar.h"
namespace tree_sitter {

View file

@ -1,40 +0,0 @@
#ifndef COMPILER_PREPARED_GRAMMAR_H_
#define COMPILER_PREPARED_GRAMMAR_H_
#include <vector>
#include <string>
#include <set>
#include "tree_sitter/compiler.h"
#include "compiler/rules/symbol.h"
namespace tree_sitter {
enum RuleEntryType {
RuleEntryTypeNamed,
RuleEntryTypeAnonymous,
RuleEntryTypeHidden,
RuleEntryTypeAuxiliary,
};
struct RuleEntry {
std::string name;
rule_ptr rule;
RuleEntryType type;
};
class SyntaxGrammar {
public:
std::vector<RuleEntry> rules;
std::set<rules::Symbol> ubiquitous_tokens;
std::set<std::set<rules::Symbol>> expected_conflicts;
};
class LexicalGrammar {
public:
std::vector<RuleEntry> rules;
std::vector<rule_ptr> separators;
};
} // namespace tree_sitter
#endif // COMPILER_PREPARED_GRAMMAR_H_

View file

@ -15,5 +15,9 @@ Symbol START() {
return Symbol(-3);
}
Symbol NONE() {
return Symbol(-4);
}
} // namespace rules
} // namespace tree_sitter

View file

@ -9,6 +9,7 @@ namespace rules {
Symbol ERROR();
Symbol END_OF_INPUT();
Symbol START();
Symbol NONE();
} // namespace rules
} // namespace tree_sitter

View file

@ -130,6 +130,79 @@ class RuleFn : private Visitor {
T value_;
};
template <>
class RuleFn<void> : private Visitor {
public:
void apply(const rule_ptr &rule) {
rule->accept(this);
}
protected:
virtual void default_apply(const Rule *rule) {}
virtual void apply_to(const Blank *rule) {
return default_apply((const Rule *)rule);
}
virtual void apply_to(const CharacterSet *rule) {
return default_apply((const Rule *)rule);
}
virtual void apply_to(const Choice *rule) {
return default_apply((const Rule *)rule);
}
virtual void apply_to(const Metadata *rule) {
return default_apply((const Rule *)rule);
}
virtual void apply_to(const Pattern *rule) {
return default_apply((const Rule *)rule);
}
virtual void apply_to(const Repeat *rule) {
return default_apply((const Rule *)rule);
}
virtual void apply_to(const Seq *rule) {
return default_apply((const Rule *)rule);
}
virtual void apply_to(const String *rule) {
return default_apply((const Rule *)rule);
}
virtual void apply_to(const NamedSymbol *rule) {
return default_apply((const Rule *)rule);
}
virtual void apply_to(const Symbol *rule) {
return default_apply((const Rule *)rule);
}
void visit(const Blank *rule) {
apply_to(rule);
}
void visit(const CharacterSet *rule) {
apply_to(rule);
}
void visit(const Choice *rule) {
apply_to(rule);
}
void visit(const Metadata *rule) {
apply_to(rule);
}
void visit(const Pattern *rule) {
apply_to(rule);
}
void visit(const Repeat *rule) {
apply_to(rule);
}
void visit(const Seq *rule) {
apply_to(rule);
}
void visit(const String *rule) {
apply_to(rule);
}
void visit(const NamedSymbol *rule) {
apply_to(rule);
}
void visit(const Symbol *rule) {
apply_to(rule);
}
};
class IdentityRuleFn : public RuleFn<rule_ptr> {
protected:
virtual rule_ptr default_apply(const Rule *rule);

View file

@ -0,0 +1,63 @@
#include "compiler/syntax_grammar.h"
#include <vector>
#include <string>
#include <utility>
#include "compiler/rules/symbol.h"
#include "compiler/rules/built_in_symbols.h"
namespace tree_sitter {
using std::string;
using std::to_string;
using std::pair;
using std::vector;
using std::set;
// Productions for the built-in START symbol when the grammar has no
// syntactic variables: START expands directly to symbol index 0.
// NOTE(review): the `true` second argument to rules::Symbol presumably
// marks it as a token — confirm against the Symbol constructor.
static const vector<Production> START_PRODUCTIONS_TOKEN_ONLY({
Production({ ProductionStep(rules::Symbol(0, true), 0, AssociativityNone) }),
});
// Productions for START in the normal case: expand to the grammar's
// first (index 0) syntactic variable.
static const vector<Production> START_PRODUCTIONS({
Production({ ProductionStep(rules::Symbol(0), 0, AssociativityNone) }),
});
// Empty production list returned for symbols that cannot be expanded
// (tokens and built-in symbols).
static const vector<Production> NO_PRODUCTIONS({});
// Constructs a syntactic variable with its name, visibility type, and the
// flattened list of productions it can expand to.
SyntaxVariable::SyntaxVariable(const string &name, VariableType type,
const vector<Production> &productions)
: name(name), productions(productions), type(type) {}
// Constructs a production step with the default rule_id of 0.
// Delegates to the four-argument constructor to avoid duplicating the
// member-initializer list (the file already relies on C++11 features).
ProductionStep::ProductionStep(const rules::Symbol &symbol, int precedence,
                               Associativity associativity)
    : ProductionStep(symbol, precedence, associativity, 0) {}

// Constructs a production step.
//
// symbol        - the grammar symbol consumed by this step
// precedence    - precedence value used for conflict resolution
// associativity - associativity used for conflict resolution
// rule_id       - id of the originating grammar rule
ProductionStep::ProductionStep(const rules::Symbol &symbol, int precedence,
                               Associativity associativity, int rule_id)
    : symbol(symbol),
      precedence(precedence),
      associativity(associativity),
      rule_id(rule_id) {}
// Two production steps are equal when every field matches.
bool ProductionStep::operator==(const ProductionStep &other) const {
  if (!(symbol == other.symbol))
    return false;
  if (precedence != other.precedence)
    return false;
  if (associativity != other.associativity)
    return false;
  return rule_id == other.rule_id;
}
// Returns the productions that the given symbol can expand to.
// The returned reference aliases either this grammar's variable table or
// one of the file-static production lists above; it remains valid for the
// lifetime of the grammar.
const vector<Production> &SyntaxGrammar::productions(
const rules::Symbol &symbol) const {
if (symbol == rules::START()) {
// A grammar with no syntactic variables consists only of tokens, so
// START expands straight to a token symbol.
if (variables.empty())
return START_PRODUCTIONS_TOKEN_ONLY;
else
return START_PRODUCTIONS;
} else if (symbol.is_built_in() || symbol.is_token) {
// Tokens and built-in symbols are terminal: nothing to expand.
return NO_PRODUCTIONS;
} else {
// No bounds check: assumes symbol.index is a valid index into this
// grammar's variables.
return variables[symbol.index].productions;
}
}
} // namespace tree_sitter

View file

@ -0,0 +1,47 @@
#ifndef COMPILER_SYNTAX_GRAMMAR_H_
#define COMPILER_SYNTAX_GRAMMAR_H_
#include <vector>
#include <string>
#include <set>
#include "tree_sitter/compiler.h"
#include "compiler/rules/symbol.h"
#include "compiler/variable.h"
namespace tree_sitter {
// One symbol within a production, annotated with the precedence and
// associativity used when resolving parse conflicts, and the id of the
// grammar rule it originated from.
struct ProductionStep {
  ProductionStep(const rules::Symbol &, int, Associativity);
  ProductionStep(const rules::Symbol &, int, Associativity, int);
  bool operator==(const ProductionStep &) const;
  rules::Symbol symbol;
  int precedence;
  Associativity associativity;
  int rule_id;
};
// A production is a flat sequence of steps (the rule tree has already been
// flattened by this stage of compilation).
typedef std::vector<ProductionStep> Production;
// A named syntactic (non-token) grammar variable and the productions it
// can expand to.
struct SyntaxVariable {
  SyntaxVariable(const std::string &, VariableType,
                 const std::vector<Production> &);
  std::string name;
  std::vector<Production> productions;
  VariableType type;
};
// A set of symbols that the grammar author has declared may conflict.
typedef std::set<rules::Symbol> ConflictSet;
// The grammar's syntactic portion: its variables, the tokens that may
// appear anywhere (e.g. whitespace/comments), and the expected conflicts.
struct SyntaxGrammar {
  const std::vector<Production> &productions(const rules::Symbol &) const;
  std::vector<SyntaxVariable> variables;
  std::set<rules::Symbol> ubiquitous_tokens;
  std::set<ConflictSet> expected_conflicts;
};
}  // namespace tree_sitter
#endif  // COMPILER_SYNTAX_GRAMMAR_H_

11
src/compiler/variable.cc Normal file
View file

@ -0,0 +1,11 @@
#include "compiler/variable.h"
#include <string>
namespace tree_sitter {
using std::string;
// Constructs a grammar variable from its name, visibility type, and the
// (not yet flattened) rule tree that defines it.
Variable::Variable(const string &name, VariableType type, const rule_ptr &rule)
: name(name), rule(rule), type(type) {}
} // namespace tree_sitter

26
src/compiler/variable.h Normal file
View file

@ -0,0 +1,26 @@
#ifndef COMPILER_VARIABLE_H_
#define COMPILER_VARIABLE_H_
#include "tree_sitter/compiler.h"
#include "compiler/rules/symbol.h"
namespace tree_sitter {
// How a grammar variable appears in the resulting syntax tree.
// NOTE(review): meanings inferred from the names — confirm against usage:
// Hidden variables produce no visible node, Auxiliary ones are generated
// internally (e.g. by repeat expansion), Anonymous ones come from inline
// string/pattern rules, and Named ones are the user's named rules.
enum VariableType {
VariableTypeHidden,
VariableTypeAuxiliary,
VariableTypeAnonymous,
VariableTypeNamed,
};
// A named grammar variable paired with the rule tree that defines it.
struct Variable {
Variable(const std::string &, VariableType, const rule_ptr &);
std::string name;
rule_ptr rule;
VariableType type;
};
} // namespace tree_sitter
#endif // COMPILER_VARIABLE_H_