Unify ubiquitous tokens and lexical separators in API

This commit is contained in:
Max Brunsfeld 2014-09-07 22:16:45 -07:00
parent a46f9d950c
commit 1ff7cedf40
29 changed files with 341 additions and 267 deletions

View file

@ -28,6 +28,9 @@ extern const Grammar arithmetic = Grammar({
{ "variable", pattern("\\a[\\w_]*") },
{ "comment", pattern("#.*") },
}).ubiquitous_tokens({ "comment" });
}).ubiquitous_tokens({
sym("comment"),
pattern("\\s"),
});
} // namespace tree_sitter_examples

View file

@ -166,8 +166,10 @@ extern const Grammar golang = Grammar({
{ "_identifier", pattern("\\a[\\w_]*") },
{ "number", pattern("\\d+(\\.\\d+)?") },
{ "comment", keypattern("//[^\n]*") },
})
.ubiquitous_tokens({ "comment", "_line_break" })
.separators({ ' ', '\t', '\r' });
}).ubiquitous_tokens({
sym("comment"),
sym("_line_break"),
pattern("[ \t\r]"),
});
} // namespace tree_sitter_examples

View file

@ -213,8 +213,10 @@ extern const Grammar javascript = Grammar({
{ "null", keyword("null") },
{ "true", keyword("true") },
{ "false", keyword("false") },
})
.ubiquitous_tokens({ "comment", "_line_break" })
.separators({ ' ', '\t', '\r' });
}).ubiquitous_tokens({
sym("comment"),
sym("_line_break"),
pattern("[ \t\r]"),
});
} // namespace tree_sitter_examples

View file

@ -6,7 +6,7 @@ namespace tree_sitter_examples {
using tree_sitter::Grammar;
using namespace tree_sitter::rules;
extern const Grammar json({
extern const Grammar json = Grammar({
{ "value", choice({
sym("object"),
sym("array"),
@ -25,6 +25,8 @@ extern const Grammar json({
{ "null", keyword("null") },
{ "true", keyword("true") },
{ "false", keyword("false") },
}).ubiquitous_tokens({
pattern("\\s"),
});
} // namespace tree_sitter_examples

View file

@ -33,8 +33,7 @@ std::ostream &operator<<(std::ostream &stream, const rules::rule_ptr &rule);
class Grammar {
const std::vector<std::pair<std::string, rules::rule_ptr> > rules_;
std::set<std::string> ubiquitous_tokens_;
std::set<char> separators_;
std::set<rules::rule_ptr> ubiquitous_tokens_;
public:
Grammar(const std::vector<std::pair<std::string, rules::rule_ptr> > &rules);
@ -42,10 +41,9 @@ class Grammar {
std::string start_rule_name() const;
const rules::rule_ptr rule(const std::string &name) const;
const std::vector<std::pair<std::string, rules::rule_ptr> > &rules() const;
const std::set<std::string> &ubiquitous_tokens() const;
Grammar &ubiquitous_tokens(const std::set<std::string> &ubiquitous_tokens);
const std::set<char> &separators() const;
Grammar &separators(const std::set<char> &separators);
const std::set<rules::rule_ptr> &ubiquitous_tokens() const;
Grammar &ubiquitous_tokens(
const std::set<rules::rule_ptr> &ubiquitous_tokens);
};
struct Conflict {
@ -57,7 +55,8 @@ struct Conflict {
enum GrammarErrorType {
GrammarErrorTypeRegex,
GrammarErrorTypeUndefinedSymbol
GrammarErrorTypeUndefinedSymbol,
GrammarErrorTypeInvalidUbiquitousToken
};
class GrammarError {

View file

@ -21,7 +21,7 @@ describe("resolving parse conflicts", []() {
{ "token1", pattern("[a-c]") },
{ "token2", pattern("[b-d]") },
{ "token3", keyword("stuff") },
}, {}, set<char>());
}, {}, {});
describe("lexical conflicts", [&]() {
Symbol sym1(0, SymbolOptionToken);

View file

@ -19,13 +19,13 @@ template<typename K>
class rule_map : public map<K, rule_ptr> {
public:
bool operator==(const map<K, rule_ptr> &other) const {
if (this->size() != other.size()) return false;
for (const auto &pair : *this) {
auto other_pair = other.find(pair.first);
if (other_pair == other.end()) return false;
if (!pair.second->operator==(*other_pair->second)) return false;
}
return true;
if (this->size() != other.size()) return false;
for (const auto &pair : *this) {
auto other_pair = other.find(pair.first);
if (other_pair == other.end()) return false;
if (!pair.second->operator==(*other_pair->second)) return false;
}
return true;
}
rule_map(const initializer_list<pair<const K, rule_ptr>> &list) : map<K, rule_ptr>(list) {}
@ -34,19 +34,35 @@ class rule_map : public map<K, rule_ptr> {
// Test helper: an ordered list of (rule name, rule) pairs that compares
// rules by *value* (via Rule::operator==) instead of by shared_ptr identity,
// so spec assertions like Equals(rule_list({...})) behave as expected.
// NOTE(review): the original span contained the operator== body twice
// (diff-extraction residue); the duplicate, unreachable copy is removed here.
class rule_list : public vector<pair<string, rule_ptr>> {
 public:
  bool operator==(const vector<pair<string, rule_ptr>> &other) const {
    if (this->size() != other.size()) return false;
    for (size_t i = 0; i < this->size(); i++) {
      auto pair = this->operator[](i);
      auto other_pair = other[i];
      // Names are not compared here — only the rule values at each position.
      if (!pair.second->operator==(*other_pair.second))
        return false;
    }
    return true;
  }

  rule_list(const initializer_list<pair<string, rule_ptr>> &list) :
    vector<pair<string, rule_ptr>>(list) {}
};
// Test helper: a vector of rules that compares elements by *value*
// (via Rule::operator==) instead of by shared_ptr identity, so spec
// assertions like Equals(rule_vector({...})) behave as expected.
class rule_vector : public vector<rule_ptr> {
 public:
  bool operator==(const vector<rule_ptr> &other) const {
    if (this->size() != other.size()) return false;
    for (size_t i = 0; i < this->size(); i++) {
      auto rule = this->operator[](i);
      auto other_rule = other[i];
      // BUG FIX: the original compared `rule` against itself
      // (`rule->operator==(*rule)`), which made every pair of same-sized
      // vectors compare equal. Compare against the element from `other`.
      if (!rule->operator==(*other_rule))
        return false;
    }
    return true;
  }

  rule_vector(const initializer_list<rule_ptr> &list) :
    vector<rule_ptr>(list) {}
};
#endif // HELPERS_CONTAINERS_H_

View file

@ -77,7 +77,7 @@ describe("expand_repeats", []() {
auto match = expand_repeats(grammar);
AssertThat(match.rules, Equals(rule_list({
{ "rule0", seq({
{ "rule0", seq({
choice({ i_aux_sym(0), blank() }),
choice({ i_aux_sym(1), blank() }) }) },
})));

View file

@ -1,7 +1,6 @@
#include "compiler/compiler_spec_helper.h"
#include "compiler/prepared_grammar.h"
#include "compiler/prepare_grammar/extract_tokens.h"
#include "compiler/prepare_grammar/interned_grammar.h"
#include "compiler/prepared_grammar.h"
#include "compiler/helpers/containers.h"
@ -9,175 +8,226 @@ START_TEST
using namespace rules;
using prepare_grammar::extract_tokens;
using prepare_grammar::InternedGrammar;
describe("extracting tokens from a grammar", []() {
describe("extract_tokens", []() {
it("moves string rules into the lexical grammar", [&]() {
pair<SyntaxGrammar, LexicalGrammar> result = extract_tokens(InternedGrammar{
{
tuple<SyntaxGrammar, LexicalGrammar, const GrammarError *> result =
extract_tokens(Grammar({
{ "rule_A", seq({ str("ab"), i_sym(0) }) }
},
set<Symbol>(),
set<char>()
});
}));
AssertThat(result.first.rules, Equals(rule_list({
AssertThat(get<0>(result).rules, Equals(rule_list({
{ "rule_A", seq({ i_aux_token(0), i_sym(0) }) }
})));
AssertThat(result.first.aux_rules, IsEmpty())
AssertThat(result.second.rules, IsEmpty())
AssertThat(result.second.aux_rules, Equals(rule_list({
AssertThat(get<0>(result).aux_rules, IsEmpty())
AssertThat(get<1>(result).rules, IsEmpty())
AssertThat(get<1>(result).aux_rules, Equals(rule_list({
{ "'ab'", str("ab") },
})));
});
it("moves pattern rules into the lexical grammar", [&]() {
pair<SyntaxGrammar, LexicalGrammar> result = extract_tokens(InternedGrammar{
{
{ "rule_A", seq({ pattern("a+"), i_sym(0) }) }
},
set<Symbol>(),
set<char>()
});
auto result = extract_tokens(Grammar({
{ "rule_A", seq({ pattern("a+"), i_sym(0) }) }
}));
AssertThat(result.first.rules, Equals(rule_list({
AssertThat(get<0>(result).rules, Equals(rule_list({
{ "rule_A", seq({ i_aux_token(0), i_sym(0) }) }
})));
AssertThat(result.first.aux_rules, IsEmpty())
AssertThat(result.second.rules, IsEmpty())
AssertThat(result.second.aux_rules, Equals(rule_list({
AssertThat(get<0>(result).aux_rules, IsEmpty())
AssertThat(get<1>(result).rules, IsEmpty())
AssertThat(get<1>(result).aux_rules, Equals(rule_list({
{ "/a+/", pattern("a+") },
})));
});
it("moves other rules marked as tokens into the lexical grammar", [&]() {
pair<SyntaxGrammar, LexicalGrammar> result = extract_tokens(InternedGrammar{
{
{ "rule_A", seq({
token(seq({ pattern("."), choice({ str("a"), str("b") }) })),
i_sym(0) }) }
},
set<Symbol>(),
set<char>()
});
auto result = extract_tokens(Grammar({
{ "rule_A", seq({
token(seq({ pattern("."), choice({ str("a"), str("b") }) })),
i_sym(0) }) }
}));
AssertThat(result.first.rules, Equals(rule_list({
AssertThat(get<0>(result).rules, Equals(rule_list({
{ "rule_A", seq({ i_aux_token(0), i_sym(0) }) }
})));
AssertThat(result.first.aux_rules, IsEmpty())
AssertThat(result.second.rules, IsEmpty())
AssertThat(result.second.aux_rules, Equals(rule_list({
AssertThat(get<0>(result).aux_rules, IsEmpty())
AssertThat(get<1>(result).rules, IsEmpty())
AssertThat(get<1>(result).aux_rules, Equals(rule_list({
{ "(seq /./ (choice 'a' 'b'))", token(seq({ pattern("."), choice({ str("a"), str("b") }) })) },
})));
});
it("does not extract blanks", [&]() {
pair<SyntaxGrammar, LexicalGrammar> result = extract_tokens(InternedGrammar{
{
{ "rule_A", choice({ i_sym(0), blank() }) },
},
set<Symbol>(),
set<char>()
});
it("does not move blank rules", [&]() {
auto result = extract_tokens(Grammar({
{ "rule_A", choice({ i_sym(0), blank() }) },
}));
AssertThat(result.first.rules, Equals(rule_list({
AssertThat(get<0>(result).rules, Equals(rule_list({
{ "rule_A", choice({ i_sym(0), blank() }) },
})));
AssertThat(result.first.aux_rules, IsEmpty())
AssertThat(result.second.rules, IsEmpty())
AssertThat(result.second.aux_rules, IsEmpty())
AssertThat(get<0>(result).aux_rules, IsEmpty())
AssertThat(get<1>(result).rules, IsEmpty())
AssertThat(get<1>(result).aux_rules, IsEmpty())
});
it("does not create duplicate tokens in the lexical grammar", [&]() {
pair<SyntaxGrammar, LexicalGrammar> result = extract_tokens(InternedGrammar{
{
{ "rule_A", seq({ str("ab"), i_sym(0), str("ab") }) },
},
set<Symbol>(),
set<char>()
});
auto result = extract_tokens(Grammar({
{ "rule_A", seq({ str("ab"), i_sym(0), str("ab") }) },
}));
AssertThat(result.first.rules, Equals(rule_list({
AssertThat(get<0>(result).rules, Equals(rule_list({
{ "rule_A", seq({ i_aux_token(0), i_sym(0), i_aux_token(0) }) }
})));
AssertThat(result.first.aux_rules, IsEmpty())
AssertThat(result.second.rules, IsEmpty())
AssertThat(result.second.aux_rules, Equals(rule_list({
AssertThat(get<0>(result).aux_rules, IsEmpty())
AssertThat(get<1>(result).rules, IsEmpty())
AssertThat(get<1>(result).aux_rules, Equals(rule_list({
{ "'ab'", str("ab") },
})))
});
it("preserves the separator characters in the lexical grammar", [&]() {
pair<SyntaxGrammar, LexicalGrammar> result = extract_tokens(InternedGrammar{
{
{ "rule_A", str("ab") },
},
set<Symbol>(),
{ 'x', 'y', 'z' }
});
AssertThat(result.second.separators, Equals(set<char>({ 'x', 'y', 'z' })));
});
describe("when an entire rule can be extracted", [&]() {
it("moves the rule the lexical grammar when possible and updates referencing symbols", [&]() {
auto result = extract_tokens(InternedGrammar{
{
{ "rule_A", i_sym(1) },
{ "rule_B", pattern("a|b") },
{ "rule_C", token(seq({ str("a"), str("b") })) },
},
set<Symbol>(),
set<char>()
});
auto result = extract_tokens(Grammar({
{ "rule_A", i_sym(1) },
{ "rule_B", pattern("a|b") },
{ "rule_C", token(seq({ str("a"), str("b") })) },
}));
AssertThat(result.first.rules, Equals(rule_list({
AssertThat(get<0>(result).rules, Equals(rule_list({
{ "rule_A", i_token(0) }
})));
AssertThat(result.first.aux_rules, IsEmpty());
AssertThat(result.second.rules, Equals(rule_list({
AssertThat(get<0>(result).aux_rules, IsEmpty());
AssertThat(get<1>(result).rules, Equals(rule_list({
{ "rule_B", pattern("a|b") },
{ "rule_C", token(seq({ str("a"), str("b") })) },
})));
AssertThat(result.second.aux_rules, IsEmpty());
AssertThat(get<1>(result).aux_rules, IsEmpty());
});
it("updates symbols whose indices need to change due to deleted rules", [&]() {
auto result = extract_tokens(InternedGrammar{
{
{ "rule_A", str("ab") },
{ "rule_B", i_sym(0) },
{ "rule_C", i_sym(1) },
},
set<Symbol>(),
set<char>()
});
auto result = extract_tokens(Grammar({
{ "rule_A", str("ab") },
{ "rule_B", i_sym(0) },
{ "rule_C", i_sym(1) },
}));
AssertThat(result.first.rules, Equals(rule_list({
AssertThat(get<0>(result).rules, Equals(rule_list({
{ "rule_B", i_token(0) },
{ "rule_C", i_sym(0) },
})));
AssertThat(result.first.aux_rules, IsEmpty());
AssertThat(result.second.rules, Equals(rule_list({
AssertThat(get<0>(result).aux_rules, IsEmpty());
AssertThat(get<1>(result).rules, Equals(rule_list({
{ "rule_A", str("ab") },
})));
AssertThat(result.second.aux_rules, IsEmpty());
AssertThat(get<1>(result).aux_rules, IsEmpty());
});
});
describe("handling ubiquitous tokens!", [&]() {
describe("ubiquitous tokens that are not symbols", [&]() {
it("adds them to the lexical grammar's separators", [&]() {
auto result = extract_tokens(Grammar({
{ "rule_A", str("x") },
}).ubiquitous_tokens({
pattern("\\s+"),
str("y"),
}));
AssertThat(get<2>(result), Equals<const GrammarError *>(nullptr));
AssertThat(get<1>(result).separators, Equals(rule_vector({
pattern("\\s+"),
str("y"),
})));
AssertThat(get<0>(result).ubiquitous_tokens, IsEmpty());
});
});
it("updates the grammar's ubiquitous_tokens", [&]() {
auto result = extract_tokens(InternedGrammar{
{
{ "rule_A", str("ab") },
{ "rule_B", i_sym(0) },
{ "rule_C", i_sym(1) },
},
{ Symbol(0) },
set<char>()
});
describe("ubiquitous tokens that point to moved rules", [&]() {
it("updates them according to the new symbol numbers", [&]() {
auto result = extract_tokens(Grammar( {
{ "rule_A", seq({ str("w"), i_sym(1) }) },
{ "rule_B", str("x") },
{ "rule_C", str("y") },
}).ubiquitous_tokens({
i_sym(2),
}));
AssertThat(result.first.ubiquitous_tokens, Equals(set<Symbol>({
{ Symbol(0, SymbolOptionToken) }
})));
AssertThat(get<2>(result), Equals<const GrammarError *>(nullptr));
AssertThat(get<0>(result).ubiquitous_tokens, Equals(set<Symbol>({
{ Symbol(1, SymbolOptionToken) },
})));
AssertThat(get<1>(result).separators, IsEmpty());
});
});
describe("ubiquitous tokens that are visible", [&]() {
it("preserves them in the syntactic grammar", [&]() {
auto result = extract_tokens(Grammar({
{ "rule_A", str("ab") },
{ "rule_B", str("bc") },
}).ubiquitous_tokens({ i_sym(1) }));
AssertThat(get<2>(result), Equals<const GrammarError *>(nullptr));
AssertThat(get<0>(result).ubiquitous_tokens, Equals(set<Symbol>({
Symbol(1, SymbolOptionToken)
})));
AssertThat(get<1>(result).separators, IsEmpty());
});
});
describe("ubiquitous tokens that are used in other grammar rules", [&]() {
it("preserves them in the syntactic grammar", [&]() {
auto result = extract_tokens(Grammar({
{ "rule_A", seq({ i_sym(1), str("ab") }) },
{ "_rule_B", str("bc") },
}).ubiquitous_tokens({ i_sym(1) }));
AssertThat(get<2>(result), Equals<const GrammarError *>(nullptr));
AssertThat(get<0>(result).ubiquitous_tokens, Equals(set<Symbol>({
Symbol(0, SymbolOptionToken),
})));
AssertThat(get<1>(result).separators, IsEmpty());
});
});
describe("ubiquitous tokens that are non-token symbols", [&]() {
it("returns an error", [&]() {
auto result = extract_tokens(Grammar({
{ "rule_A", seq({ str("x"), i_sym(1) }), },
{ "rule_B", seq({ str("y"), str("z") }) },
}).ubiquitous_tokens({ i_sym(1) }));
AssertThat(get<2>(result), !Equals<const GrammarError *>(nullptr));
AssertThat(get<2>(result), EqualsPointer(new GrammarError(GrammarErrorTypeInvalidUbiquitousToken, "Not a token: (sym 1)")));
});
});
describe("ubiquitous tokens that are non-token symbols", [&]() {
it("returns an error", [&]() {
auto result = extract_tokens(Grammar({
{ "rule_A", str("x") },
{ "rule_B", str("y") },
}).ubiquitous_tokens({ choice({ i_sym(1), blank() }) }));
AssertThat(get<2>(result), !Equals<const GrammarError *>(nullptr));
AssertThat(get<2>(result), EqualsPointer(new GrammarError(GrammarErrorTypeInvalidUbiquitousToken, "Not a token: (choice (sym 1) (blank))")));
});
});
});
});

View file

@ -21,7 +21,7 @@ describe("interning symbols in a grammar", []() {
auto result = intern_symbols(grammar);
AssertThat(result.second, Equals((GrammarError *)nullptr));
AssertThat(result.first.rules, Equals(rule_list({
AssertThat(result.first.rules(), Equals(rule_list({
{ "x", choice({ i_sym(1), i_sym(2) }) },
{ "y", i_sym(2) },
{ "z", str("stuff") },
@ -45,24 +45,13 @@ describe("interning symbols in a grammar", []() {
{ "x", choice({ sym("y"), sym("z") }) },
{ "y", sym("z") },
{ "z", str("stuff") }
}).ubiquitous_tokens({ "z" });
}).ubiquitous_tokens({ sym("z") });
auto result = intern_symbols(grammar);
AssertThat(result.second, Equals((GrammarError *)nullptr));
AssertThat(result.first.ubiquitous_tokens, Equals(set<Symbol>({
Symbol(2)
})));
});
it("preserves the grammar's separator character set", [&]() {
auto grammar = Grammar({
{ "z", str("stuff") }
}).separators({ 'x', 'y' });
auto result = intern_symbols(grammar);
AssertThat(result.first.separators, Equals(set<char>({ 'x', 'y' })))
AssertThat(result.first.ubiquitous_tokens().size(), Equals<size_t>(1));
AssertThat(*result.first.ubiquitous_tokens().begin(), EqualsPointer(i_sym(2)));
});
});

View file

@ -102,7 +102,7 @@ describe("Document", [&]() {
it("updates the parse tree", [&]() {
AssertThat(string(ts_node_string(ts_document_root_node(doc))), Equals(
"(DOCUMENT (exponent "
"(variable) "
"(variable) "
"(group (sum (number) (product (variable) (number))))))"));
});

View file

@ -3,10 +3,12 @@
#include <utility>
#include <map>
#include <set>
#include <vector>
#include <unordered_map>
#include "compiler/prepared_grammar.h"
#include "compiler/rules/built_in_symbols.h"
#include "compiler/rules/metadata.h"
#include "compiler/rules/choice.h"
#include "compiler/rules/repeat.h"
#include "compiler/rules/blank.h"
#include "compiler/rules/seq.h"
@ -22,6 +24,8 @@ using std::map;
using std::unordered_map;
using std::set;
using std::make_shared;
using std::vector;
using std::dynamic_pointer_cast;
using rules::Symbol;
using rules::CharacterSet;
@ -101,17 +105,24 @@ class LexTableBuilder {
lex_table.state(state_id).is_token_start = true;
}
CharacterSet separator_set() const {
CharacterSet result;
for (char c : lex_grammar.separators)
result.include(c);
return result;
// Builds a single rule matching any run of separator content.
// TODO - remove this hack: nested repeats currently cause item sets that
// are equivalent to appear unequal, so a separator that is itself a Repeat
// is unwrapped to its content before being re-wrapped in the outer repeat.
rules::rule_ptr separators() const {
  std::vector<rules::rule_ptr> parts;
  for (auto &separator_rule : lex_grammar.separators) {
    auto nested_repeat =
        dynamic_pointer_cast<const rules::Repeat>(separator_rule);
    if (nested_repeat.get())
      parts.push_back(nested_repeat->content);
    else
      parts.push_back(separator_rule);
  }
  return rules::repeat(rules::choice(parts));
}
rules::rule_ptr after_separators(rules::rule_ptr rule) {
return rules::Seq::Build(
{ make_shared<rules::Metadata>(
make_shared<rules::Repeat>(separator_set().copy()),
separators(),
map<rules::MetadataKey, int>(
{ { rules::START_TOKEN, 1 }, { rules::PRECEDENCE, -1 }, })),
rule, });

View file

@ -137,7 +137,9 @@ class CCodeGenerator {
line("#pragma GCC diagnostic push");
line("#pragma GCC diagnostic ignored \"-Wmissing-field-initializers\"");
line();
line("static const TSParseAction ts_parse_actions[STATE_COUNT][SYMBOL_COUNT] = {");
line(
"static const TSParseAction "
"ts_parse_actions[STATE_COUNT][SYMBOL_COUNT] = {");
indent([&]() {
for (auto &state : parse_table.states) {

View file

@ -12,9 +12,7 @@ using rules::rule_ptr;
Grammar::Grammar(
const std::vector<std::pair<std::string, rules::rule_ptr> > &rules)
: rules_(rules),
ubiquitous_tokens_({}),
separators_({ ' ', '\r', '\t', '\n' }) {}
: rules_(rules), ubiquitous_tokens_({}) {}
bool Grammar::operator==(const Grammar &other) const {
if (other.rules_.size() != rules_.size())
@ -63,22 +61,15 @@ ostream &operator<<(ostream &stream, const GrammarError *error) {
return stream << string("#<null>");
}
const set<string> &Grammar::ubiquitous_tokens() const {
const set<rule_ptr> &Grammar::ubiquitous_tokens() const {
return ubiquitous_tokens_;
}
Grammar &Grammar::ubiquitous_tokens(const set<string> &ubiquitous_tokens) {
Grammar &Grammar::ubiquitous_tokens(const set<rule_ptr> &ubiquitous_tokens) {
ubiquitous_tokens_ = ubiquitous_tokens;
return *this;
}
const set<char> &Grammar::separators() const { return separators_; }
Grammar &Grammar::separators(const set<char> &separators) {
separators_ = separators;
return *this;
}
const vector<pair<string, rule_ptr> > &Grammar::rules() const { return rules_; }
} // namespace tree_sitter

View file

@ -35,10 +35,11 @@ class ExpandRepeats : public rules::IdentityRuleFn {
string helper_rule_name = rule_name + string("_repeat") + to_string(index);
rule_ptr repeat_symbol =
make_shared<Symbol>(offset + index, rules::SymbolOptionAuxiliary);
aux_rules.push_back({
helper_rule_name,
Seq::Build({ inner_rule, Choice::Build({ repeat_symbol, make_shared<Blank>() }) })
});
aux_rules.push_back(
{ helper_rule_name,
Seq::Build(
{ inner_rule,
Choice::Build({ repeat_symbol, make_shared<Blank>() }) }) });
return Choice::Build({ repeat_symbol, make_shared<Blank>() });
}

View file

@ -46,7 +46,8 @@ class ExpandTokens : public rules::IdentityRuleFn {
pair<LexicalGrammar, const GrammarError *> expand_tokens(
const LexicalGrammar &grammar) {
vector<pair<string, rule_ptr> > rules, aux_rules;
vector<pair<string, rule_ptr>> rules, aux_rules;
vector<rule_ptr> separators;
ExpandTokens expander;
for (auto &pair : grammar.rules) {
@ -63,7 +64,14 @@ pair<LexicalGrammar, const GrammarError *> expand_tokens(
aux_rules.push_back({ pair.first, rule });
}
return { LexicalGrammar(rules, aux_rules, grammar.separators), nullptr, };
for (auto &sep : grammar.separators) {
auto rule = expander.apply(sep);
if (expander.error)
return { LexicalGrammar(), expander.error };
separators.push_back(rule);
}
return { LexicalGrammar(rules, aux_rules, separators), nullptr, };
}
} // namespace prepare_grammar

View file

@ -10,7 +10,6 @@
#include "compiler/rules/string.h"
#include "compiler/rules/metadata.h"
#include "compiler/rules/pattern.h"
#include "compiler/prepare_grammar/interned_grammar.h"
#include "compiler/prepare_grammar/token_description.h"
#include "compiler/prepare_grammar/is_token.h"
@ -18,15 +17,27 @@ namespace tree_sitter {
namespace prepare_grammar {
using std::pair;
using std::tuple;
using std::string;
using std::map;
using std::to_string;
using std::vector;
using std::set;
using std::make_shared;
using std::dynamic_pointer_cast;
using rules::rule_ptr;
using rules::Symbol;
using rules::SymbolOptionToken;
using rules::SymbolOptionAuxToken;
// Rule visitor that records every Symbol encountered while walking a rule
// tree, rebuilding the tree unchanged as it goes.
// NOTE(review): used_symbols_ has no accessor visible in this chunk —
// confirm how (or whether) callers read the collected set.
class UsedSymbols : public rules::IdentityRuleFn {
set<Symbol> used_symbols_;

// Remember the visited symbol, then return an identical copy so the
// surrounding rule structure is preserved.
rules::rule_ptr apply(rules::Symbol *sym) {
used_symbols_.insert(*sym);
return sym->copy();
}
};
class SymbolInliner : public rules::IdentityRuleFn {
map<Symbol, Symbol> replacements;
@ -59,8 +70,6 @@ class SymbolInliner : public rules::IdentityRuleFn {
};
class TokenExtractor : public rules::IdentityRuleFn {
const rules::SymbolOption SymbolOptionAuxToken = rules::SymbolOption(
rules::SymbolOptionToken | rules::SymbolOptionAuxiliary);
rule_ptr apply_to_token(const rules::Rule *input) {
auto rule = input->copy();
@ -91,23 +100,28 @@ class TokenExtractor : public rules::IdentityRuleFn {
}
public:
vector<pair<string, rule_ptr> > tokens;
vector<pair<string, rule_ptr>> tokens;
};
pair<SyntaxGrammar, LexicalGrammar> extract_tokens(
const InternedGrammar &input_grammar) {
vector<pair<string, rule_ptr> > rules, tokens, aux_rules, aux_tokens;
set<Symbol> ubiquitous_tokens;
// Convenience helper: heap-allocates a GrammarError flagging an invalid
// ubiquitous token, with `message` as its description. The caller takes
// ownership of the returned pointer.
static const GrammarError *ubiq_token_err(const string &message) {
  const GrammarError *error =
      new GrammarError(GrammarErrorTypeInvalidUbiquitousToken, message);
  return error;
}
TokenExtractor extractor;
tuple<SyntaxGrammar, LexicalGrammar, const GrammarError *> extract_tokens(
const Grammar &grammar) {
vector<pair<string, rule_ptr>> rules, tokens, aux_rules, aux_tokens;
vector<rule_ptr> separators;
set<Symbol> ubiquitous_tokens;
map<Symbol, Symbol> symbol_replacements;
for (size_t i = 0; i < input_grammar.rules.size(); i++) {
auto pair = input_grammar.rules[i];
TokenExtractor extractor;
for (size_t i = 0; i < grammar.rules().size(); i++) {
auto pair = grammar.rules()[i];
if (is_token(pair.second)) {
tokens.push_back(pair);
symbol_replacements.insert(
{ Symbol(i), Symbol(tokens.size() - 1, rules::SymbolOptionToken) });
{ Symbol(i), Symbol(tokens.size() - 1, SymbolOptionToken) });
} else {
rules.push_back({ pair.first, extractor.apply(pair.second) });
}
@ -119,11 +133,27 @@ pair<SyntaxGrammar, LexicalGrammar> extract_tokens(
SymbolInliner inliner(symbol_replacements);
for (auto &pair : rules)
pair.second = inliner.apply(pair.second);
for (auto &symbol : input_grammar.ubiquitous_tokens)
ubiquitous_tokens.insert(inliner.replace_symbol(symbol));
for (auto rule : grammar.ubiquitous_tokens()) {
if (is_token(rule)) {
separators.push_back(rule);
} else {
auto sym = dynamic_pointer_cast<const Symbol>(extractor.apply(rule));
if (!sym.get())
return { SyntaxGrammar(), LexicalGrammar(),
ubiq_token_err("Not a token: " + rule->to_string()) };
Symbol symbol = inliner.replace_symbol(*sym);
if (!symbol.is_token())
return { SyntaxGrammar(), LexicalGrammar(),
ubiq_token_err("Not a token: " + symbol.to_string()) };
ubiquitous_tokens.insert(symbol);
}
}
return { SyntaxGrammar(rules, aux_rules, ubiquitous_tokens),
LexicalGrammar(tokens, aux_tokens, input_grammar.separators), };
LexicalGrammar(tokens, aux_tokens, separators), nullptr };
}
} // namespace prepare_grammar

View file

@ -2,17 +2,18 @@
#define COMPILER_PREPARE_GRAMMAR_EXTRACT_TOKENS_H_
#include <utility>
#include "compiler/prepare_grammar/interned_grammar.h"
#include "tree_sitter/compiler.h"
namespace tree_sitter {
class Grammar;
class SyntaxGrammar;
class LexicalGrammar;
namespace prepare_grammar {
std::pair<SyntaxGrammar, LexicalGrammar> extract_tokens(
const InternedGrammar &);
std::tuple<SyntaxGrammar, LexicalGrammar, const GrammarError *> extract_tokens(
const Grammar &);
} // namespace prepare_grammar
} // namespace tree_sitter

View file

@ -3,7 +3,6 @@
#include <vector>
#include <set>
#include "tree_sitter/compiler.h"
#include "compiler/prepare_grammar/interned_grammar.h"
#include "compiler/prepared_grammar.h"
#include "compiler/rules/visitor.h"
#include "compiler/rules/named_symbol.h"
@ -42,15 +41,13 @@ class InternSymbols : public rules::IdentityRuleFn {
string missing_rule_name;
};
pair<InternedGrammar, const GrammarError *> missing_rule_error(
string rule_name) {
InternedGrammar grammar;
return { grammar, new GrammarError(GrammarErrorTypeUndefinedSymbol,
"Undefined rule '" + rule_name + "'") };
pair<Grammar, const GrammarError *> missing_rule_error(string rule_name) {
return { Grammar({}),
new GrammarError(GrammarErrorTypeUndefinedSymbol,
"Undefined rule '" + rule_name + "'") };
}
pair<InternedGrammar, const GrammarError *> intern_symbols(
const Grammar &grammar) {
pair<Grammar, const GrammarError *> intern_symbols(const Grammar &grammar) {
InternSymbols interner(grammar);
vector<pair<string, rule_ptr> > rules;
@ -61,20 +58,15 @@ pair<InternedGrammar, const GrammarError *> intern_symbols(
rules.push_back({ pair.first, new_rule });
}
set<rules::Symbol> ubiquitous_tokens;
for (auto &name : grammar.ubiquitous_tokens()) {
auto token = interner.symbol_for_rule_name(name);
if (!token.get())
return missing_rule_error(name);
ubiquitous_tokens.insert(*token);
set<rules::rule_ptr> ubiquitous_tokens;
for (auto &rule : grammar.ubiquitous_tokens()) {
auto new_rule = interner.apply(rule);
if (!interner.missing_rule_name.empty())
return missing_rule_error(interner.missing_rule_name);
ubiquitous_tokens.insert(new_rule);
}
InternedGrammar result;
result.rules = rules;
result.ubiquitous_tokens = ubiquitous_tokens;
result.separators = grammar.separators();
return { result, nullptr };
return { Grammar(rules).ubiquitous_tokens(ubiquitous_tokens), nullptr };
}
} // namespace prepare_grammar

View file

@ -4,7 +4,6 @@
#include <utility>
#include <string>
#include "tree_sitter/compiler.h"
#include "compiler/prepare_grammar/interned_grammar.h"
namespace tree_sitter {
@ -12,8 +11,7 @@ class Grammar;
namespace prepare_grammar {
std::pair<InternedGrammar, const GrammarError *> intern_symbols(
const Grammar &);
std::pair<Grammar, const GrammarError *> intern_symbols(const Grammar &);
} // namespace prepare_grammar
} // namespace tree_sitter

View file

@ -1,24 +0,0 @@
#ifndef COMPILER_PREPARE_GRAMMAR_INTERNED_GRAMMAR_H_
#define COMPILER_PREPARE_GRAMMAR_INTERNED_GRAMMAR_H_
#include <utility>
#include <vector>
#include <set>
#include <string>
#include "tree_sitter/compiler.h"
#include "compiler/rules/symbol.h"
namespace tree_sitter {
namespace prepare_grammar {
// Intermediate grammar representation produced by intern_symbols(): named
// symbol references have been replaced with numeric Symbols. Plain data
// holder — no invariants enforced.
class InternedGrammar {
public:
// Named rules in definition order; interned Symbols refer to rules by index.
std::vector<std::pair<std::string, rules::rule_ptr> > rules;
// Tokens allowed to appear between any two other tokens (e.g. comments).
std::set<rules::Symbol> ubiquitous_tokens;
// Individual characters to skip between tokens (whitespace-style).
std::set<char> separators;
};
} // namespace prepare_grammar
} // namespace tree_sitter
#endif // COMPILER_PREPARE_GRAMMAR_INTERNED_GRAMMAR_H_

View file

@ -16,9 +16,7 @@ class IsToken : public rules::RuleFn<bool> {
}
};
bool is_token(const rules::rule_ptr &rule) {
return IsToken().apply(rule);
}
bool is_token(const rules::rule_ptr &rule) { return IsToken().apply(rule); }
} // namespace prepare_grammar
} // namespace tree_sitter

View file

@ -12,4 +12,3 @@ bool is_token(const rules::rule_ptr &);
} // namespace tree_sitter
#endif // COMPILER_PREPARE_GRAMMAR_IS_TOKEN_H_

View file

@ -182,11 +182,8 @@ class PatternParser {
case 'd':
return CharacterSet().include('0', '9');
case 's':
return CharacterSet()
.include(' ')
.include('\t')
.include('\n')
.include('\r');
return CharacterSet().include(' ').include('\t').include('\n').include(
'\r');
case 't':
return CharacterSet().include('\t');
case 'n':

View file

@ -2,7 +2,6 @@
#include "compiler/prepare_grammar/expand_tokens.h"
#include "compiler/prepare_grammar/extract_tokens.h"
#include "compiler/prepare_grammar/intern_symbols.h"
#include "compiler/prepare_grammar/interned_grammar.h"
#include "compiler/prepare_grammar/prepare_grammar.h"
#include "compiler/prepared_grammar.h"
@ -10,23 +9,26 @@ namespace tree_sitter {
namespace prepare_grammar {
using std::tuple;
using std::get;
using std::make_tuple;
tuple<SyntaxGrammar, LexicalGrammar, const GrammarError *> prepare_grammar(
const Grammar &input_grammar) {
auto result = intern_symbols(input_grammar);
const InternedGrammar &grammar = result.first;
const Grammar &grammar = result.first;
const GrammarError *error = result.second;
if (error)
return make_tuple(SyntaxGrammar(), LexicalGrammar(), error);
auto grammars = extract_tokens(grammar);
const SyntaxGrammar &rule_grammar = expand_repeats(grammars.first);
auto expand_tokens_result = expand_tokens(grammars.second);
const SyntaxGrammar &rule_grammar = expand_repeats(get<0>(grammars));
error = get<2>(grammars);
if (error)
return make_tuple(SyntaxGrammar(), LexicalGrammar(), error);
auto expand_tokens_result = expand_tokens(get<1>(grammars));
const LexicalGrammar &lex_grammar = expand_tokens_result.first;
error = expand_tokens_result.second;
if (error)
return make_tuple(SyntaxGrammar(), LexicalGrammar(), error);

View file

@ -50,7 +50,7 @@ SyntaxGrammar::SyntaxGrammar(
LexicalGrammar::LexicalGrammar(
const vector<pair<string, rules::rule_ptr> > &rules,
const vector<pair<string, rules::rule_ptr> > &aux_rules,
const set<char> &separators)
const vector<rules::rule_ptr> &separators)
: PreparedGrammar(rules, aux_rules), separators(separators) {}
} // namespace tree_sitter

View file

@ -47,9 +47,9 @@ class LexicalGrammar : public PreparedGrammar {
LexicalGrammar(
const std::vector<std::pair<std::string, rules::rule_ptr> > &rules,
const std::vector<std::pair<std::string, rules::rule_ptr> > &aux_rules,
const std::set<char> &separators);
const std::vector<rules::rule_ptr> &separators);
std::set<char> separators;
std::vector<rules::rule_ptr> separators;
};
} // namespace tree_sitter

View file

@ -10,6 +10,9 @@ using std::string;
using std::to_string;
using std::hash;
SymbolOption SymbolOptionAuxToken =
SymbolOption(SymbolOptionToken | SymbolOptionAuxiliary);
Symbol::Symbol(int index) : index(index), options(SymbolOption(0)) {}
Symbol::Symbol(int index, SymbolOption options)

View file

@ -12,6 +12,8 @@ typedef enum {
SymbolOptionAuxiliary = 1 << 1,
} SymbolOption;
extern SymbolOption SymbolOptionAuxToken;
class Symbol : public Rule {
public:
explicit Symbol(int index);