Unify ubiquitous tokens and lexical separators in API
This commit is contained in:
parent
a46f9d950c
commit
1ff7cedf40
29 changed files with 341 additions and 267 deletions
|
|
@ -3,10 +3,12 @@
|
|||
#include <utility>
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
#include "compiler/prepared_grammar.h"
|
||||
#include "compiler/rules/built_in_symbols.h"
|
||||
#include "compiler/rules/metadata.h"
|
||||
#include "compiler/rules/choice.h"
|
||||
#include "compiler/rules/repeat.h"
|
||||
#include "compiler/rules/blank.h"
|
||||
#include "compiler/rules/seq.h"
|
||||
|
|
@ -22,6 +24,8 @@ using std::map;
|
|||
using std::unordered_map;
|
||||
using std::set;
|
||||
using std::make_shared;
|
||||
using std::vector;
|
||||
using std::dynamic_pointer_cast;
|
||||
using rules::Symbol;
|
||||
using rules::CharacterSet;
|
||||
|
||||
|
|
@ -101,17 +105,24 @@ class LexTableBuilder {
|
|||
lex_table.state(state_id).is_token_start = true;
|
||||
}
|
||||
|
||||
CharacterSet separator_set() const {
|
||||
CharacterSet result;
|
||||
for (char c : lex_grammar.separators)
|
||||
result.include(c);
|
||||
return result;
|
||||
// TODO - remove this hack. right now, nested repeats cause
|
||||
// item sets which are equivalent to appear unequal.
|
||||
rules::rule_ptr separators() const {
|
||||
std::vector<rules::rule_ptr> separators;
|
||||
for (auto &rule : lex_grammar.separators) {
|
||||
auto repeat = dynamic_pointer_cast<const rules::Repeat>(rule);
|
||||
if (repeat.get())
|
||||
separators.push_back(repeat->content);
|
||||
else
|
||||
separators.push_back(rule);
|
||||
}
|
||||
return rules::repeat(rules::choice(separators));
|
||||
}
|
||||
|
||||
rules::rule_ptr after_separators(rules::rule_ptr rule) {
|
||||
return rules::Seq::Build(
|
||||
{ make_shared<rules::Metadata>(
|
||||
make_shared<rules::Repeat>(separator_set().copy()),
|
||||
separators(),
|
||||
map<rules::MetadataKey, int>(
|
||||
{ { rules::START_TOKEN, 1 }, { rules::PRECEDENCE, -1 }, })),
|
||||
rule, });
|
||||
|
|
|
|||
|
|
@ -137,7 +137,9 @@ class CCodeGenerator {
|
|||
line("#pragma GCC diagnostic push");
|
||||
line("#pragma GCC diagnostic ignored \"-Wmissing-field-initializers\"");
|
||||
line();
|
||||
line("static const TSParseAction ts_parse_actions[STATE_COUNT][SYMBOL_COUNT] = {");
|
||||
line(
|
||||
"static const TSParseAction "
|
||||
"ts_parse_actions[STATE_COUNT][SYMBOL_COUNT] = {");
|
||||
|
||||
indent([&]() {
|
||||
for (auto &state : parse_table.states) {
|
||||
|
|
|
|||
|
|
@ -12,9 +12,7 @@ using rules::rule_ptr;
|
|||
|
||||
Grammar::Grammar(
|
||||
const std::vector<std::pair<std::string, rules::rule_ptr> > &rules)
|
||||
: rules_(rules),
|
||||
ubiquitous_tokens_({}),
|
||||
separators_({ ' ', '\r', '\t', '\n' }) {}
|
||||
: rules_(rules), ubiquitous_tokens_({}) {}
|
||||
|
||||
bool Grammar::operator==(const Grammar &other) const {
|
||||
if (other.rules_.size() != rules_.size())
|
||||
|
|
@ -63,22 +61,15 @@ ostream &operator<<(ostream &stream, const GrammarError *error) {
|
|||
return stream << string("#<null>");
|
||||
}
|
||||
|
||||
const set<string> &Grammar::ubiquitous_tokens() const {
|
||||
const set<rule_ptr> &Grammar::ubiquitous_tokens() const {
|
||||
return ubiquitous_tokens_;
|
||||
}
|
||||
|
||||
Grammar &Grammar::ubiquitous_tokens(const set<string> &ubiquitous_tokens) {
|
||||
Grammar &Grammar::ubiquitous_tokens(const set<rule_ptr> &ubiquitous_tokens) {
|
||||
ubiquitous_tokens_ = ubiquitous_tokens;
|
||||
return *this;
|
||||
}
|
||||
|
||||
const set<char> &Grammar::separators() const { return separators_; }
|
||||
|
||||
Grammar &Grammar::separators(const set<char> &separators) {
|
||||
separators_ = separators;
|
||||
return *this;
|
||||
}
|
||||
|
||||
const vector<pair<string, rule_ptr> > &Grammar::rules() const { return rules_; }
|
||||
|
||||
} // namespace tree_sitter
|
||||
|
|
|
|||
|
|
@ -35,10 +35,11 @@ class ExpandRepeats : public rules::IdentityRuleFn {
|
|||
string helper_rule_name = rule_name + string("_repeat") + to_string(index);
|
||||
rule_ptr repeat_symbol =
|
||||
make_shared<Symbol>(offset + index, rules::SymbolOptionAuxiliary);
|
||||
aux_rules.push_back({
|
||||
helper_rule_name,
|
||||
Seq::Build({ inner_rule, Choice::Build({ repeat_symbol, make_shared<Blank>() }) })
|
||||
});
|
||||
aux_rules.push_back(
|
||||
{ helper_rule_name,
|
||||
Seq::Build(
|
||||
{ inner_rule,
|
||||
Choice::Build({ repeat_symbol, make_shared<Blank>() }) }) });
|
||||
return Choice::Build({ repeat_symbol, make_shared<Blank>() });
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -46,7 +46,8 @@ class ExpandTokens : public rules::IdentityRuleFn {
|
|||
|
||||
pair<LexicalGrammar, const GrammarError *> expand_tokens(
|
||||
const LexicalGrammar &grammar) {
|
||||
vector<pair<string, rule_ptr> > rules, aux_rules;
|
||||
vector<pair<string, rule_ptr>> rules, aux_rules;
|
||||
vector<rule_ptr> separators;
|
||||
ExpandTokens expander;
|
||||
|
||||
for (auto &pair : grammar.rules) {
|
||||
|
|
@ -63,7 +64,14 @@ pair<LexicalGrammar, const GrammarError *> expand_tokens(
|
|||
aux_rules.push_back({ pair.first, rule });
|
||||
}
|
||||
|
||||
return { LexicalGrammar(rules, aux_rules, grammar.separators), nullptr, };
|
||||
for (auto &sep : grammar.separators) {
|
||||
auto rule = expander.apply(sep);
|
||||
if (expander.error)
|
||||
return { LexicalGrammar(), expander.error };
|
||||
separators.push_back(rule);
|
||||
}
|
||||
|
||||
return { LexicalGrammar(rules, aux_rules, separators), nullptr, };
|
||||
}
|
||||
|
||||
} // namespace prepare_grammar
|
||||
|
|
|
|||
|
|
@ -10,7 +10,6 @@
|
|||
#include "compiler/rules/string.h"
|
||||
#include "compiler/rules/metadata.h"
|
||||
#include "compiler/rules/pattern.h"
|
||||
#include "compiler/prepare_grammar/interned_grammar.h"
|
||||
#include "compiler/prepare_grammar/token_description.h"
|
||||
#include "compiler/prepare_grammar/is_token.h"
|
||||
|
||||
|
|
@ -18,15 +17,27 @@ namespace tree_sitter {
|
|||
namespace prepare_grammar {
|
||||
|
||||
using std::pair;
|
||||
using std::tuple;
|
||||
using std::string;
|
||||
using std::map;
|
||||
using std::to_string;
|
||||
using std::vector;
|
||||
using std::set;
|
||||
using std::make_shared;
|
||||
using std::dynamic_pointer_cast;
|
||||
using rules::rule_ptr;
|
||||
using rules::Symbol;
|
||||
using rules::SymbolOptionToken;
|
||||
using rules::SymbolOptionAuxToken;
|
||||
|
||||
class UsedSymbols : public rules::IdentityRuleFn {
|
||||
set<Symbol> used_symbols_;
|
||||
|
||||
rules::rule_ptr apply(rules::Symbol *sym) {
|
||||
used_symbols_.insert(*sym);
|
||||
return sym->copy();
|
||||
}
|
||||
};
|
||||
|
||||
class SymbolInliner : public rules::IdentityRuleFn {
|
||||
map<Symbol, Symbol> replacements;
|
||||
|
|
@ -59,8 +70,6 @@ class SymbolInliner : public rules::IdentityRuleFn {
|
|||
};
|
||||
|
||||
class TokenExtractor : public rules::IdentityRuleFn {
|
||||
const rules::SymbolOption SymbolOptionAuxToken = rules::SymbolOption(
|
||||
rules::SymbolOptionToken | rules::SymbolOptionAuxiliary);
|
||||
|
||||
rule_ptr apply_to_token(const rules::Rule *input) {
|
||||
auto rule = input->copy();
|
||||
|
|
@ -91,23 +100,28 @@ class TokenExtractor : public rules::IdentityRuleFn {
|
|||
}
|
||||
|
||||
public:
|
||||
vector<pair<string, rule_ptr> > tokens;
|
||||
vector<pair<string, rule_ptr>> tokens;
|
||||
};
|
||||
|
||||
pair<SyntaxGrammar, LexicalGrammar> extract_tokens(
|
||||
const InternedGrammar &input_grammar) {
|
||||
vector<pair<string, rule_ptr> > rules, tokens, aux_rules, aux_tokens;
|
||||
set<Symbol> ubiquitous_tokens;
|
||||
static const GrammarError *ubiq_token_err(const string &msg) {
|
||||
return new GrammarError(GrammarErrorTypeInvalidUbiquitousToken, msg);
|
||||
}
|
||||
|
||||
TokenExtractor extractor;
|
||||
tuple<SyntaxGrammar, LexicalGrammar, const GrammarError *> extract_tokens(
|
||||
const Grammar &grammar) {
|
||||
vector<pair<string, rule_ptr>> rules, tokens, aux_rules, aux_tokens;
|
||||
vector<rule_ptr> separators;
|
||||
set<Symbol> ubiquitous_tokens;
|
||||
map<Symbol, Symbol> symbol_replacements;
|
||||
|
||||
for (size_t i = 0; i < input_grammar.rules.size(); i++) {
|
||||
auto pair = input_grammar.rules[i];
|
||||
TokenExtractor extractor;
|
||||
|
||||
for (size_t i = 0; i < grammar.rules().size(); i++) {
|
||||
auto pair = grammar.rules()[i];
|
||||
if (is_token(pair.second)) {
|
||||
tokens.push_back(pair);
|
||||
symbol_replacements.insert(
|
||||
{ Symbol(i), Symbol(tokens.size() - 1, rules::SymbolOptionToken) });
|
||||
{ Symbol(i), Symbol(tokens.size() - 1, SymbolOptionToken) });
|
||||
} else {
|
||||
rules.push_back({ pair.first, extractor.apply(pair.second) });
|
||||
}
|
||||
|
|
@ -119,11 +133,27 @@ pair<SyntaxGrammar, LexicalGrammar> extract_tokens(
|
|||
SymbolInliner inliner(symbol_replacements);
|
||||
for (auto &pair : rules)
|
||||
pair.second = inliner.apply(pair.second);
|
||||
for (auto &symbol : input_grammar.ubiquitous_tokens)
|
||||
ubiquitous_tokens.insert(inliner.replace_symbol(symbol));
|
||||
|
||||
for (auto rule : grammar.ubiquitous_tokens()) {
|
||||
if (is_token(rule)) {
|
||||
separators.push_back(rule);
|
||||
} else {
|
||||
auto sym = dynamic_pointer_cast<const Symbol>(extractor.apply(rule));
|
||||
if (!sym.get())
|
||||
return { SyntaxGrammar(), LexicalGrammar(),
|
||||
ubiq_token_err("Not a token: " + rule->to_string()) };
|
||||
|
||||
Symbol symbol = inliner.replace_symbol(*sym);
|
||||
if (!symbol.is_token())
|
||||
return { SyntaxGrammar(), LexicalGrammar(),
|
||||
ubiq_token_err("Not a token: " + symbol.to_string()) };
|
||||
|
||||
ubiquitous_tokens.insert(symbol);
|
||||
}
|
||||
}
|
||||
|
||||
return { SyntaxGrammar(rules, aux_rules, ubiquitous_tokens),
|
||||
LexicalGrammar(tokens, aux_tokens, input_grammar.separators), };
|
||||
LexicalGrammar(tokens, aux_tokens, separators), nullptr };
|
||||
}
|
||||
|
||||
} // namespace prepare_grammar
|
||||
|
|
|
|||
|
|
@ -2,17 +2,18 @@
|
|||
#define COMPILER_PREPARE_GRAMMAR_EXTRACT_TOKENS_H_
|
||||
|
||||
#include <utility>
|
||||
#include "compiler/prepare_grammar/interned_grammar.h"
|
||||
#include "tree_sitter/compiler.h"
|
||||
|
||||
namespace tree_sitter {
|
||||
|
||||
class Grammar;
|
||||
class SyntaxGrammar;
|
||||
class LexicalGrammar;
|
||||
|
||||
namespace prepare_grammar {
|
||||
|
||||
std::pair<SyntaxGrammar, LexicalGrammar> extract_tokens(
|
||||
const InternedGrammar &);
|
||||
std::tuple<SyntaxGrammar, LexicalGrammar, const GrammarError *> extract_tokens(
|
||||
const Grammar &);
|
||||
|
||||
} // namespace prepare_grammar
|
||||
} // namespace tree_sitter
|
||||
|
|
|
|||
|
|
@ -3,7 +3,6 @@
|
|||
#include <vector>
|
||||
#include <set>
|
||||
#include "tree_sitter/compiler.h"
|
||||
#include "compiler/prepare_grammar/interned_grammar.h"
|
||||
#include "compiler/prepared_grammar.h"
|
||||
#include "compiler/rules/visitor.h"
|
||||
#include "compiler/rules/named_symbol.h"
|
||||
|
|
@ -42,15 +41,13 @@ class InternSymbols : public rules::IdentityRuleFn {
|
|||
string missing_rule_name;
|
||||
};
|
||||
|
||||
pair<InternedGrammar, const GrammarError *> missing_rule_error(
|
||||
string rule_name) {
|
||||
InternedGrammar grammar;
|
||||
return { grammar, new GrammarError(GrammarErrorTypeUndefinedSymbol,
|
||||
"Undefined rule '" + rule_name + "'") };
|
||||
pair<Grammar, const GrammarError *> missing_rule_error(string rule_name) {
|
||||
return { Grammar({}),
|
||||
new GrammarError(GrammarErrorTypeUndefinedSymbol,
|
||||
"Undefined rule '" + rule_name + "'") };
|
||||
}
|
||||
|
||||
pair<InternedGrammar, const GrammarError *> intern_symbols(
|
||||
const Grammar &grammar) {
|
||||
pair<Grammar, const GrammarError *> intern_symbols(const Grammar &grammar) {
|
||||
InternSymbols interner(grammar);
|
||||
vector<pair<string, rule_ptr> > rules;
|
||||
|
||||
|
|
@ -61,20 +58,15 @@ pair<InternedGrammar, const GrammarError *> intern_symbols(
|
|||
rules.push_back({ pair.first, new_rule });
|
||||
}
|
||||
|
||||
set<rules::Symbol> ubiquitous_tokens;
|
||||
for (auto &name : grammar.ubiquitous_tokens()) {
|
||||
auto token = interner.symbol_for_rule_name(name);
|
||||
if (!token.get())
|
||||
return missing_rule_error(name);
|
||||
ubiquitous_tokens.insert(*token);
|
||||
set<rules::rule_ptr> ubiquitous_tokens;
|
||||
for (auto &rule : grammar.ubiquitous_tokens()) {
|
||||
auto new_rule = interner.apply(rule);
|
||||
if (!interner.missing_rule_name.empty())
|
||||
return missing_rule_error(interner.missing_rule_name);
|
||||
ubiquitous_tokens.insert(new_rule);
|
||||
}
|
||||
|
||||
InternedGrammar result;
|
||||
result.rules = rules;
|
||||
result.ubiquitous_tokens = ubiquitous_tokens;
|
||||
result.separators = grammar.separators();
|
||||
|
||||
return { result, nullptr };
|
||||
return { Grammar(rules).ubiquitous_tokens(ubiquitous_tokens), nullptr };
|
||||
}
|
||||
|
||||
} // namespace prepare_grammar
|
||||
|
|
|
|||
|
|
@ -4,7 +4,6 @@
|
|||
#include <utility>
|
||||
#include <string>
|
||||
#include "tree_sitter/compiler.h"
|
||||
#include "compiler/prepare_grammar/interned_grammar.h"
|
||||
|
||||
namespace tree_sitter {
|
||||
|
||||
|
|
@ -12,8 +11,7 @@ class Grammar;
|
|||
|
||||
namespace prepare_grammar {
|
||||
|
||||
std::pair<InternedGrammar, const GrammarError *> intern_symbols(
|
||||
const Grammar &);
|
||||
std::pair<Grammar, const GrammarError *> intern_symbols(const Grammar &);
|
||||
|
||||
} // namespace prepare_grammar
|
||||
} // namespace tree_sitter
|
||||
|
|
|
|||
|
|
@ -1,24 +0,0 @@
|
|||
#ifndef COMPILER_PREPARE_GRAMMAR_INTERNED_GRAMMAR_H_
|
||||
#define COMPILER_PREPARE_GRAMMAR_INTERNED_GRAMMAR_H_
|
||||
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include "tree_sitter/compiler.h"
|
||||
#include "compiler/rules/symbol.h"
|
||||
|
||||
namespace tree_sitter {
|
||||
namespace prepare_grammar {
|
||||
|
||||
class InternedGrammar {
|
||||
public:
|
||||
std::vector<std::pair<std::string, rules::rule_ptr> > rules;
|
||||
std::set<rules::Symbol> ubiquitous_tokens;
|
||||
std::set<char> separators;
|
||||
};
|
||||
|
||||
} // namespace prepare_grammar
|
||||
} // namespace tree_sitter
|
||||
|
||||
#endif // COMPILER_PREPARE_GRAMMAR_INTERNED_GRAMMAR_H_
|
||||
|
|
@ -16,9 +16,7 @@ class IsToken : public rules::RuleFn<bool> {
|
|||
}
|
||||
};
|
||||
|
||||
bool is_token(const rules::rule_ptr &rule) {
|
||||
return IsToken().apply(rule);
|
||||
}
|
||||
bool is_token(const rules::rule_ptr &rule) { return IsToken().apply(rule); }
|
||||
|
||||
} // namespace prepare_grammar
|
||||
} // namespace tree_sitter
|
||||
|
|
|
|||
|
|
@ -12,4 +12,3 @@ bool is_token(const rules::rule_ptr &);
|
|||
} // namespace tree_sitter
|
||||
|
||||
#endif // COMPILER_PREPARE_GRAMMAR_IS_TOKEN_H_
|
||||
|
||||
|
|
|
|||
|
|
@ -182,11 +182,8 @@ class PatternParser {
|
|||
case 'd':
|
||||
return CharacterSet().include('0', '9');
|
||||
case 's':
|
||||
return CharacterSet()
|
||||
.include(' ')
|
||||
.include('\t')
|
||||
.include('\n')
|
||||
.include('\r');
|
||||
return CharacterSet().include(' ').include('\t').include('\n').include(
|
||||
'\r');
|
||||
case 't':
|
||||
return CharacterSet().include('\t');
|
||||
case 'n':
|
||||
|
|
|
|||
|
|
@ -2,7 +2,6 @@
|
|||
#include "compiler/prepare_grammar/expand_tokens.h"
|
||||
#include "compiler/prepare_grammar/extract_tokens.h"
|
||||
#include "compiler/prepare_grammar/intern_symbols.h"
|
||||
#include "compiler/prepare_grammar/interned_grammar.h"
|
||||
#include "compiler/prepare_grammar/prepare_grammar.h"
|
||||
#include "compiler/prepared_grammar.h"
|
||||
|
||||
|
|
@ -10,23 +9,26 @@ namespace tree_sitter {
|
|||
namespace prepare_grammar {
|
||||
|
||||
using std::tuple;
|
||||
using std::get;
|
||||
using std::make_tuple;
|
||||
|
||||
tuple<SyntaxGrammar, LexicalGrammar, const GrammarError *> prepare_grammar(
|
||||
const Grammar &input_grammar) {
|
||||
auto result = intern_symbols(input_grammar);
|
||||
const InternedGrammar &grammar = result.first;
|
||||
const Grammar &grammar = result.first;
|
||||
const GrammarError *error = result.second;
|
||||
|
||||
if (error)
|
||||
return make_tuple(SyntaxGrammar(), LexicalGrammar(), error);
|
||||
|
||||
auto grammars = extract_tokens(grammar);
|
||||
const SyntaxGrammar &rule_grammar = expand_repeats(grammars.first);
|
||||
auto expand_tokens_result = expand_tokens(grammars.second);
|
||||
const SyntaxGrammar &rule_grammar = expand_repeats(get<0>(grammars));
|
||||
error = get<2>(grammars);
|
||||
if (error)
|
||||
return make_tuple(SyntaxGrammar(), LexicalGrammar(), error);
|
||||
|
||||
auto expand_tokens_result = expand_tokens(get<1>(grammars));
|
||||
const LexicalGrammar &lex_grammar = expand_tokens_result.first;
|
||||
error = expand_tokens_result.second;
|
||||
|
||||
if (error)
|
||||
return make_tuple(SyntaxGrammar(), LexicalGrammar(), error);
|
||||
|
||||
|
|
|
|||
|
|
@ -50,7 +50,7 @@ SyntaxGrammar::SyntaxGrammar(
|
|||
LexicalGrammar::LexicalGrammar(
|
||||
const vector<pair<string, rules::rule_ptr> > &rules,
|
||||
const vector<pair<string, rules::rule_ptr> > &aux_rules,
|
||||
const set<char> &separators)
|
||||
const vector<rules::rule_ptr> &separators)
|
||||
: PreparedGrammar(rules, aux_rules), separators(separators) {}
|
||||
|
||||
} // namespace tree_sitter
|
||||
|
|
|
|||
|
|
@ -47,9 +47,9 @@ class LexicalGrammar : public PreparedGrammar {
|
|||
LexicalGrammar(
|
||||
const std::vector<std::pair<std::string, rules::rule_ptr> > &rules,
|
||||
const std::vector<std::pair<std::string, rules::rule_ptr> > &aux_rules,
|
||||
const std::set<char> &separators);
|
||||
const std::vector<rules::rule_ptr> &separators);
|
||||
|
||||
std::set<char> separators;
|
||||
std::vector<rules::rule_ptr> separators;
|
||||
};
|
||||
|
||||
} // namespace tree_sitter
|
||||
|
|
|
|||
|
|
@ -10,6 +10,9 @@ using std::string;
|
|||
using std::to_string;
|
||||
using std::hash;
|
||||
|
||||
SymbolOption SymbolOptionAuxToken =
|
||||
SymbolOption(SymbolOptionToken | SymbolOptionAuxiliary);
|
||||
|
||||
Symbol::Symbol(int index) : index(index), options(SymbolOption(0)) {}
|
||||
|
||||
Symbol::Symbol(int index, SymbolOption options)
|
||||
|
|
|
|||
|
|
@ -12,6 +12,8 @@ typedef enum {
|
|||
SymbolOptionAuxiliary = 1 << 1,
|
||||
} SymbolOption;
|
||||
|
||||
extern SymbolOption SymbolOptionAuxToken;
|
||||
|
||||
class Symbol : public Rule {
|
||||
public:
|
||||
explicit Symbol(int index);
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue