In compiler, distinguish between anonymous tokens and hidden rules

This commit is contained in:
Max Brunsfeld 2015-09-05 17:05:37 -07:00
parent 4b270c8604
commit 5982b77c97
46 changed files with 41131 additions and 40884 deletions

View file

@ -10,7 +10,7 @@
#include "compiler/build_tables/get_metadata.h"
#include "compiler/build_tables/lex_item.h"
#include "compiler/parse_table.h"
#include "compiler/lexical_grammar.h"
#include "compiler/prepared_grammar.h"
#include "compiler/rules/built_in_symbols.h"
#include "compiler/rules/choice.h"
#include "compiler/rules/metadata.h"
@ -63,9 +63,9 @@ class LexTableBuilder {
result.insert(
LexItem(symbol, after_separators(CharacterSet().include(0).copy())));
else if (symbol.is_token())
result.insert(
LexItem(symbol, after_separators(lex_grammar.rule(symbol))));
else if (symbol.is_token)
result.insert(LexItem(
symbol, after_separators(lex_grammar.rules[symbol.index].rule)));
}
return result;
}

View file

@ -11,8 +11,7 @@
#include "compiler/build_tables/parse_item.h"
#include "compiler/build_tables/get_completion_status.h"
#include "compiler/build_tables/get_metadata.h"
#include "compiler/lexical_grammar.h"
#include "compiler/syntax_grammar.h"
#include "compiler/prepared_grammar.h"
#include "compiler/rules/symbol.h"
#include "compiler/rules/built_in_symbols.h"
@ -48,9 +47,8 @@ class ParseTableBuilder {
conflict_manager(grammar) {}
pair<ParseTable, const GrammarError *> build() {
auto start_symbol = grammar.rules.empty()
? make_shared<Symbol>(0, rules::SymbolOptionToken)
: make_shared<Symbol>(0);
auto start_symbol = grammar.rules.empty() ? make_shared<Symbol>(0, true)
: make_shared<Symbol>(0);
ParseItem start_item(rules::START(), start_symbol, {});
add_parse_state(
item_set_closure(start_item, { rules::END_OF_INPUT() }, grammar));
@ -260,10 +258,10 @@ class ParseTableBuilder {
return "END_OF_INPUT";
else
return "";
} else if (symbol.is_token())
return lexical_grammar.rule_name(symbol);
} else if (symbol.is_token)
return lexical_grammar.rules[symbol.index].name;
else
return grammar.rule_name(symbol);
return grammar.rules[symbol.index].name;
}
string action_description(const ParseAction &action) const {

View file

@ -1,7 +1,6 @@
#include "compiler/build_tables/build_lex_table.h"
#include "compiler/build_tables/build_parse_table.h"
#include "compiler/lexical_grammar.h"
#include "compiler/syntax_grammar.h"
#include "compiler/prepared_grammar.h"
namespace tree_sitter {
namespace build_tables {

View file

@ -1,6 +1,6 @@
#include "compiler/build_tables/first_symbols.h"
#include "compiler/build_tables/rule_can_be_blank.h"
#include "compiler/syntax_grammar.h"
#include "compiler/prepared_grammar.h"
#include "compiler/rules/choice.h"
#include "compiler/rules/metadata.h"
#include "compiler/rules/seq.h"
@ -28,8 +28,8 @@ class FirstSymbols : public rules::RuleFn<set<Symbol>> {
return set<Symbol>();
set<Symbol> result({ *rule });
if (!rule->is_token()) {
set<Symbol> &&symbols = apply(grammar->rule(*rule));
if (!rule->is_token) {
set<Symbol> &&symbols = apply(grammar->rules[rule->index].rule);
result.insert(symbols.begin(), symbols.end());
}

View file

@ -7,7 +7,7 @@
#include "compiler/build_tables/rule_transitions.h"
#include "compiler/build_tables/rule_can_be_blank.h"
#include "compiler/build_tables/item.h"
#include "compiler/syntax_grammar.h"
#include "compiler/prepared_grammar.h"
namespace tree_sitter {
namespace build_tables {
@ -41,7 +41,7 @@ const ParseItemSet item_set_closure(const ParseItem &starting_item,
const Symbol &symbol = pair.first;
const rule_ptr &next_rule = pair.second;
if (symbol.is_token() || symbol.is_built_in())
if (symbol.is_token || symbol.is_built_in())
continue;
set<Symbol> next_lookahead_symbols = first_symbols(next_rule, grammar);
@ -49,8 +49,9 @@ const ParseItemSet item_set_closure(const ParseItem &starting_item,
next_lookahead_symbols.insert(lookahead_symbols.begin(),
lookahead_symbols.end());
items_to_process.push_back({ ParseItem(symbol, grammar.rule(symbol), {}),
next_lookahead_symbols });
items_to_process.push_back(
{ ParseItem(symbol, grammar.rules[symbol.index].rule, {}),
next_lookahead_symbols });
}
}

View file

@ -4,7 +4,7 @@
#include "compiler/build_tables/merge_transitions.h"
#include "compiler/build_tables/parse_item.h"
#include "compiler/build_tables/rule_transitions.h"
#include "compiler/syntax_grammar.h"
#include "compiler/prepared_grammar.h"
#include "compiler/rules/symbol.h"
namespace tree_sitter {

View file

@ -2,7 +2,7 @@
#define COMPILER_BUILD_TABLES_LEX_CONFLICT_MANAGER_H_
#include "tree_sitter/compiler.h"
#include "compiler/lexical_grammar.h"
#include "compiler/prepared_grammar.h"
namespace tree_sitter {

View file

@ -3,8 +3,7 @@
#include <utility>
#include "tree_sitter/compiler.h"
#include "compiler/syntax_grammar.h"
#include "compiler/lexical_grammar.h"
#include "compiler/prepared_grammar.h"
#include "compiler/build_tables/parse_item.h"
namespace tree_sitter {

View file

@ -1,7 +1,7 @@
#include "compiler/build_tables/rule_can_be_blank.h"
#include <set>
#include "tree_sitter/compiler.h"
#include "compiler/syntax_grammar.h"
#include "compiler/prepared_grammar.h"
#include "compiler/rules/symbol.h"
#include "compiler/rules/visitor.h"
#include "compiler/rules/seq.h"
@ -55,7 +55,7 @@ class CanBeBlankRecursive : public CanBeBlank {
bool apply_to(const rules::Symbol *rule) {
if (visited_symbols.find(*rule) == visited_symbols.end()) {
visited_symbols.insert(*rule);
return !rule->is_token() && apply(grammar->rule(*rule));
return !rule->is_token && apply(grammar->rules[rule->index].rule);
} else {
return false;
}

View file

@ -2,8 +2,7 @@
#include "compiler/prepare_grammar/prepare_grammar.h"
#include "compiler/build_tables/build_tables.h"
#include "compiler/generate_code/c_code.h"
#include "compiler/syntax_grammar.h"
#include "compiler/lexical_grammar.h"
#include "compiler/prepared_grammar.h"
namespace tree_sitter {

View file

@ -7,8 +7,7 @@
#include "compiler/generate_code/c_code.h"
#include "compiler/lex_table.h"
#include "compiler/parse_table.h"
#include "compiler/lexical_grammar.h"
#include "compiler/syntax_grammar.h"
#include "compiler/prepared_grammar.h"
#include "compiler/rules/built_in_symbols.h"
#include "compiler/util/string_helpers.h"
@ -142,7 +141,7 @@ class CCodeGenerator {
indent([&]() {
for (const auto &symbol : parse_table.symbols)
if (!symbol.is_built_in() &&
(symbol.is_auxiliary() || rule_name(symbol)[0] == '_'))
(is_auxiliary(symbol) || rule_name(symbol)[0] == '_'))
line("[" + symbol_id(symbol) + "] = 1,");
});
line("};");
@ -329,7 +328,7 @@ class CCodeGenerator {
return "";
} else {
string name = sanitize_name(rule_name(symbol));
if (symbol.is_auxiliary())
if (is_auxiliary(symbol))
return "aux_sym_" + name;
else
return "sym_" + name;
@ -349,9 +348,20 @@ class CCodeGenerator {
}
}
// Reports whether `symbol` refers to a rule entry whose type is anything
// other than RuleEntryTypeNamed. Token symbols are looked up in the lexical
// grammar; all other symbols are looked up in the syntax grammar.
bool is_auxiliary(const rules::Symbol &symbol) {
  const auto &entries =
    symbol.is_token ? lexical_grammar.rules : syntax_grammar.rules;
  return entries[symbol.index].type != RuleEntryTypeNamed;
}
string rule_name(const rules::Symbol &symbol) {
return symbol.is_token() ? lexical_grammar.rule_name(symbol)
: syntax_grammar.rule_name(symbol);
if (symbol.is_token) {
return lexical_grammar.rules[symbol.index].name;
} else {
return syntax_grammar.rules[symbol.index].name;
}
}
bool reduce_action_is_fragile(const ParseAction &action) const {
@ -394,15 +404,14 @@ class CCodeGenerator {
if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') ||
('0' <= c && c <= '9') || (c == '_')) {
stripped_name += c;
continue;
}
auto replacement = REPLACEMENTS.find(c);
if (replacement != REPLACEMENTS.end()) {
if (stripped_name[stripped_name.size() - 1] != '_')
stripped_name += "_";
stripped_name += replacement->second;
continue;
} else {
auto replacement = REPLACEMENTS.find(c);
size_t i = stripped_name.size();
if (replacement != REPLACEMENTS.end()) {
if (i > 0 && stripped_name[i - 1] != '_')
stripped_name += "_";
stripped_name += replacement->second;
}
}
}

View file

@ -1,24 +0,0 @@
#include "compiler/lexical_grammar.h"
#include <vector>
#include <string>
#include <utility>
#include "compiler/rules/symbol.h"
namespace tree_sitter {
using std::string;
using std::pair;
using std::vector;
using std::set;
// Looks up the rule body for `symbol`: auxiliary symbols index into
// `aux_rules`, every other symbol indexes into `rules`.
const rule_ptr &LexicalGrammar::rule(const rules::Symbol &symbol) const {
  if (symbol.is_auxiliary())
    return aux_rules[symbol.index].second;
  return rules[symbol.index].second;
}
// Looks up the name for `symbol`: auxiliary symbols index into `aux_rules`,
// every other symbol indexes into `rules`.
const string &LexicalGrammar::rule_name(const rules::Symbol &symbol) const {
  const auto &entries = symbol.is_auxiliary() ? aux_rules : rules;
  return entries[symbol.index].first;
}
} // namespace tree_sitter

View file

@ -1,24 +0,0 @@
#ifndef COMPILER_LEXICAL_GRAMMAR_H_
#define COMPILER_LEXICAL_GRAMMAR_H_
#include <vector>
#include <string>
#include <utility>
#include "tree_sitter/compiler.h"
#include "compiler/rules/symbol.h"
namespace tree_sitter {
// Grammar describing tokens, stored as (name, rule) pairs split across a
// regular list and an auxiliary list.
class LexicalGrammar {
public:
// Returns the name (the pair's `first`) of the entry that `symbol` indexes;
// the list is chosen by `symbol.is_auxiliary()`.
const std::string &rule_name(const rules::Symbol &symbol) const;
// Returns the rule (the pair's `second`) of the entry that `symbol` indexes;
// the list is chosen by `symbol.is_auxiliary()`.
const rule_ptr &rule(const rules::Symbol &symbol) const;
// Entries addressed by non-auxiliary symbols.
std::vector<std::pair<std::string, rule_ptr>> rules;
// Entries addressed by auxiliary symbols.
std::vector<std::pair<std::string, rule_ptr>> aux_rules;
// Separator rules; extract_tokens places the token-like ubiquitous rules here.
std::vector<rule_ptr> separators;
};
} // namespace tree_sitter
#endif // COMPILER_LEXICAL_GRAMMAR_H_

View file

@ -2,7 +2,7 @@
#include <vector>
#include <string>
#include <utility>
#include "compiler/syntax_grammar.h"
#include "compiler/prepared_grammar.h"
#include "compiler/rules/visitor.h"
#include "compiler/rules/seq.h"
#include "compiler/rules/symbol.h"
@ -40,12 +40,14 @@ class ExpandRepeats : public rules::IdentityRuleFn {
size_t index = aux_rules.size();
string helper_rule_name =
rule_name + string("_repeat") + to_string(++repeat_count);
Symbol repeat_symbol(offset + index, rules::SymbolOptionAuxiliary);
Symbol repeat_symbol(offset + index);
existing_repeats.push_back({ rule->copy(), repeat_symbol });
aux_rules.push_back(
{ helper_rule_name,
Seq::build({ inner_rule, Choice::build({ repeat_symbol.copy(),
make_shared<Blank>() }) }) });
aux_rules.push_back({
helper_rule_name,
Seq::build({ inner_rule, Choice::build({ repeat_symbol.copy(),
make_shared<Blank>() }) }),
RuleEntryTypeHidden,
});
return repeat_symbol.copy();
}
@ -62,22 +64,21 @@ class ExpandRepeats : public rules::IdentityRuleFn {
return apply(rule);
}
vector<pair<string, rule_ptr>> aux_rules;
vector<RuleEntry> aux_rules;
};
SyntaxGrammar expand_repeats(const SyntaxGrammar &grammar) {
SyntaxGrammar result;
result.aux_rules = grammar.aux_rules;
result.rules = grammar.rules;
result.ubiquitous_tokens = grammar.ubiquitous_tokens;
result.expected_conflicts = grammar.expected_conflicts;
ExpandRepeats expander(result.aux_rules.size());
for (auto &pair : grammar.rules)
result.rules.push_back(
{ pair.first, expander.expand(pair.second, pair.first) });
ExpandRepeats expander(result.rules.size());
for (auto &rule_entry : result.rules)
rule_entry.rule = expander.expand(rule_entry.rule, rule_entry.name);
result.aux_rules.insert(result.aux_rules.end(), expander.aux_rules.begin(),
expander.aux_rules.end());
result.rules.insert(result.rules.end(), expander.aux_rules.begin(),
expander.aux_rules.end());
return result;
}

View file

@ -2,12 +2,14 @@
#include <vector>
#include <string>
#include <utility>
#include "compiler/lexical_grammar.h"
#include <map>
#include "compiler/prepared_grammar.h"
#include "compiler/rules/visitor.h"
#include "compiler/rules/pattern.h"
#include "compiler/rules/string.h"
#include "compiler/rules/blank.h"
#include "compiler/rules/seq.h"
#include "compiler/rules/metadata.h"
#include "compiler/rules/character_set.h"
#include "compiler/prepare_grammar/parse_regex.h"
#include "utf8proc.h"
@ -17,10 +19,12 @@ namespace prepare_grammar {
using std::string;
using std::vector;
using std::map;
using std::pair;
using std::make_shared;
using rules::String;
using rules::Pattern;
using rules::Metadata;
class ExpandTokens : public rules::IdentityRuleFn {
using rules::IdentityRuleFn::apply_to;
@ -40,7 +44,11 @@ class ExpandTokens : public rules::IdentityRuleFn {
elements.push_back(rules::CharacterSet().include(el).copy());
}
return rules::Seq::build(elements);
return make_shared<rules::Metadata>(
rules::Seq::build(elements),
std::map<rules::MetadataKey, int>({
{ rules::IS_TOKEN, 1 }, { rules::PRECEDENCE, 1 },
}));
}
rule_ptr apply_to(const Pattern *rule) {
@ -60,18 +68,11 @@ pair<LexicalGrammar, const GrammarError *> expand_tokens(
LexicalGrammar result;
ExpandTokens expander;
for (auto &pair : grammar.rules) {
auto rule = expander.apply(pair.second);
for (auto &entry : grammar.rules) {
auto rule = expander.apply(entry.rule);
if (expander.error)
return { result, expander.error };
result.rules.push_back({ pair.first, rule });
}
for (auto &pair : grammar.aux_rules) {
auto rule = expander.apply(pair.second);
if (expander.error)
return { result, expander.error };
result.aux_rules.push_back({ pair.first, rule });
result.rules.push_back({ entry.name, rule, entry.type });
}
for (auto &sep : grammar.separators) {
@ -81,9 +82,7 @@ pair<LexicalGrammar, const GrammarError *> expand_tokens(
result.separators.push_back(rule);
}
return {
result, nullptr,
};
return { result, nullptr };
}
} // namespace prepare_grammar

View file

@ -4,8 +4,7 @@
#include <set>
#include <string>
#include "tree_sitter/compiler.h"
#include "compiler/lexical_grammar.h"
#include "compiler/syntax_grammar.h"
#include "compiler/prepared_grammar.h"
#include "compiler/rules/visitor.h"
#include "compiler/rules/symbol.h"
#include "compiler/rules/string.h"
@ -21,27 +20,15 @@ using std::dynamic_pointer_cast;
using std::make_shared;
using std::make_tuple;
using std::map;
using std::pair;
using std::set;
using std::string;
using std::tuple;
using std::vector;
using rules::Symbol;
using rules::SymbolOptionToken;
using rules::SymbolOptionAuxToken;
class SymbolReplacer : public rules::IdentityRuleFn {
using rules::IdentityRuleFn::apply_to;
int new_index_for_symbol(const Symbol &symbol) {
int result = symbol.index;
for (const auto &pair : replacements)
if (pair.first.index < symbol.index &&
pair.first.is_auxiliary() == symbol.is_auxiliary())
result--;
return result;
}
rule_ptr apply_to(const Symbol *rule) {
return replace_symbol(*rule).copy();
}
@ -49,54 +36,64 @@ class SymbolReplacer : public rules::IdentityRuleFn {
public:
map<Symbol, Symbol> replacements;
Symbol replace_symbol(const Symbol &rule) {
if (rule.is_built_in())
return rule;
auto replacement_pair = replacements.find(rule);
Symbol replace_symbol(const Symbol &symbol) {
if (symbol.is_built_in() || symbol.is_token)
return symbol;
auto replacement_pair = replacements.find(symbol);
if (replacement_pair != replacements.end())
return replacement_pair->second;
else
return Symbol(new_index_for_symbol(rule), rule.options);
int new_index = symbol.index;
for (const auto &pair : replacements)
if (pair.first.index < symbol.index)
new_index--;
return Symbol(new_index);
}
};
class TokenExtractor : public rules::IdentityRuleFn {
rule_ptr apply_to_token(const Rule *input) {
auto rule = input->copy();
using rules::IdentityRuleFn::apply_to;
rule_ptr apply_to_token(const Rule *input, RuleEntryType entry_type) {
for (size_t i = 0; i < tokens.size(); i++)
if (tokens[i].second->operator==(*rule)) {
if (tokens[i].rule->operator==(*input)) {
token_usage_counts[i]++;
return make_shared<Symbol>(i, SymbolOptionAuxToken);
return make_shared<Symbol>(i, true);
}
rule_ptr rule = input->copy();
size_t index = tokens.size();
tokens.push_back({ token_description(rule), rule });
tokens.push_back({
token_description(rule), rule, entry_type,
});
token_usage_counts.push_back(1);
return make_shared<Symbol>(index, SymbolOptionAuxToken);
return make_shared<Symbol>(index, true);
}
rule_ptr default_apply(const Rule *rule) {
auto result = rule->copy();
if (is_token(result))
return apply_to_token(rule);
else
return result;
rule_ptr apply_to(const rules::String *rule) {
return apply_to_token(rule, RuleEntryTypeAnonymous);
}
rule_ptr apply_to(const rules::Pattern *rule) {
return apply_to_token(rule, RuleEntryTypeHidden);
}
rule_ptr apply_to(const rules::Metadata *rule) {
if (is_token(rule->copy()))
return apply_to_token(rule);
if (rule->value_for(rules::IS_TOKEN) > 0)
return apply_to_token(rule->rule.get(), RuleEntryTypeHidden);
else
return rules::IdentityRuleFn::apply_to(rule);
}
public:
vector<size_t> token_usage_counts;
vector<pair<string, rule_ptr>> tokens;
vector<RuleEntry> tokens;
};
static const GrammarError *ubiq_token_err(const string &msg) {
static const GrammarError *ubiq_token_err(const string &message) {
return new GrammarError(GrammarErrorTypeInvalidUbiquitousToken,
"Not a token: " + msg);
"Not a token: " + message);
}
tuple<SyntaxGrammar, LexicalGrammar, const GrammarError *> extract_tokens(
@ -106,51 +103,43 @@ tuple<SyntaxGrammar, LexicalGrammar, const GrammarError *> extract_tokens(
SymbolReplacer symbol_replacer;
TokenExtractor extractor;
vector<pair<string, rule_ptr>> extracted_rules;
for (auto &pair : grammar.rules)
extracted_rules.push_back({ pair.first, extractor.apply(pair.second) });
/*
* First, extract all of the grammar's tokens into the lexical grammar.
*/
vector<RuleEntry> processed_rules;
for (const auto &pair : grammar.rules)
processed_rules.push_back({
pair.first, extractor.apply(pair.second), RuleEntryTypeNamed,
});
lexical_grammar.rules = extractor.tokens;
/*
* If a rule's entire content was extracted as a token and that token didn't
* appear within any other rule, then remove that rule from the syntax
* grammar, giving its name to the token in the lexical grammar. Any symbols
* that pointed to that rule will need to be updated to point to the rule in
* the lexical grammar. Symbols that pointed to later rules will need to have
* their indices decremented.
*/
size_t i = 0;
for (auto &pair : extracted_rules) {
auto &rule = pair.second;
auto symbol = dynamic_pointer_cast<const Symbol>(rule);
if (symbol.get() && symbol->is_auxiliary() &&
for (const RuleEntry &entry : processed_rules) {
auto symbol = dynamic_pointer_cast<const Symbol>(entry.rule);
if (symbol.get() && symbol->is_token && !symbol->is_built_in() &&
extractor.token_usage_counts[symbol->index] == 1) {
lexical_grammar.rules.push_back(
{ pair.first, extractor.tokens[symbol->index].second });
extractor.token_usage_counts[symbol->index] = 0;
symbol_replacer.replacements.insert(
{ Symbol(i),
Symbol(lexical_grammar.rules.size() - 1, SymbolOptionToken) });
lexical_grammar.rules[symbol->index].type = entry.type;
lexical_grammar.rules[symbol->index].name = entry.name;
symbol_replacer.replacements.insert({ Symbol(i), *symbol });
} else {
syntax_grammar.rules.push_back(pair);
syntax_grammar.rules.push_back(entry);
}
i++;
}
for (auto &pair : syntax_grammar.rules)
pair.second = symbol_replacer.apply(pair.second);
lexical_grammar.aux_rules = extractor.tokens;
for (auto &rule : grammar.ubiquitous_tokens) {
if (is_token(rule)) {
lexical_grammar.separators.push_back(rule);
} else {
auto sym = dynamic_pointer_cast<const Symbol>(extractor.apply(rule));
if (!sym.get())
return make_tuple(syntax_grammar, lexical_grammar,
ubiq_token_err(rule->to_string()));
Symbol symbol = symbol_replacer.replace_symbol(*sym);
if (!symbol.is_token())
return make_tuple(
syntax_grammar, lexical_grammar,
ubiq_token_err(syntax_grammar.rules[symbol.index].first));
syntax_grammar.ubiquitous_tokens.insert(symbol);
}
}
/*
* Perform any replacements of symbols needed based on the previous step.
*/
for (RuleEntry &entry : syntax_grammar.rules)
entry.rule = symbol_replacer.apply(entry.rule);
for (auto &symbol_set : grammar.expected_conflicts) {
set<Symbol> new_symbol_set;
@ -159,6 +148,33 @@ tuple<SyntaxGrammar, LexicalGrammar, const GrammarError *> extract_tokens(
syntax_grammar.expected_conflicts.insert(new_symbol_set);
}
/*
* The grammar's ubiquitous tokens can be either token rules or symbols
* pointing to token rules. If they are symbols, then they'll be handled by
* the parser; add them to the syntax grammar's ubiquitous tokens. If they
* are anonymous rules, they can be handled by the lexer; add them to the
* lexical grammar's separator rules.
*/
for (const rule_ptr &rule : grammar.ubiquitous_tokens) {
if (is_token(rule)) {
lexical_grammar.separators.push_back(rule);
continue;
}
auto symbol = dynamic_pointer_cast<const Symbol>(rule);
if (!symbol.get())
return make_tuple(syntax_grammar, lexical_grammar,
ubiq_token_err(rule->to_string()));
Symbol new_symbol = symbol_replacer.replace_symbol(*symbol);
if (!new_symbol.is_token)
return make_tuple(
syntax_grammar, lexical_grammar,
ubiq_token_err(syntax_grammar.rules[new_symbol.index].name));
syntax_grammar.ubiquitous_tokens.insert(new_symbol);
}
return make_tuple(syntax_grammar, lexical_grammar, nullptr);
}

View file

@ -3,8 +3,7 @@
#include "compiler/prepare_grammar/extract_tokens.h"
#include "compiler/prepare_grammar/intern_symbols.h"
#include "compiler/prepare_grammar/prepare_grammar.h"
#include "compiler/lexical_grammar.h"
#include "compiler/syntax_grammar.h"
#include "compiler/prepared_grammar.h"
namespace tree_sitter {
namespace prepare_grammar {

View file

@ -2,8 +2,7 @@
#define COMPILER_PREPARE_GRAMMAR_PREPARE_GRAMMAR_H_
#include <utility>
#include "compiler/lexical_grammar.h"
#include "compiler/syntax_grammar.h"
#include "compiler/prepared_grammar.h"
namespace tree_sitter {

View file

@ -5,6 +5,7 @@
#include "compiler/rules/seq.h"
#include "compiler/rules/choice.h"
#include "compiler/rules/string.h"
#include "compiler/rules/repeat.h"
#include "compiler/rules/metadata.h"
#include "compiler/util/string_helpers.h"
@ -15,11 +16,12 @@ using std::string;
class TokenDescription : public rules::RuleFn<string> {
string apply_to(const rules::Pattern *rule) {
return "PAT_" + util::escape_string(rule->value);
is_trivial = false;
return rule->value;
}
string apply_to(const rules::String *rule) {
return "STR_" + util::escape_string(rule->value);
return rule->value;
}
string apply_to(const rules::Metadata *rule) {
@ -27,19 +29,41 @@ class TokenDescription : public rules::RuleFn<string> {
}
string apply_to(const rules::Seq *rule) {
return "(seq " + apply(rule->left) + " " + apply(rule->right) + ")";
is_trivial = false;
return apply(rule->left) + apply(rule->right);
}
string apply_to(const rules::Repeat *rule) {
is_trivial = false;
return apply(rule->content) + "*";
}
string apply_to(const rules::Choice *rule) {
string result = "(choice";
for (auto &element : rule->elements)
result += " " + apply(element);
is_trivial = false;
string result = "(";
bool started = false;
for (auto &element : rule->elements) {
if (started)
result += "|";
result += apply(element);
started = true;
}
return result + ")";
}
public:
bool is_trivial;
TokenDescription() : is_trivial(true) {}
};
std::string token_description(const rule_ptr &rule) {
return TokenDescription().apply(rule);
string token_description(const rule_ptr &rule) {
TokenDescription description;
string result = description.apply(rule);
if (description.is_trivial)
return result;
else
return "/" + result + "/";
}
} // namespace prepare_grammar

View file

@ -0,0 +1,39 @@
#ifndef COMPILER_PREPARED_GRAMMAR_H_
#define COMPILER_PREPARED_GRAMMAR_H_
#include <vector>
#include <string>
#include <set>
#include "tree_sitter/compiler.h"
#include "compiler/rules/symbol.h"
namespace tree_sitter {
// Classifies a RuleEntry. The C code generator treats every type other than
// RuleEntryTypeNamed as auxiliary.
enum RuleEntryType {
// Assigned by extract_tokens to entries built from the grammar's own rules.
RuleEntryTypeNamed,
// Assigned to tokens extracted from string rules.
RuleEntryTypeAnonymous,
// Assigned to tokens extracted from patterns or token-metadata rules, and to
// the helper rules generated when repeats are expanded.
RuleEntryTypeHidden,
};
// One entry of a prepared grammar: a rule together with its name and
// classification. Call sites brace-initialise it as { name, rule, type }, so
// the field order is part of the interface.
struct RuleEntry {
// Name reported for the rule (used for symbol names and messages).
std::string name;
// The rule body.
rule_ptr rule;
// Whether the entry is named, anonymous or hidden.
RuleEntryType type;
};
// The syntax half of a prepared grammar. Entries are addressed directly by
// `Symbol::index` for symbols whose `is_token` flag is false.
class SyntaxGrammar {
public:
// All syntax rules in a single list.
std::vector<RuleEntry> rules;
// Symbols for ubiquitous tokens that are handled by the parser.
std::set<rules::Symbol> ubiquitous_tokens;
// Sets of symbols declared as expected conflicts.
std::set<std::set<rules::Symbol>> expected_conflicts;
};
// The lexical half of a prepared grammar. Entries are addressed directly by
// `Symbol::index` for symbols whose `is_token` flag is true.
class LexicalGrammar {
public:
// All token rules in a single list.
std::vector<RuleEntry> rules;
// Separator rules; extract_tokens places the token-like ubiquitous rules here.
std::vector<rule_ptr> separators;
};
} // namespace tree_sitter
#endif // COMPILER_PREPARED_GRAMMAR_H_

View file

@ -4,11 +4,11 @@ namespace tree_sitter {
namespace rules {
Symbol END_OF_INPUT() {
return Symbol(-1, SymbolOptionToken);
return Symbol(-1, true);
}
Symbol ERROR() {
return Symbol(-2, SymbolOptionToken);
return Symbol(-2, true);
}
Symbol START() {

View file

@ -52,7 +52,7 @@ rule_ptr pattern(const string &value) {
}
rule_ptr str(const string &value) {
return token(prec(1, make_shared<rules::String>(value)));
return make_shared<rules::String>(value);
}
rule_ptr err(const rule_ptr &rule) {

View file

@ -10,16 +10,12 @@ using std::string;
using std::to_string;
using std::hash;
SymbolOption SymbolOptionAuxToken =
SymbolOption(SymbolOptionToken | SymbolOptionAuxiliary);
Symbol::Symbol(int index) : index(index), is_token(false) {}
Symbol::Symbol(int index) : index(index), options(SymbolOption(0)) {}
Symbol::Symbol(int index, SymbolOption options)
: index(index), options(options) {}
Symbol::Symbol(int index, bool is_token) : index(index), is_token(is_token) {}
bool Symbol::operator==(const Symbol &other) const {
return (other.index == index) && (other.options == options);
return (other.index == index) && (other.is_token == is_token);
}
bool Symbol::operator==(const Rule &rule) const {
@ -28,7 +24,7 @@ bool Symbol::operator==(const Rule &rule) const {
}
size_t Symbol::hash_code() const {
return hash<int>()(index) ^ hash<int16_t>()(options);
return hash<int>()(index) ^ hash<bool>()(is_token);
}
rule_ptr Symbol::copy() const {
@ -36,31 +32,22 @@ rule_ptr Symbol::copy() const {
}
string Symbol::to_string() const {
string name = (options & SymbolOptionAuxiliary) ? "aux_" : "";
name += (options & SymbolOptionToken) ? "token" : "sym";
string name = is_token ? "token" : "sym";
return "(" + name + " " + std::to_string(index) + ")";
}
bool Symbol::operator<(const Symbol &other) const {
if (options < other.options)
if (!is_token && other.is_token)
return true;
if (options > other.options)
if (is_token && !other.is_token)
return false;
return (index < other.index);
}
bool Symbol::is_token() const {
return options & SymbolOptionToken;
}
bool Symbol::is_built_in() const {
return index < 0;
}
bool Symbol::is_auxiliary() const {
return options & SymbolOptionAuxiliary;
}
void Symbol::accept(Visitor *visitor) const {
visitor->visit(this);
}

View file

@ -7,17 +7,10 @@
namespace tree_sitter {
namespace rules {
typedef enum {
SymbolOptionToken = 1 << 0,
SymbolOptionAuxiliary = 1 << 1,
} SymbolOption;
extern SymbolOption SymbolOptionAuxToken;
class Symbol : public Rule {
public:
explicit Symbol(int index);
Symbol(int index, SymbolOption options);
Symbol(int index, bool is_token);
bool operator==(const Symbol &other) const;
bool operator==(const Rule &other) const;
@ -28,12 +21,10 @@ class Symbol : public Rule {
void accept(Visitor *visitor) const;
bool operator<(const Symbol &other) const;
bool is_token() const;
bool is_built_in() const;
bool is_auxiliary() const;
int index;
SymbolOption options;
bool is_token;
};
} // namespace rules

View file

@ -1,21 +0,0 @@
#include "compiler/syntax_grammar.h"
#include <vector>
#include <string>
#include <utility>
#include "compiler/rules/symbol.h"
namespace tree_sitter {
using std::string;
// Looks up the rule body for `symbol`: auxiliary symbols index into
// `aux_rules`, every other symbol indexes into `rules`.
const rule_ptr &SyntaxGrammar::rule(const rules::Symbol &symbol) const {
  const auto &entries = symbol.is_auxiliary() ? aux_rules : rules;
  return entries[symbol.index].second;
}
// Looks up the name for `symbol`: auxiliary symbols index into `aux_rules`,
// every other symbol indexes into `rules`.
const string &SyntaxGrammar::rule_name(const rules::Symbol &symbol) const {
  if (symbol.is_auxiliary())
    return aux_rules[symbol.index].first;
  return rules[symbol.index].first;
}
} // namespace tree_sitter

View file

@ -1,26 +0,0 @@
#ifndef COMPILER_SYNTAX_GRAMMAR_H_
#define COMPILER_SYNTAX_GRAMMAR_H_
#include <vector>
#include <string>
#include <set>
#include <utility>
#include "tree_sitter/compiler.h"
#include "compiler/rules/symbol.h"
namespace tree_sitter {
// Grammar describing syntax rules, stored as (name, rule) pairs split across
// a regular list and an auxiliary list.
class SyntaxGrammar {
public:
// Returns the name (the pair's `first`) of the entry that `symbol` indexes;
// the list is chosen by `symbol.is_auxiliary()`.
const std::string &rule_name(const rules::Symbol &symbol) const;
// Returns the rule (the pair's `second`) of the entry that `symbol` indexes;
// the list is chosen by `symbol.is_auxiliary()`.
const rule_ptr &rule(const rules::Symbol &symbol) const;
// Entries addressed by non-auxiliary symbols.
std::vector<std::pair<std::string, rule_ptr>> rules;
// Entries addressed by auxiliary symbols.
std::vector<std::pair<std::string, rule_ptr>> aux_rules;
// Symbols for the grammar's ubiquitous tokens.
std::set<rules::Symbol> ubiquitous_tokens;
// Sets of symbols declared as expected conflicts.
std::set<std::set<rules::Symbol>> expected_conflicts;
};
} // namespace tree_sitter
#endif // COMPILER_SYNTAX_GRAMMAR_H_