In compiler, distinguish between anonymous tokens and hidden rules
parent 4b270c8604
commit 5982b77c97
46 changed files with 41131 additions and 40884 deletions
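The hunks below replace the Symbol option bit flags and the separate aux_rules vectors with a single RuleEntry record that carries an explicit entry type. A minimal C++ sketch of the data model this commit moves to, assembled from the prepared_grammar.h and rules/symbol.h hunks further down (the Rule and rule_ptr stand-ins here are placeholders, not the real tree-sitter types):

#include <memory>
#include <string>
#include <vector>

struct Rule {};                              // placeholder for tree_sitter::rules::Rule
using rule_ptr = std::shared_ptr<Rule>;

// Every rule in either grammar now records how it should surface in the output.
enum RuleEntryType {
  RuleEntryTypeNamed,      // ordinary named rule
  RuleEntryTypeAnonymous,  // anonymous token, e.g. a string literal
  RuleEntryTypeHidden,     // hidden rule, e.g. a pattern or a generated repeat helper
};

struct RuleEntry {
  std::string name;
  rule_ptr rule;
  RuleEntryType type;
};

// A symbol is now just an index plus a flag saying which grammar it points into,
// replacing the old SymbolOptionToken / SymbolOptionAuxiliary bit flags.
struct Symbol {
  int index;
  bool is_token;
};

struct SyntaxGrammar  { std::vector<RuleEntry> rules; };
struct LexicalGrammar { std::vector<RuleEntry> rules; std::vector<rule_ptr> separators; };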
@@ -10,7 +10,7 @@
 #include "compiler/build_tables/get_metadata.h"
 #include "compiler/build_tables/lex_item.h"
 #include "compiler/parse_table.h"
-#include "compiler/lexical_grammar.h"
+#include "compiler/prepared_grammar.h"
 #include "compiler/rules/built_in_symbols.h"
 #include "compiler/rules/choice.h"
 #include "compiler/rules/metadata.h"
@@ -63,9 +63,9 @@ class LexTableBuilder {
       result.insert(
           LexItem(symbol, after_separators(CharacterSet().include(0).copy())));
-    else if (symbol.is_token())
-      result.insert(
-          LexItem(symbol, after_separators(lex_grammar.rule(symbol))));
+    else if (symbol.is_token)
+      result.insert(LexItem(
+          symbol, after_separators(lex_grammar.rules[symbol.index].rule)));
     }
     return result;
   }
@@ -11,8 +11,7 @@
 #include "compiler/build_tables/parse_item.h"
 #include "compiler/build_tables/get_completion_status.h"
 #include "compiler/build_tables/get_metadata.h"
-#include "compiler/lexical_grammar.h"
-#include "compiler/syntax_grammar.h"
+#include "compiler/prepared_grammar.h"
 #include "compiler/rules/symbol.h"
 #include "compiler/rules/built_in_symbols.h"

@@ -48,9 +47,8 @@ class ParseTableBuilder {
       conflict_manager(grammar) {}

   pair<ParseTable, const GrammarError *> build() {
-    auto start_symbol = grammar.rules.empty()
-                          ? make_shared<Symbol>(0, rules::SymbolOptionToken)
-                          : make_shared<Symbol>(0);
+    auto start_symbol = grammar.rules.empty() ? make_shared<Symbol>(0, true)
+                                              : make_shared<Symbol>(0);
     ParseItem start_item(rules::START(), start_symbol, {});
     add_parse_state(
       item_set_closure(start_item, { rules::END_OF_INPUT() }, grammar));
@@ -260,10 +258,10 @@ class ParseTableBuilder {
         return "END_OF_INPUT";
       else
         return "";
-    } else if (symbol.is_token())
-      return lexical_grammar.rule_name(symbol);
+    } else if (symbol.is_token)
+      return lexical_grammar.rules[symbol.index].name;
     else
-      return grammar.rule_name(symbol);
+      return grammar.rules[symbol.index].name;
   }

   string action_description(const ParseAction &action) const {
@@ -1,7 +1,6 @@
 #include "compiler/build_tables/build_lex_table.h"
 #include "compiler/build_tables/build_parse_table.h"
-#include "compiler/lexical_grammar.h"
-#include "compiler/syntax_grammar.h"
+#include "compiler/prepared_grammar.h"

 namespace tree_sitter {
 namespace build_tables {
@@ -1,6 +1,6 @@
 #include "compiler/build_tables/first_symbols.h"
 #include "compiler/build_tables/rule_can_be_blank.h"
-#include "compiler/syntax_grammar.h"
+#include "compiler/prepared_grammar.h"
 #include "compiler/rules/choice.h"
 #include "compiler/rules/metadata.h"
 #include "compiler/rules/seq.h"
@@ -28,8 +28,8 @@ class FirstSymbols : public rules::RuleFn<set<Symbol>> {
       return set<Symbol>();

     set<Symbol> result({ *rule });
-    if (!rule->is_token()) {
-      set<Symbol> &&symbols = apply(grammar->rule(*rule));
+    if (!rule->is_token) {
+      set<Symbol> &&symbols = apply(grammar->rules[rule->index].rule);
       result.insert(symbols.begin(), symbols.end());
     }

@@ -7,7 +7,7 @@
 #include "compiler/build_tables/rule_transitions.h"
 #include "compiler/build_tables/rule_can_be_blank.h"
 #include "compiler/build_tables/item.h"
-#include "compiler/syntax_grammar.h"
+#include "compiler/prepared_grammar.h"

 namespace tree_sitter {
 namespace build_tables {
@@ -41,7 +41,7 @@ const ParseItemSet item_set_closure(const ParseItem &starting_item,
     const Symbol &symbol = pair.first;
     const rule_ptr &next_rule = pair.second;

-    if (symbol.is_token() || symbol.is_built_in())
+    if (symbol.is_token || symbol.is_built_in())
       continue;

     set<Symbol> next_lookahead_symbols = first_symbols(next_rule, grammar);
@@ -49,8 +49,9 @@ const ParseItemSet item_set_closure(const ParseItem &starting_item,
       next_lookahead_symbols.insert(lookahead_symbols.begin(),
                                     lookahead_symbols.end());

-      items_to_process.push_back({ ParseItem(symbol, grammar.rule(symbol), {}),
-                                   next_lookahead_symbols });
+      items_to_process.push_back(
+        { ParseItem(symbol, grammar.rules[symbol.index].rule, {}),
+          next_lookahead_symbols });
     }
   }
@@ -4,7 +4,7 @@
 #include "compiler/build_tables/merge_transitions.h"
 #include "compiler/build_tables/parse_item.h"
 #include "compiler/build_tables/rule_transitions.h"
-#include "compiler/syntax_grammar.h"
+#include "compiler/prepared_grammar.h"
 #include "compiler/rules/symbol.h"

 namespace tree_sitter {
@@ -2,7 +2,7 @@
 #define COMPILER_BUILD_TABLES_LEX_CONFLICT_MANAGER_H_

 #include "tree_sitter/compiler.h"
-#include "compiler/lexical_grammar.h"
+#include "compiler/prepared_grammar.h"

 namespace tree_sitter {

@@ -3,8 +3,7 @@

 #include <utility>
 #include "tree_sitter/compiler.h"
-#include "compiler/syntax_grammar.h"
-#include "compiler/lexical_grammar.h"
+#include "compiler/prepared_grammar.h"
 #include "compiler/build_tables/parse_item.h"

 namespace tree_sitter {
@@ -1,7 +1,7 @@
 #include "compiler/build_tables/rule_can_be_blank.h"
 #include <set>
 #include "tree_sitter/compiler.h"
-#include "compiler/syntax_grammar.h"
+#include "compiler/prepared_grammar.h"
 #include "compiler/rules/symbol.h"
 #include "compiler/rules/visitor.h"
 #include "compiler/rules/seq.h"
@@ -55,7 +55,7 @@ class CanBeBlankRecursive : public CanBeBlank {
   bool apply_to(const rules::Symbol *rule) {
     if (visited_symbols.find(*rule) == visited_symbols.end()) {
       visited_symbols.insert(*rule);
-      return !rule->is_token() && apply(grammar->rule(*rule));
+      return !rule->is_token && apply(grammar->rules[rule->index].rule);
     } else {
       return false;
     }
@@ -2,8 +2,7 @@
 #include "compiler/prepare_grammar/prepare_grammar.h"
 #include "compiler/build_tables/build_tables.h"
 #include "compiler/generate_code/c_code.h"
-#include "compiler/syntax_grammar.h"
-#include "compiler/lexical_grammar.h"
+#include "compiler/prepared_grammar.h"

 namespace tree_sitter {

@@ -7,8 +7,7 @@
 #include "compiler/generate_code/c_code.h"
 #include "compiler/lex_table.h"
 #include "compiler/parse_table.h"
-#include "compiler/lexical_grammar.h"
-#include "compiler/syntax_grammar.h"
+#include "compiler/prepared_grammar.h"
 #include "compiler/rules/built_in_symbols.h"
 #include "compiler/util/string_helpers.h"

@@ -142,7 +141,7 @@ class CCodeGenerator {
     indent([&]() {
       for (const auto &symbol : parse_table.symbols)
         if (!symbol.is_built_in() &&
-            (symbol.is_auxiliary() || rule_name(symbol)[0] == '_'))
+            (is_auxiliary(symbol) || rule_name(symbol)[0] == '_'))
           line("[" + symbol_id(symbol) + "] = 1,");
     });
     line("};");
@@ -329,7 +328,7 @@ class CCodeGenerator {
       return "";
     } else {
       string name = sanitize_name(rule_name(symbol));
-      if (symbol.is_auxiliary())
+      if (is_auxiliary(symbol))
         return "aux_sym_" + name;
       else
         return "sym_" + name;
@@ -349,9 +348,20 @@ class CCodeGenerator {
     }
   }

+  bool is_auxiliary(const rules::Symbol &symbol) {
+    if (symbol.is_token) {
+      return lexical_grammar.rules[symbol.index].type != RuleEntryTypeNamed;
+    } else {
+      return syntax_grammar.rules[symbol.index].type != RuleEntryTypeNamed;
+    }
+  }
+
   string rule_name(const rules::Symbol &symbol) {
-    return symbol.is_token() ? lexical_grammar.rule_name(symbol)
-                             : syntax_grammar.rule_name(symbol);
+    if (symbol.is_token) {
+      return lexical_grammar.rules[symbol.index].name;
+    } else {
+      return syntax_grammar.rules[symbol.index].name;
+    }
   }

   bool reduce_action_is_fragile(const ParseAction &action) const {
@@ -394,15 +404,14 @@ class CCodeGenerator {
       if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') ||
          ('0' <= c && c <= '9') || (c == '_')) {
         stripped_name += c;
-        continue;
-      }
-
-      auto replacement = REPLACEMENTS.find(c);
-      if (replacement != REPLACEMENTS.end()) {
-        if (stripped_name[stripped_name.size() - 1] != '_')
-          stripped_name += "_";
-        stripped_name += replacement->second;
-        continue;
+      } else {
+        auto replacement = REPLACEMENTS.find(c);
+        size_t i = stripped_name.size();
+        if (replacement != REPLACEMENTS.end()) {
+          if (i > 0 && stripped_name[i - 1] != '_')
+            stripped_name += "_";
+          stripped_name += replacement->second;
+        }
       }
     }

@@ -1,24 +0,0 @@
-#include "compiler/lexical_grammar.h"
-#include <vector>
-#include <string>
-#include <utility>
-#include "compiler/rules/symbol.h"
-
-namespace tree_sitter {
-
-using std::string;
-using std::pair;
-using std::vector;
-using std::set;
-
-const rule_ptr &LexicalGrammar::rule(const rules::Symbol &symbol) const {
-  return symbol.is_auxiliary() ? aux_rules[symbol.index].second
-                               : rules[symbol.index].second;
-}
-
-const string &LexicalGrammar::rule_name(const rules::Symbol &symbol) const {
-  return symbol.is_auxiliary() ? aux_rules[symbol.index].first
-                               : rules[symbol.index].first;
-}
-
-} // namespace tree_sitter
@@ -1,24 +0,0 @@
-#ifndef COMPILER_LEXICAL_GRAMMAR_H_
-#define COMPILER_LEXICAL_GRAMMAR_H_
-
-#include <vector>
-#include <string>
-#include <utility>
-#include "tree_sitter/compiler.h"
-#include "compiler/rules/symbol.h"
-
-namespace tree_sitter {
-
-class LexicalGrammar {
- public:
-  const std::string &rule_name(const rules::Symbol &symbol) const;
-  const rule_ptr &rule(const rules::Symbol &symbol) const;
-
-  std::vector<std::pair<std::string, rule_ptr>> rules;
-  std::vector<std::pair<std::string, rule_ptr>> aux_rules;
-  std::vector<rule_ptr> separators;
-};
-
-} // namespace tree_sitter
-
-#endif // COMPILER_LEXICAL_GRAMMAR_H_
@@ -2,7 +2,7 @@
 #include <vector>
 #include <string>
 #include <utility>
-#include "compiler/syntax_grammar.h"
+#include "compiler/prepared_grammar.h"
 #include "compiler/rules/visitor.h"
 #include "compiler/rules/seq.h"
 #include "compiler/rules/symbol.h"
@@ -40,12 +40,14 @@ class ExpandRepeats : public rules::IdentityRuleFn {
     size_t index = aux_rules.size();
     string helper_rule_name =
       rule_name + string("_repeat") + to_string(++repeat_count);
-    Symbol repeat_symbol(offset + index, rules::SymbolOptionAuxiliary);
+    Symbol repeat_symbol(offset + index);
     existing_repeats.push_back({ rule->copy(), repeat_symbol });
-    aux_rules.push_back(
-      { helper_rule_name,
-        Seq::build({ inner_rule, Choice::build({ repeat_symbol.copy(),
-                                                 make_shared<Blank>() }) }) });
+    aux_rules.push_back({
+      helper_rule_name,
+      Seq::build({ inner_rule, Choice::build({ repeat_symbol.copy(),
+                                               make_shared<Blank>() }) }),
+      RuleEntryTypeHidden,
+    });
     return repeat_symbol.copy();
   }

@@ -62,22 +64,21 @@ class ExpandRepeats : public rules::IdentityRuleFn {
     return apply(rule);
   }

-  vector<pair<string, rule_ptr>> aux_rules;
+  vector<RuleEntry> aux_rules;
 };

 SyntaxGrammar expand_repeats(const SyntaxGrammar &grammar) {
   SyntaxGrammar result;
-  result.aux_rules = grammar.aux_rules;
+  result.rules = grammar.rules;
   result.ubiquitous_tokens = grammar.ubiquitous_tokens;
   result.expected_conflicts = grammar.expected_conflicts;

-  ExpandRepeats expander(result.aux_rules.size());
-  for (auto &pair : grammar.rules)
-    result.rules.push_back(
-      { pair.first, expander.expand(pair.second, pair.first) });
+  ExpandRepeats expander(result.rules.size());
+  for (auto &rule_entry : result.rules)
+    rule_entry.rule = expander.expand(rule_entry.rule, rule_entry.name);

-  result.aux_rules.insert(result.aux_rules.end(), expander.aux_rules.begin(),
-                          expander.aux_rules.end());
+  result.rules.insert(result.rules.end(), expander.aux_rules.begin(),
+                      expander.aux_rules.end());
   return result;
 }

@@ -2,12 +2,14 @@
 #include <vector>
 #include <string>
 #include <utility>
-#include "compiler/lexical_grammar.h"
+#include <map>
+#include "compiler/prepared_grammar.h"
 #include "compiler/rules/visitor.h"
 #include "compiler/rules/pattern.h"
 #include "compiler/rules/string.h"
 #include "compiler/rules/blank.h"
 #include "compiler/rules/seq.h"
+#include "compiler/rules/metadata.h"
 #include "compiler/rules/character_set.h"
 #include "compiler/prepare_grammar/parse_regex.h"
 #include "utf8proc.h"
@@ -17,10 +19,12 @@ namespace prepare_grammar {

 using std::string;
 using std::vector;
+using std::map;
 using std::pair;
 using std::make_shared;
 using rules::String;
 using rules::Pattern;
+using rules::Metadata;

 class ExpandTokens : public rules::IdentityRuleFn {
   using rules::IdentityRuleFn::apply_to;
@@ -40,7 +44,11 @@ class ExpandTokens : public rules::IdentityRuleFn {
       elements.push_back(rules::CharacterSet().include(el).copy());
     }

-    return rules::Seq::build(elements);
+    return make_shared<rules::Metadata>(
+      rules::Seq::build(elements),
+      std::map<rules::MetadataKey, int>({
+        { rules::IS_TOKEN, 1 }, { rules::PRECEDENCE, 1 },
+      }));
   }

   rule_ptr apply_to(const Pattern *rule) {
@@ -60,18 +68,11 @@ pair<LexicalGrammar, const GrammarError *> expand_tokens(
   LexicalGrammar result;
   ExpandTokens expander;

-  for (auto &pair : grammar.rules) {
-    auto rule = expander.apply(pair.second);
+  for (auto &entry : grammar.rules) {
+    auto rule = expander.apply(entry.rule);
     if (expander.error)
       return { result, expander.error };
-    result.rules.push_back({ pair.first, rule });
-  }
-
-  for (auto &pair : grammar.aux_rules) {
-    auto rule = expander.apply(pair.second);
-    if (expander.error)
-      return { result, expander.error };
-    result.aux_rules.push_back({ pair.first, rule });
+    result.rules.push_back({ entry.name, rule, entry.type });
   }

   for (auto &sep : grammar.separators) {
@@ -81,9 +82,7 @@ pair<LexicalGrammar, const GrammarError *> expand_tokens(
     result.separators.push_back(rule);
   }

-  return {
-    result, nullptr,
-  };
+  return { result, nullptr };
 }

 } // namespace prepare_grammar
@@ -4,8 +4,7 @@
 #include <set>
 #include <string>
 #include "tree_sitter/compiler.h"
-#include "compiler/lexical_grammar.h"
-#include "compiler/syntax_grammar.h"
+#include "compiler/prepared_grammar.h"
 #include "compiler/rules/visitor.h"
 #include "compiler/rules/symbol.h"
 #include "compiler/rules/string.h"
@@ -21,27 +20,15 @@ using std::dynamic_pointer_cast;
 using std::make_shared;
 using std::make_tuple;
 using std::map;
-using std::pair;
 using std::set;
 using std::string;
 using std::tuple;
 using std::vector;
 using rules::Symbol;
-using rules::SymbolOptionToken;
-using rules::SymbolOptionAuxToken;

 class SymbolReplacer : public rules::IdentityRuleFn {
   using rules::IdentityRuleFn::apply_to;

-  int new_index_for_symbol(const Symbol &symbol) {
-    int result = symbol.index;
-    for (const auto &pair : replacements)
-      if (pair.first.index < symbol.index &&
-          pair.first.is_auxiliary() == symbol.is_auxiliary())
-        result--;
-    return result;
-  }
-
   rule_ptr apply_to(const Symbol *rule) {
     return replace_symbol(*rule).copy();
   }
@@ -49,54 +36,64 @@ class SymbolReplacer : public rules::IdentityRuleFn {
  public:
  map<Symbol, Symbol> replacements;

-  Symbol replace_symbol(const Symbol &rule) {
-    if (rule.is_built_in())
-      return rule;
-    auto replacement_pair = replacements.find(rule);
+  Symbol replace_symbol(const Symbol &symbol) {
+    if (symbol.is_built_in() || symbol.is_token)
+      return symbol;
+
+    auto replacement_pair = replacements.find(symbol);
     if (replacement_pair != replacements.end())
       return replacement_pair->second;
-    else
-      return Symbol(new_index_for_symbol(rule), rule.options);
+
+    int new_index = symbol.index;
+    for (const auto &pair : replacements)
+      if (pair.first.index < symbol.index)
+        new_index--;
+    return Symbol(new_index);
  }
 };

 class TokenExtractor : public rules::IdentityRuleFn {
-  rule_ptr apply_to_token(const Rule *input) {
-    auto rule = input->copy();
+  using rules::IdentityRuleFn::apply_to;
+
+  rule_ptr apply_to_token(const Rule *input, RuleEntryType entry_type) {
     for (size_t i = 0; i < tokens.size(); i++)
-      if (tokens[i].second->operator==(*rule)) {
+      if (tokens[i].rule->operator==(*input)) {
        token_usage_counts[i]++;
-        return make_shared<Symbol>(i, SymbolOptionAuxToken);
+        return make_shared<Symbol>(i, true);
      }

+    rule_ptr rule = input->copy();
    size_t index = tokens.size();
-    tokens.push_back({ token_description(rule), rule });
+    tokens.push_back({
+      token_description(rule), rule, entry_type,
+    });
    token_usage_counts.push_back(1);
-    return make_shared<Symbol>(index, SymbolOptionAuxToken);
+    return make_shared<Symbol>(index, true);
  }

-  rule_ptr default_apply(const Rule *rule) {
-    auto result = rule->copy();
-    if (is_token(result))
-      return apply_to_token(rule);
-    else
-      return result;
+  rule_ptr apply_to(const rules::String *rule) {
+    return apply_to_token(rule, RuleEntryTypeAnonymous);
  }

+  rule_ptr apply_to(const rules::Pattern *rule) {
+    return apply_to_token(rule, RuleEntryTypeHidden);
+  }
+
  rule_ptr apply_to(const rules::Metadata *rule) {
-    if (is_token(rule->copy()))
-      return apply_to_token(rule);
+    if (rule->value_for(rules::IS_TOKEN) > 0)
+      return apply_to_token(rule->rule.get(), RuleEntryTypeHidden);
    else
      return rules::IdentityRuleFn::apply_to(rule);
  }

 public:
  vector<size_t> token_usage_counts;
-  vector<pair<string, rule_ptr>> tokens;
+  vector<RuleEntry> tokens;
 };

-static const GrammarError *ubiq_token_err(const string &msg) {
+static const GrammarError *ubiq_token_err(const string &message) {
  return new GrammarError(GrammarErrorTypeInvalidUbiquitousToken,
-                          "Not a token: " + msg);
+                          "Not a token: " + message);
 }

@@ -106,51 +103,43 @@ tuple<SyntaxGrammar, LexicalGrammar, const GrammarError *> extract_tokens(
   SymbolReplacer symbol_replacer;
   TokenExtractor extractor;

-  vector<pair<string, rule_ptr>> extracted_rules;
-  for (auto &pair : grammar.rules)
-    extracted_rules.push_back({ pair.first, extractor.apply(pair.second) });
+  /*
+   * First, extract all of the grammar's tokens into the lexical grammar.
+   */
+  vector<RuleEntry> processed_rules;
+  for (const auto &pair : grammar.rules)
+    processed_rules.push_back({
+      pair.first, extractor.apply(pair.second), RuleEntryTypeNamed,
+    });
+  lexical_grammar.rules = extractor.tokens;

+  /*
+   * If a rule's entire content was extracted as a token and that token didn't
+   * appear within any other rule, then remove that rule from the syntax
+   * grammar, giving its name to the token in the lexical grammar. Any symbols
+   * that pointed to that rule will need to be updated to point to the rule in
+   * the lexical grammar. Symbols that pointed to later rules will need to have
+   * their indices decremented.
+   */
   size_t i = 0;
-  for (auto &pair : extracted_rules) {
-    auto &rule = pair.second;
-    auto symbol = dynamic_pointer_cast<const Symbol>(rule);
-    if (symbol.get() && symbol->is_auxiliary() &&
+  for (const RuleEntry &entry : processed_rules) {
+    auto symbol = dynamic_pointer_cast<const Symbol>(entry.rule);
+    if (symbol.get() && symbol->is_token && !symbol->is_built_in() &&
         extractor.token_usage_counts[symbol->index] == 1) {
-      lexical_grammar.rules.push_back(
-        { pair.first, extractor.tokens[symbol->index].second });
       extractor.token_usage_counts[symbol->index] = 0;
-      symbol_replacer.replacements.insert(
-        { Symbol(i),
-          Symbol(lexical_grammar.rules.size() - 1, SymbolOptionToken) });
+      lexical_grammar.rules[symbol->index].type = entry.type;
+      lexical_grammar.rules[symbol->index].name = entry.name;
+      symbol_replacer.replacements.insert({ Symbol(i), *symbol });
     } else {
-      syntax_grammar.rules.push_back(pair);
+      syntax_grammar.rules.push_back(entry);
     }
     i++;
   }

-  for (auto &pair : syntax_grammar.rules)
-    pair.second = symbol_replacer.apply(pair.second);
-
-  lexical_grammar.aux_rules = extractor.tokens;
-
-  for (auto &rule : grammar.ubiquitous_tokens) {
-    if (is_token(rule)) {
-      lexical_grammar.separators.push_back(rule);
-    } else {
-      auto sym = dynamic_pointer_cast<const Symbol>(extractor.apply(rule));
-      if (!sym.get())
-        return make_tuple(syntax_grammar, lexical_grammar,
-                          ubiq_token_err(rule->to_string()));
-
-      Symbol symbol = symbol_replacer.replace_symbol(*sym);
-      if (!symbol.is_token())
-        return make_tuple(
-          syntax_grammar, lexical_grammar,
-          ubiq_token_err(syntax_grammar.rules[symbol.index].first));
-
-      syntax_grammar.ubiquitous_tokens.insert(symbol);
-    }
-  }
+  /*
+   * Perform any replacements of symbols needed based on the previous step.
+   */
+  for (RuleEntry &entry : syntax_grammar.rules)
+    entry.rule = symbol_replacer.apply(entry.rule);

   for (auto &symbol_set : grammar.expected_conflicts) {
     set<Symbol> new_symbol_set;
@@ -159,6 +148,33 @@ tuple<SyntaxGrammar, LexicalGrammar, const GrammarError *> extract_tokens(
     syntax_grammar.expected_conflicts.insert(new_symbol_set);
   }

+  /*
+   * The grammar's ubiquitous tokens can be either token rules or symbols
+   * pointing to token rules. If they are symbols, then they'll be handled by
+   * the parser; add them to the syntax grammar's ubiqutous tokens. If they
+   * are anonymous rules, they can be handled by the lexer; add them to the
+   * lexical grammar's separator rules.
+   */
+  for (const rule_ptr &rule : grammar.ubiquitous_tokens) {
+    if (is_token(rule)) {
+      lexical_grammar.separators.push_back(rule);
+      continue;
+    }
+
+    auto symbol = dynamic_pointer_cast<const Symbol>(rule);
+    if (!symbol.get())
+      return make_tuple(syntax_grammar, lexical_grammar,
+                        ubiq_token_err(rule->to_string()));
+
+    Symbol new_symbol = symbol_replacer.replace_symbol(*symbol);
+    if (!new_symbol.is_token)
+      return make_tuple(
+        syntax_grammar, lexical_grammar,
+        ubiq_token_err(syntax_grammar.rules[new_symbol.index].name));
+
+    syntax_grammar.ubiquitous_tokens.insert(new_symbol);
+  }
+
   return make_tuple(syntax_grammar, lexical_grammar, nullptr);
 }
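The TokenExtractor hunk above is where the anonymous/hidden distinction is actually decided: string rules become anonymous tokens, patterns and explicit token() metadata become hidden tokens, and a rule whose whole body was extracted keeps its own name. A small self-contained sketch of that mapping (RuleKind is a hypothetical stand-in for the real visitor dispatch, not a tree-sitter type):

enum RuleEntryType { RuleEntryTypeNamed, RuleEntryTypeAnonymous, RuleEntryTypeHidden };

// Hypothetical stand-in for the rule classes the visitor dispatches on.
enum class RuleKind { String, Pattern, TokenMetadata };

RuleEntryType entry_type_for(RuleKind kind) {
  switch (kind) {
    case RuleKind::String:        return RuleEntryTypeAnonymous;  // str("+") -> anonymous token
    case RuleKind::Pattern:       return RuleEntryTypeHidden;     // pattern("...") -> hidden token
    case RuleKind::TokenMetadata: return RuleEntryTypeHidden;     // token(...) wrapper -> hidden token
  }
  return RuleEntryTypeNamed;  // a rule extracted whole keeps its name and Named type
}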
@@ -3,8 +3,7 @@
 #include "compiler/prepare_grammar/extract_tokens.h"
 #include "compiler/prepare_grammar/intern_symbols.h"
 #include "compiler/prepare_grammar/prepare_grammar.h"
-#include "compiler/lexical_grammar.h"
-#include "compiler/syntax_grammar.h"
+#include "compiler/prepared_grammar.h"

 namespace tree_sitter {
 namespace prepare_grammar {
@@ -2,8 +2,7 @@
 #define COMPILER_PREPARE_GRAMMAR_PREPARE_GRAMMAR_H_

 #include <utility>
-#include "compiler/lexical_grammar.h"
-#include "compiler/syntax_grammar.h"
+#include "compiler/prepared_grammar.h"

 namespace tree_sitter {

@@ -5,6 +5,7 @@
 #include "compiler/rules/seq.h"
 #include "compiler/rules/choice.h"
 #include "compiler/rules/string.h"
+#include "compiler/rules/repeat.h"
 #include "compiler/rules/metadata.h"
 #include "compiler/util/string_helpers.h"

@@ -15,11 +16,12 @@ using std::string;

 class TokenDescription : public rules::RuleFn<string> {
   string apply_to(const rules::Pattern *rule) {
-    return "PAT_" + util::escape_string(rule->value);
+    is_trivial = false;
+    return rule->value;
   }

   string apply_to(const rules::String *rule) {
-    return "STR_" + util::escape_string(rule->value);
+    return rule->value;
   }

   string apply_to(const rules::Metadata *rule) {
@@ -27,19 +29,41 @@ class TokenDescription : public rules::RuleFn<string> {
   }

   string apply_to(const rules::Seq *rule) {
-    return "(seq " + apply(rule->left) + " " + apply(rule->right) + ")";
+    is_trivial = false;
+    return apply(rule->left) + apply(rule->right);
   }

+  string apply_to(const rules::Repeat *rule) {
+    is_trivial = false;
+    return apply(rule->content) + "*";
+  }
+
   string apply_to(const rules::Choice *rule) {
-    string result = "(choice";
-    for (auto &element : rule->elements)
-      result += " " + apply(element);
+    is_trivial = false;
+    string result = "(";
+    bool started = false;
+    for (auto &element : rule->elements) {
+      if (started)
+        result += "|";
+      result += apply(element);
+      started = true;
+    }
     return result + ")";
   }

+ public:
+  bool is_trivial;
+
+  TokenDescription() : is_trivial(true) {}
 };

-std::string token_description(const rule_ptr &rule) {
-  return TokenDescription().apply(rule);
+string token_description(const rule_ptr &rule) {
+  TokenDescription description;
+  string result = description.apply(rule);
+  if (description.is_trivial)
+    return result;
+  else
+    return "/" + result + "/";
 }

 } // namespace prepare_grammar
src/compiler/prepared_grammar.h (new file, 39 lines)
@@ -0,0 +1,39 @@
+#ifndef COMPILER_PREPARED_GRAMMAR_H_
+#define COMPILER_PREPARED_GRAMMAR_H_
+
+#include <vector>
+#include <string>
+#include <set>
+#include "tree_sitter/compiler.h"
+#include "compiler/rules/symbol.h"
+
+namespace tree_sitter {
+
+enum RuleEntryType {
+  RuleEntryTypeNamed,
+  RuleEntryTypeAnonymous,
+  RuleEntryTypeHidden,
+};
+
+struct RuleEntry {
+  std::string name;
+  rule_ptr rule;
+  RuleEntryType type;
+};
+
+class SyntaxGrammar {
+ public:
+  std::vector<RuleEntry> rules;
+  std::set<rules::Symbol> ubiquitous_tokens;
+  std::set<std::set<rules::Symbol>> expected_conflicts;
+};
+
+class LexicalGrammar {
+ public:
+  std::vector<RuleEntry> rules;
+  std::vector<rule_ptr> separators;
+};
+
+} // namespace tree_sitter
+
+#endif // COMPILER_PREPARED_GRAMMAR_H_
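With both grammars storing plain RuleEntry vectors, the lookup pattern repeated throughout this commit becomes a straight index into whichever grammar the symbol's is_token flag selects. A brief sketch, assuming the types declared in the new header above:

// Resolve a symbol against the merged grammars (sketch only).
const RuleEntry &entry_for(const Symbol &symbol, const SyntaxGrammar &syntax,
                           const LexicalGrammar &lexical) {
  return symbol.is_token ? lexical.rules[symbol.index]
                         : syntax.rules[symbol.index];
}

// Per the c_code.cc hunks, anything that is not a named rule (anonymous token
// or hidden helper) is treated as auxiliary when generating symbol identifiers.
bool is_auxiliary(const RuleEntry &entry) {
  return entry.type != RuleEntryTypeNamed;
}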
@@ -4,11 +4,11 @@ namespace tree_sitter {
 namespace rules {

 Symbol END_OF_INPUT() {
-  return Symbol(-1, SymbolOptionToken);
+  return Symbol(-1, true);
 }

 Symbol ERROR() {
-  return Symbol(-2, SymbolOptionToken);
+  return Symbol(-2, true);
 }

 Symbol START() {
@@ -52,7 +52,7 @@ rule_ptr pattern(const string &value) {
 }

 rule_ptr str(const string &value) {
-  return token(prec(1, make_shared<rules::String>(value)));
+  return make_shared<rules::String>(value);
 }

 rule_ptr err(const rule_ptr &rule) {
@@ -10,16 +10,12 @@
 using std::to_string;
 using std::hash;

-SymbolOption SymbolOptionAuxToken =
-  SymbolOption(SymbolOptionToken | SymbolOptionAuxiliary);
+Symbol::Symbol(int index) : index(index), is_token(false) {}

-Symbol::Symbol(int index) : index(index), options(SymbolOption(0)) {}
-
-Symbol::Symbol(int index, SymbolOption options)
-    : index(index), options(options) {}
+Symbol::Symbol(int index, bool is_token) : index(index), is_token(is_token) {}

 bool Symbol::operator==(const Symbol &other) const {
-  return (other.index == index) && (other.options == options);
+  return (other.index == index) && (other.is_token == is_token);
 }

 bool Symbol::operator==(const Rule &rule) const {
@@ -28,7 +24,7 @@ bool Symbol::operator==(const Rule &rule) const {
 }

 size_t Symbol::hash_code() const {
-  return hash<int>()(index) ^ hash<int16_t>()(options);
+  return hash<int>()(index) ^ hash<bool>()(is_token);
 }

 rule_ptr Symbol::copy() const {
@@ -36,31 +32,22 @@ rule_ptr Symbol::copy() const {
 }

 string Symbol::to_string() const {
-  string name = (options & SymbolOptionAuxiliary) ? "aux_" : "";
-  name += (options & SymbolOptionToken) ? "token" : "sym";
+  string name = is_token ? "token" : "sym";
   return "(" + name + " " + std::to_string(index) + ")";
 }

 bool Symbol::operator<(const Symbol &other) const {
-  if (options < other.options)
+  if (!is_token && other.is_token)
     return true;
-  if (options > other.options)
+  if (is_token && !other.is_token)
     return false;
   return (index < other.index);
 }

-bool Symbol::is_token() const {
-  return options & SymbolOptionToken;
-}
-
 bool Symbol::is_built_in() const {
   return index < 0;
 }

-bool Symbol::is_auxiliary() const {
-  return options & SymbolOptionAuxiliary;
-}
-
 void Symbol::accept(Visitor *visitor) const {
   visitor->visit(this);
 }
@@ -7,17 +7,10 @@
 namespace tree_sitter {
 namespace rules {

-typedef enum {
-  SymbolOptionToken = 1 << 0,
-  SymbolOptionAuxiliary = 1 << 1,
-} SymbolOption;
-
-extern SymbolOption SymbolOptionAuxToken;
-
 class Symbol : public Rule {
  public:
   explicit Symbol(int index);
-  Symbol(int index, SymbolOption options);
+  Symbol(int index, bool is_token);

   bool operator==(const Symbol &other) const;
   bool operator==(const Rule &other) const;
@@ -28,12 +21,10 @@ class Symbol : public Rule {
   void accept(Visitor *visitor) const;

   bool operator<(const Symbol &other) const;
-  bool is_token() const;
   bool is_built_in() const;
-  bool is_auxiliary() const;

   int index;
-  SymbolOption options;
+  bool is_token;
 };

 } // namespace rules
@@ -1,21 +0,0 @@
-#include "compiler/syntax_grammar.h"
-#include <vector>
-#include <string>
-#include <utility>
-#include "compiler/rules/symbol.h"
-
-namespace tree_sitter {
-
-using std::string;
-
-const rule_ptr &SyntaxGrammar::rule(const rules::Symbol &symbol) const {
-  return symbol.is_auxiliary() ? aux_rules[symbol.index].second
-                               : rules[symbol.index].second;
-}
-
-const string &SyntaxGrammar::rule_name(const rules::Symbol &symbol) const {
-  return symbol.is_auxiliary() ? aux_rules[symbol.index].first
-                               : rules[symbol.index].first;
-}
-
-} // namespace tree_sitter
@@ -1,26 +0,0 @@
-#ifndef COMPILER_SYNTAX_GRAMMAR_H_
-#define COMPILER_SYNTAX_GRAMMAR_H_
-
-#include <vector>
-#include <string>
-#include <set>
-#include <utility>
-#include "tree_sitter/compiler.h"
-#include "compiler/rules/symbol.h"
-
-namespace tree_sitter {
-
-class SyntaxGrammar {
- public:
-  const std::string &rule_name(const rules::Symbol &symbol) const;
-  const rule_ptr &rule(const rules::Symbol &symbol) const;
-
-  std::vector<std::pair<std::string, rule_ptr>> rules;
-  std::vector<std::pair<std::string, rule_ptr>> aux_rules;
-  std::set<rules::Symbol> ubiquitous_tokens;
-  std::set<std::set<rules::Symbol>> expected_conflicts;
-};
-
-} // namespace tree_sitter
-
-#endif // COMPILER_SYNTAX_GRAMMAR_H_