Separate syntax rules into flat lists of symbols

This way, every ParseItem can be associated with a particular production
for its non-terminal. That lets us keep track of which productions are
involved in shift/reduce conflicts.
This commit is contained in:
Max Brunsfeld 2015-01-11 23:21:58 -08:00
parent 68a0e16d1e
commit 52daffb3f3
37 changed files with 842 additions and 426 deletions

View file

@@ -5,7 +5,6 @@
#include "tree_sitter/compiler.h"
#include "compiler/parse_table.h"
#include "compiler/rules/symbol.h"
#include "compiler/syntax_grammar.h"
namespace tree_sitter {
namespace build_tables {

View file

@@ -40,12 +40,8 @@ class ParseTableBuilder {
: grammar(grammar), lex_grammar(lex_grammar) {}
pair<ParseTable, vector<Conflict>> build() {
auto start_symbol = grammar.rules.empty()
? make_shared<Symbol>(0, rules::SymbolOptionToken)
: make_shared<Symbol>(0);
ParseItem start_item(rules::START(), start_symbol, 0);
add_parse_state(
item_set_closure(start_item, { rules::END_OF_INPUT() }, grammar));
ParseItem start_item(rules::START(), 0, -2, 0);
add_parse_state(item_set_closure(start_item, { rules::END_OF_INPUT() }, grammar));
while (!item_sets_to_process.empty()) {
auto pair = item_sets_to_process.back();
@ -100,12 +96,13 @@ class ParseTableBuilder {
const ParseItem &item = pair.first;
const set<Symbol> &lookahead_symbols = pair.second;
if (item.is_done()) {
if (item_is_done(item)) {
ParseAction action =
(item.lhs == rules::START())
? ParseAction::Accept()
: ParseAction::Reduce(item.lhs, item.consumed_symbol_count,
item.precedence());
item_precedence(item));
for (const auto &lookahead_sym : lookahead_symbols)
if (should_add_action(state_id, lookahead_sym, action, ParseItemSet()))
@ -170,11 +167,19 @@ class ParseTableBuilder {
for (const auto &pair : item_set) {
const ParseItem &item = pair.first;
if (item.consumed_symbol_count > 0)
result.insert(item.precedence());
result.insert(item_precedence(item));
}
return result;
}
bool item_is_done(const ParseItem &item) {
return item.consumed_symbol_count == grammar.productions(item.lhs)[item.production_index].size();
}
int item_precedence(const ParseItem &item) {
return grammar.productions(item.lhs)[item.production_index].precedence_at(item.consumed_symbol_count - 1);
}
void record_conflict(const Symbol &sym, const ParseAction &left,
const ParseAction &right, const ParseItemSet &item_set) {
conflicts.insert(

View file

@@ -1,68 +0,0 @@
#include "compiler/build_tables/first_symbols.h"
#include "compiler/build_tables/rule_can_be_blank.h"
#include "compiler/syntax_grammar.h"
#include "compiler/rules/choice.h"
#include "compiler/rules/metadata.h"
#include "compiler/rules/seq.h"
#include "compiler/rules/symbol.h"
#include "compiler/rules/visitor.h"
#include "tree_sitter/compiler.h"
namespace tree_sitter {
namespace build_tables {
using std::set;
using rules::Symbol;
class FirstSymbols : public rules::RuleFn<set<Symbol>> {
const SyntaxGrammar *grammar;
set<Symbol> visited_symbols;
public:
explicit FirstSymbols(const SyntaxGrammar *grammar) : grammar(grammar) {}
private:
set<Symbol> apply_to(const Symbol *rule) {
auto insertion_result = visited_symbols.insert(*rule);
if (!insertion_result.second)
return set<Symbol>();
set<Symbol> result({ *rule });
if (!rule->is_token()) {
set<Symbol> &&symbols = apply(grammar->rule(*rule));
result.insert(symbols.begin(), symbols.end());
}
return result;
}
set<Symbol> apply_to(const rules::Metadata *rule) {
return apply(rule->rule);
}
set<Symbol> apply_to(const rules::Choice *rule) {
set<Symbol> result;
for (const auto &element : rule->elements) {
auto &&element_symbols = apply(element);
result.insert(element_symbols.begin(), element_symbols.end());
}
return result;
}
set<Symbol> apply_to(const rules::Seq *rule) {
auto &&result = apply(rule->left);
if (rule_can_be_blank(rule->left, *grammar)) {
auto &&right_symbols = apply(rule->right);
result.insert(right_symbols.begin(), right_symbols.end());
}
return result;
}
};
set<Symbol> first_symbols(const rules::rule_ptr &rule,
const SyntaxGrammar &grammar) {
return FirstSymbols(&grammar).apply(rule);
}
} // namespace build_tables
} // namespace tree_sitter

View file

@@ -1,24 +0,0 @@
#ifndef COMPILER_BUILD_TABLES_FIRST_SYMBOLS_H_
#define COMPILER_BUILD_TABLES_FIRST_SYMBOLS_H_
#include <set>
#include "compiler/rules/symbol.h"
#include "tree_sitter/compiler.h"
namespace tree_sitter {
class SyntaxGrammar;
namespace build_tables {
/*
* Returns the set of symbols that can appear at the beginning of a sentential
* form derivable from a given rule in a given grammar.
*/
std::set<rules::Symbol> first_symbols(const rules::rule_ptr &rule,
const SyntaxGrammar &grammar);
} // namespace build_tables
} // namespace tree_sitter
#endif // COMPILER_BUILD_TABLES_FIRST_SYMBOLS_H_

View file

@@ -3,7 +3,6 @@
#include <vector>
#include <utility>
#include "tree_sitter/compiler.h"
#include "compiler/build_tables/first_symbols.h"
#include "compiler/build_tables/rule_transitions.h"
#include "compiler/build_tables/rule_can_be_blank.h"
#include "compiler/build_tables/item.h"
@ -34,24 +33,41 @@ const ParseItemSet item_set_closure(const ParseItem &starting_item,
size_t previous_size = lookahead_symbols.size();
lookahead_symbols.insert(new_lookahead_symbols.begin(),
new_lookahead_symbols.end());
if (lookahead_symbols.size() == previous_size)
continue;
for (const auto &pair : sym_transitions(item.rule)) {
const Symbol &symbol = pair.first;
const rule_ptr &next_rule = pair.second;
const Production &item_production = grammar.productions(item.lhs)[item.production_index];
if (item_production.size() <= item.consumed_symbol_count)
continue;
if (symbol.is_token() || symbol.is_built_in())
continue;
Symbol symbol = item_production.symbol_at(item.consumed_symbol_count);
if (symbol.is_token() || symbol.is_built_in())
continue;
set<Symbol> next_lookahead_symbols = first_symbols(next_rule, grammar);
if (rule_can_be_blank(next_rule, grammar))
next_lookahead_symbols.insert(lookahead_symbols.begin(),
lookahead_symbols.end());
set<Symbol> next_lookahead_symbols;
if (item.consumed_symbol_count + 1 >= item_production.size()) {
next_lookahead_symbols = lookahead_symbols;
} else {
vector<Symbol> symbols_to_process({ item_production.symbol_at(item.consumed_symbol_count + 1) });
items_to_process.push_back({ ParseItem(symbol, grammar.rule(symbol), 0),
next_lookahead_symbols });
while (!symbols_to_process.empty()) {
Symbol following_symbol = symbols_to_process.back();
symbols_to_process.pop_back();
if (!next_lookahead_symbols.insert(following_symbol).second)
continue;
for (const auto &production : grammar.productions(following_symbol))
symbols_to_process.push_back(production.symbol_at(0));
}
}
size_t i = 0;
for (const Production &production : grammar.productions(symbol)) {
items_to_process.push_back({
ParseItem(symbol, i, production.rule_id_at(0), 0),
next_lookahead_symbols
});
i++;
}
}

View file

@ -21,18 +21,30 @@ map<Symbol, ParseItemSet> sym_transitions(const ParseItemSet &item_set,
for (const auto &pair : item_set) {
const ParseItem &item = pair.first;
const set<Symbol> &lookahead_symbols = pair.second;
for (auto &transition : sym_transitions(item.rule)) {
ParseItem new_item(item.lhs, transition.second,
item.consumed_symbol_count + 1);
merge_sym_transition<ParseItemSet>(
&result, { transition.first,
item_set_closure(new_item, lookahead_symbols, grammar) },
[](ParseItemSet *left, const ParseItemSet *right) {
for (auto &pair : *right)
left->operator[](pair.first)
.insert(pair.second.begin(), pair.second.end());
});
}
const auto &productions = grammar.productions(item.lhs);
if (productions.empty())
continue;
const Production &production = grammar.productions(item.lhs)[item.production_index];
if (production.size() <= item.consumed_symbol_count)
continue;
const Symbol &symbol = production.symbol_at(item.consumed_symbol_count);
ParseItem new_item(
item.lhs,
item.production_index,
production.rule_id_at(item.consumed_symbol_count + 1),
item.consumed_symbol_count + 1
);
merge_sym_transition<ParseItemSet>(
&result,
{ symbol, item_set_closure(new_item, { lookahead_symbols }, grammar) },
[](ParseItemSet *left, const ParseItemSet *right) {
for (auto &pair : *right)
left->operator[](pair.first)
.insert(pair.second.begin(), pair.second.end());
});
}
return result;
}

View file

@ -1,25 +1,32 @@
#include "compiler/build_tables/parse_item.h"
#include "compiler/syntax_grammar.h"
#include "tree_sitter/compiler.h"
namespace tree_sitter {
namespace build_tables {
using std::string;
using std::to_string;
using std::ostream;
ParseItem::ParseItem(const rules::Symbol &lhs, const rules::rule_ptr rule,
size_t consumed_symbol_count)
: Item(lhs, rule), consumed_symbol_count(consumed_symbol_count) {}
ParseItem::ParseItem(const rules::Symbol &lhs, size_t production_index,
int rule_id, size_t consumed_symbol_count)
: lhs(lhs), production_index(production_index),
rule_id(rule_id),
consumed_symbol_count(consumed_symbol_count) {}
bool ParseItem::operator==(const ParseItem &other) const {
return (lhs == other.lhs) &&
(consumed_symbol_count == other.consumed_symbol_count) &&
(rule == other.rule || rule->operator==(*other.rule));
(rule_id == other.rule_id) &&
(consumed_symbol_count == other.consumed_symbol_count);
}
ostream &operator<<(ostream &stream, const ParseItem &item) {
return stream << string("(item ") << item.lhs << string(" ") << *item.rule
<< string(")");
return stream << string("(item lhs:") << item.lhs
<< string(" index:") << to_string(item.production_index)
<< string(" remaining_rule:") << to_string(item.rule_id)
<< string(" consumed:") << to_string(item.consumed_symbol_count)
<< string(")");
}
} // namespace build_tables

View file

@ -9,11 +9,15 @@
namespace tree_sitter {
namespace build_tables {
class ParseItem : public Item {
class ParseItem {
public:
ParseItem(const rules::Symbol &lhs, rules::rule_ptr rule,
const size_t consumed_symbol_count);
ParseItem(const rules::Symbol &lhs, size_t production_index,
int rule_id, size_t consumed_symbol_count);
bool operator==(const ParseItem &other) const;
rules::Symbol lhs;
size_t production_index;
int rule_id;
size_t consumed_symbol_count;
};
@ -30,8 +34,8 @@ template <>
struct hash<tree_sitter::build_tables::ParseItem> {
size_t operator()(const tree_sitter::build_tables::ParseItem &item) const {
return hash<tree_sitter::rules::Symbol>()(item.lhs) ^
hash<tree_sitter::rules::rule_ptr>()(item.rule) ^
hash<size_t>()(item.consumed_symbol_count);
hash<int>()(item.rule_id) ^
hash<size_t>()(item.consumed_symbol_count);
}
};

View file

@ -1,7 +1,5 @@
#include "compiler/build_tables/rule_can_be_blank.h"
#include <set>
#include "tree_sitter/compiler.h"
#include "compiler/syntax_grammar.h"
#include "compiler/rules/symbol.h"
#include "compiler/rules/visitor.h"
#include "compiler/rules/seq.h"
@ -12,8 +10,6 @@
namespace tree_sitter {
namespace build_tables {
using std::set;
class CanBeBlank : public rules::RuleFn<bool> {
protected:
bool apply_to(const rules::Blank *) { return true; }
@ -34,36 +30,9 @@ class CanBeBlank : public rules::RuleFn<bool> {
bool apply_to(const rules::Metadata *rule) { return apply(rule->rule); }
};
class CanBeBlankRecursive : public CanBeBlank {
const SyntaxGrammar *grammar;
set<rules::Symbol> visited_symbols;
using CanBeBlank::visit;
public:
explicit CanBeBlankRecursive(const SyntaxGrammar *grammar)
: grammar(grammar) {}
private:
using CanBeBlank::apply_to;
bool apply_to(const rules::Symbol *rule) {
if (visited_symbols.find(*rule) == visited_symbols.end()) {
visited_symbols.insert(*rule);
return !rule->is_token() && apply(grammar->rule(*rule));
} else {
return false;
}
}
};
bool rule_can_be_blank(const rules::rule_ptr &rule) {
return CanBeBlank().apply(rule);
}
bool rule_can_be_blank(const rules::rule_ptr &rule,
const SyntaxGrammar &grammar) {
return CanBeBlankRecursive(&grammar).apply(rule);
}
} // namespace build_tables
} // namespace tree_sitter

View file

@ -4,14 +4,9 @@
#include "tree_sitter/compiler.h"
namespace tree_sitter {
class SyntaxGrammar;
namespace build_tables {
bool rule_can_be_blank(const rules::rule_ptr &rule);
bool rule_can_be_blank(const rules::rule_ptr &rule,
const SyntaxGrammar &grammar);
} // namespace build_tables
} // namespace tree_sitter

View file

@ -2,7 +2,7 @@
#include <vector>
#include <string>
#include <utility>
#include "compiler/syntax_grammar.h"
#include "compiler/prepare_grammar/initial_syntax_grammar.h"
#include "compiler/rules/visitor.h"
#include "compiler/rules/seq.h"
#include "compiler/rules/symbol.h"
@ -50,7 +50,7 @@ class ExpandRepeats : public rules::IdentityRuleFn {
vector<pair<string, rules::rule_ptr>> aux_rules;
};
SyntaxGrammar expand_repeats(const SyntaxGrammar &grammar) {
InitialSyntaxGrammar expand_repeats(const InitialSyntaxGrammar &grammar) {
vector<pair<string, rules::rule_ptr>> rules, aux_rules(grammar.aux_rules);
for (auto &pair : grammar.rules) {
@ -60,7 +60,7 @@ SyntaxGrammar expand_repeats(const SyntaxGrammar &grammar) {
expander.aux_rules.end());
}
return SyntaxGrammar(rules, aux_rules, grammar.ubiquitous_tokens);
return InitialSyntaxGrammar(rules, aux_rules, grammar.ubiquitous_tokens);
}
} // namespace prepare_grammar

View file

@ -4,12 +4,11 @@
#include "tree_sitter/compiler.h"
namespace tree_sitter {
class SyntaxGrammar;
namespace prepare_grammar {
SyntaxGrammar expand_repeats(const SyntaxGrammar &);
class InitialSyntaxGrammar;
InitialSyntaxGrammar expand_repeats(const InitialSyntaxGrammar &);
} // namespace prepare_grammar
} // namespace tree_sitter

View file

@@ -0,0 +1,58 @@
#include "compiler/prepare_grammar/extract_choices.h"
#include <vector>
#include <memory>
#include "compiler/rules/visitor.h"
#include "compiler/rules/seq.h"
#include "compiler/rules/choice.h"
#include "compiler/rules/metadata.h"
#include "compiler/rules/repeat.h"
namespace tree_sitter {
namespace prepare_grammar {
using std::make_shared;
using std::vector;
using rules::rule_ptr;
// Rewrites a rule tree into the list of choice-free alternatives it denotes,
// by distributing every Choice node over Seq, Metadata, and Repeat.
class ExtractChoices : public rules::RuleFn<vector<rule_ptr>> {
  // A rule with no Choice beneath it expands to just itself.
  vector<rule_ptr> default_apply(const rules::Rule *rule) {
    return vector<rule_ptr>({ rule->copy() });
  }

  // Cartesian product: every left alternative pairs with every right one.
  vector<rule_ptr> apply_to(const rules::Seq *rule) {
    vector<rule_ptr> alternatives;
    for (const auto &lhs : apply(rule->left))
      for (const auto &rhs : apply(rule->right))
        alternatives.push_back(rules::Seq::build({ lhs, rhs }));
    return alternatives;
  }

  // Each alternative of the inner rule keeps the same metadata value.
  vector<rule_ptr> apply_to(const rules::Metadata *rule) {
    vector<rule_ptr> alternatives;
    for (const auto &inner : apply(rule->rule))
      alternatives.push_back(make_shared<rules::Metadata>(inner, rule->value));
    return alternatives;
  }

  // A Choice contributes the concatenation of its elements' alternatives.
  vector<rule_ptr> apply_to(const rules::Choice *rule) {
    vector<rule_ptr> alternatives;
    for (const auto &element : rule->elements) {
      auto expanded = apply(element);
      alternatives.insert(alternatives.end(), expanded.begin(), expanded.end());
    }
    return alternatives;
  }

  // Repeats are preserved; only their content is expanded.
  vector<rule_ptr> apply_to(const rules::Repeat *rule) {
    vector<rule_ptr> alternatives;
    for (const auto &inner : apply(rule->content))
      alternatives.push_back(make_shared<rules::Repeat>(inner));
    return alternatives;
  }
};
// Public entry point: expand `rule` into its flat, choice-free alternatives.
std::vector<rules::rule_ptr> extract_choices(const rules::rule_ptr &rule) {
  ExtractChoices extractor;
  return extractor.apply(rule);
}
} // namespace prepare_grammar
} // namespace tree_sitter

View file

@ -0,0 +1,15 @@
#ifndef COMPILER_PREPARE_GRAMMAR_EXTRACT_CHOICES_H_
#define COMPILER_PREPARE_GRAMMAR_EXTRACT_CHOICES_H_
#include <vector>
#include "tree_sitter/compiler.h"
namespace tree_sitter {
namespace prepare_grammar {
std::vector<rules::rule_ptr> extract_choices(const rules::rule_ptr &);
} // namespace prepare_grammar
} // namespace tree_sitter
#endif // COMPILER_PREPARE_GRAMMAR_EXTRACT_CHOICES_H_

View file

@ -5,7 +5,7 @@
#include <string>
#include "tree_sitter/compiler.h"
#include "compiler/lexical_grammar.h"
#include "compiler/syntax_grammar.h"
#include "compiler/prepare_grammar/initial_syntax_grammar.h"
#include "compiler/rules/visitor.h"
#include "compiler/rules/symbol.h"
#include "compiler/rules/string.h"
@ -92,14 +92,14 @@ class TokenExtractor : public rules::IdentityRuleFn {
vector<pair<string, rule_ptr>> tokens;
};
static tuple<SyntaxGrammar, LexicalGrammar, const GrammarError *> ubiq_token_err(
static tuple<InitialSyntaxGrammar, LexicalGrammar, const GrammarError *> ubiq_token_err(
const string &msg) {
return make_tuple(SyntaxGrammar(), LexicalGrammar(),
return make_tuple(InitialSyntaxGrammar(), LexicalGrammar(),
new GrammarError(GrammarErrorTypeInvalidUbiquitousToken,
"Not a token: " + msg));
}
tuple<SyntaxGrammar, LexicalGrammar, const GrammarError *> extract_tokens(
tuple<InitialSyntaxGrammar, LexicalGrammar, const GrammarError *> extract_tokens(
const Grammar &grammar) {
vector<pair<string, rule_ptr>> rules, tokens;
vector<rule_ptr> separators;
@ -139,7 +139,7 @@ tuple<SyntaxGrammar, LexicalGrammar, const GrammarError *> extract_tokens(
}
}
return make_tuple(SyntaxGrammar(rules, {}, ubiquitous_tokens),
return make_tuple(InitialSyntaxGrammar(rules, {}, ubiquitous_tokens),
LexicalGrammar(tokens, extractor.tokens, separators),
nullptr);
}

View file

@ -7,12 +7,13 @@
namespace tree_sitter {
class Grammar;
class SyntaxGrammar;
class LexicalGrammar;
namespace prepare_grammar {
std::tuple<SyntaxGrammar, LexicalGrammar, const GrammarError *> extract_tokens(
class InitialSyntaxGrammar;
std::tuple<InitialSyntaxGrammar, LexicalGrammar, const GrammarError *> extract_tokens(
const Grammar &);
} // namespace prepare_grammar

View file

@@ -0,0 +1,145 @@
#include "compiler/prepare_grammar/flatten_grammar.h"
#include "compiler/prepare_grammar/extract_choices.h"
#include "compiler/prepare_grammar/initial_syntax_grammar.h"
#include "compiler/rules/visitor.h"
#include "compiler/rules/seq.h"
#include "compiler/rules/symbol.h"
#include "compiler/rules/metadata.h"
#include <string>
#include <algorithm>
namespace tree_sitter {
namespace prepare_grammar {
using std::find;
using std::pair;
using std::string;
using std::vector;
using rules::rule_ptr;
// Walks one rule tree left-to-right (choices are expected to have already
// been removed by extract_choices) and accumulates its symbols, with their
// effective precedence values, into the flat `entries` list.
class FlattenRule : public rules::RuleFn<void> {
 public:
  // A precedence annotation is not pushed onto the stack until the first
  // symbol inside it has been recorded; these two hold the deferred value.
  bool has_pending_precedence;
  int pending_precedence;
  // Innermost active precedence is at the back.
  vector<int> precedence_stack;
  vector<ProductionEntry> entries;

  FlattenRule() : has_pending_precedence(false), pending_precedence(0) {}

  void apply_to(const rules::Symbol *sym) {
    // rule_id is filled in later by assign_rule_ids; 0 is a placeholder.
    entries.push_back({ *sym, current_precedence(), 0 });
    // The deferred push happens after the entry is recorded, so an
    // annotation's precedence takes effect at positions after its first
    // symbol, not at the first symbol itself.
    if (has_pending_precedence) {
      precedence_stack.push_back(pending_precedence);
      has_pending_precedence = false;
    }
  }

  void apply_to(const rules::Metadata *metadata) {
    int precedence = metadata->value_for(rules::PRECEDENCE);
    if (precedence != 0) {
      pending_precedence = precedence;
      has_pending_precedence = true;
      apply(metadata->rule);
      // NOTE(review): assumes the annotated subtree contained at least one
      // Symbol (which performed the deferred push above) — otherwise this
      // pop_back underflows. Confirm upstream guarantees.
      precedence_stack.pop_back();
    } else {
      apply(metadata->rule);
    }
  }

  void apply_to(const rules::Seq *seq) {
    apply(seq->left);
    apply(seq->right);
  }

 private:
  // Precedence currently in effect; 0 when no annotation is active.
  int current_precedence() {
    if (precedence_stack.empty())
      return 0;
    else
      return *precedence_stack.rbegin();
  }
};
// Flattens one choice-free rule tree into a Production. Its end_rule_id is
// provisionally 0; assign_rule_ids assigns the real ids afterwards.
Production flatten_rule(const rule_ptr &rule) {
  FlattenRule visitor;
  visitor.apply(rule);
  return Production(visitor.entries, 0);
}
struct ProductionSlice {
vector<ProductionEntry>::const_iterator start;
vector<ProductionEntry>::const_iterator end;
int end_precedence;
bool operator==(const ProductionSlice &other) const {
if (end_precedence != other.end_precedence) return false;
if (end - start != other.end - other.start) return false;
for (auto iter1 = start, iter2 = other.start; iter1 != end; ++iter1, ++iter2)
if (!(iter1->symbol == iter2->symbol) || iter1->precedence != iter2->precedence)
return false;
return true;
}
};
// Assigns a rule id to every position in `production`: one per entry, plus
// `end_rule_id` for the completed production. Two positions (in any
// production) receive the same id exactly when the slice that follows them
// compares equal, so parse items can share ids across productions.
// `unique_slices` accumulates the distinct slices seen so far across calls.
//
// Fixes: the original mixed `cbegin()/cend()` in the find with a non-const
// `end()` in the comparison, and duplicated the find-or-append logic twice;
// both lookups now go through one helper using const iterators consistently.
void assign_rule_ids(Production *production, vector<ProductionSlice> *unique_slices) {
  // Returns the index of an equal slice already recorded, appending `slice`
  // first when it is new.
  auto id_for_slice = [&](const ProductionSlice &slice) -> int {
    auto existing = std::find(unique_slices->cbegin(), unique_slices->cend(), slice);
    if (existing == unique_slices->cend()) {
      unique_slices->push_back(slice);
      return static_cast<int>(unique_slices->size() - 1);
    }
    return static_cast<int>(existing - unique_slices->cbegin());
  };

  auto &entries = production->entries;
  auto end = entries.end();
  for (auto iter = entries.begin(); iter != end; ++iter)
    iter->rule_id = id_for_slice(ProductionSlice{ iter, end, 0 });

  // The empty suffix at the end of the production is distinguished by the
  // production's final precedence value.
  // NOTE(review): mid-production slices use end_precedence 0 while this final
  // slice uses the last entry's precedence — confirm the asymmetry is intended.
  production->end_rule_id = id_for_slice(ProductionSlice{
      end, end, production->precedence_at(production->size() - 1) });
}
// Converts a tree-shaped InitialSyntaxGrammar into a SyntaxGrammar whose
// rules are flat lists of Productions, one production per choice branch.
SyntaxGrammar flatten_grammar(const InitialSyntaxGrammar &grammar) {
  vector<pair<string, vector<Production>>> rules, aux_rules;

  // Expand each named rule: one flat production per branch of its choices.
  for (const auto &entry : grammar.rules) {
    vector<Production> expanded;
    for (const auto &branch : extract_choices(entry.second))
      expanded.push_back(flatten_rule(branch));
    rules.push_back({ entry.first, expanded });
  }

  // Same treatment for auxiliary (generated) rules.
  for (const auto &entry : grammar.aux_rules) {
    vector<Production> expanded;
    for (const auto &branch : extract_choices(entry.second))
      expanded.push_back(flatten_rule(branch));
    aux_rules.push_back({ entry.first, expanded });
  }

  // An empty grammar still gets a START rule over the first token symbol.
  if (rules.empty()) {
    rules.push_back({
      "START",
      { Production({ {rules::Symbol(0, rules::SymbolOptionToken), 0, 0} }, 0) }
    });
  }

  // Assign shared rule ids across all productions, named and auxiliary.
  vector<ProductionSlice> unique_slices;
  for (auto &entry : rules)
    for (Production &production : entry.second)
      assign_rule_ids(&production, &unique_slices);
  for (auto &entry : aux_rules)
    for (Production &production : entry.second)
      assign_rule_ids(&production, &unique_slices);

  return SyntaxGrammar(rules, aux_rules, grammar.ubiquitous_tokens);
}
} // namespace prepare_grammar
} // namespace tree_sitter

View file

@ -0,0 +1,13 @@
#include <string>
#include "tree_sitter/compiler.h"
#include "compiler/syntax_grammar.h"
namespace tree_sitter {
namespace prepare_grammar {
class InitialSyntaxGrammar;
SyntaxGrammar flatten_grammar(const InitialSyntaxGrammar &);
} // namespace prepare_grammar
} // namespace tree_sitter

View file

@ -0,0 +1,37 @@
#include "compiler/prepare_grammar/initial_syntax_grammar.h"
#include <vector>
#include <string>
#include <utility>
#include "compiler/rules/symbol.h"
namespace tree_sitter {
namespace prepare_grammar {
using std::string;
using std::pair;
using std::vector;
using std::set;
// InitialSyntaxGrammar stores each rule as a tree (rule_ptr); it is the
// intermediate form before flatten_grammar converts rules into Productions.
InitialSyntaxGrammar::InitialSyntaxGrammar() {}

InitialSyntaxGrammar::InitialSyntaxGrammar(const vector<pair<string, rules::rule_ptr>> &rules,
                                           const vector<pair<string, rules::rule_ptr>> &aux_rules)
    : rules(rules), aux_rules(aux_rules) {}

InitialSyntaxGrammar::InitialSyntaxGrammar(const vector<pair<string, rules::rule_ptr>> &rules,
                                           const vector<pair<string, rules::rule_ptr>> &aux_rules,
                                           const set<rules::Symbol> &ubiquitous_tokens)
    : rules(rules), aux_rules(aux_rules), ubiquitous_tokens(ubiquitous_tokens) {}

// Looks up the rule tree for `symbol`, in the auxiliary or named rule list
// depending on the symbol's kind.
const rules::rule_ptr &InitialSyntaxGrammar::rule(const rules::Symbol &symbol) const {
  return symbol.is_auxiliary() ? aux_rules[symbol.index].second
                               : rules[symbol.index].second;
}

// Looks up the display name for `symbol` in the same way.
const string &InitialSyntaxGrammar::rule_name(const rules::Symbol &symbol) const {
  return symbol.is_auxiliary() ? aux_rules[symbol.index].first
                               : rules[symbol.index].first;
}
} // namespace prepare_grammar
} // namespace tree_sitter

View file

@ -0,0 +1,36 @@
#ifndef COMPILER_INITIAL_SYNTAX_GRAMMAR_H_
#define COMPILER_INITIAL_SYNTAX_GRAMMAR_H_
#include <vector>
#include <string>
#include <set>
#include <utility>
#include "tree_sitter/compiler.h"
#include "compiler/rules/symbol.h"
namespace tree_sitter {
namespace prepare_grammar {
// Grammar form produced by extract_tokens and expand_repeats: rules are still
// full rule trees (rule_ptr), prior to being flattened into Productions.
class InitialSyntaxGrammar {
 public:
  InitialSyntaxGrammar();
  InitialSyntaxGrammar(
      const std::vector<std::pair<std::string, rules::rule_ptr>> &rules,
      const std::vector<std::pair<std::string, rules::rule_ptr>> &aux_rules);
  InitialSyntaxGrammar(
      const std::vector<std::pair<std::string, rules::rule_ptr>> &rules,
      const std::vector<std::pair<std::string, rules::rule_ptr>> &aux_rules,
      const std::set<rules::Symbol> &ubiquitous_tokens);

  // Name / rule-tree lookup for a (possibly auxiliary) non-terminal symbol.
  const std::string &rule_name(const rules::Symbol &symbol) const;
  const rules::rule_ptr &rule(const rules::Symbol &symbol) const;

  // Named rules and generated auxiliary rules, indexed by Symbol::index.
  const std::vector<std::pair<std::string, rules::rule_ptr>> rules;
  const std::vector<std::pair<std::string, rules::rule_ptr>> aux_rules;
  std::set<rules::Symbol> ubiquitous_tokens;
};
} // namespace prepare_grammar
} // namespace tree_sitter
#endif // COMPILER_INITIAL_SYNTAX_GRAMMAR_H_

View file

@ -1,9 +1,11 @@
#include "compiler/prepare_grammar/prepare_grammar.h"
#include "compiler/prepare_grammar/expand_repeats.h"
#include "compiler/prepare_grammar/expand_tokens.h"
#include "compiler/prepare_grammar/extract_tokens.h"
#include "compiler/prepare_grammar/intern_symbols.h"
#include "compiler/prepare_grammar/prepare_grammar.h"
#include "compiler/prepare_grammar/flatten_grammar.h"
#include "compiler/lexical_grammar.h"
#include "compiler/prepare_grammar/initial_syntax_grammar.h"
#include "compiler/syntax_grammar.h"
namespace tree_sitter {
@ -29,7 +31,7 @@ tuple<SyntaxGrammar, LexicalGrammar, const GrammarError *> prepare_grammar(
return make_tuple(SyntaxGrammar(), LexicalGrammar(), error);
// Replace `Repeat` rules with pairs of recursive rules
const SyntaxGrammar &syntax_grammar = expand_repeats(get<0>(extract_result));
const InitialSyntaxGrammar &syntax_grammar = expand_repeats(get<0>(extract_result));
// Expand `String` and `Pattern` rules into full rule trees
auto expand_tokens_result = expand_tokens(get<1>(extract_result));
@ -38,7 +40,7 @@ tuple<SyntaxGrammar, LexicalGrammar, const GrammarError *> prepare_grammar(
if (error)
return make_tuple(SyntaxGrammar(), LexicalGrammar(), error);
return make_tuple(syntax_grammar, lex_grammar, nullptr);
return make_tuple(flatten_grammar(syntax_grammar), lex_grammar, nullptr);
}
} // namespace prepare_grammar

View file

@@ -3,33 +3,92 @@
#include <string>
#include <utility>
#include "compiler/rules/symbol.h"
#include "compiler/rules/built_in_symbols.h"
namespace tree_sitter {
using std::string;
using std::to_string;
using std::pair;
using std::vector;
using std::set;
static const vector<Production> START_PRODUCTIONS({
Production({ {rules::Symbol(0), 0, -1} }, 2)
});
static const vector<Production> NO_PRODUCTIONS({});
// Full equality: symbol, position precedence, and assigned rule id must all
// agree (unlike ProductionSlice comparison, which ignores rule ids).
bool ProductionEntry::operator==(const ProductionEntry &other) const {
  return symbol == other.symbol && precedence == other.precedence &&
         rule_id == other.rule_id;
}

// `last_rule_id` identifies the empty suffix after the final entry.
Production::Production(const vector<ProductionEntry> &entries, int last_rule_id) :
    entries(entries), end_rule_id(last_rule_id) {}
// Precedence in effect at position `index`; out-of-range positions carry
// no precedence (0).
int Production::precedence_at(size_t index) const {
  return index < size() ? entries[index].precedence : 0;
}
// Rule id at position `index`; every position past the last entry maps to
// the production's end rule id.
int Production::rule_id_at(size_t index) const {
  return index < size() ? entries[index].rule_id : end_rule_id;
}
// Symbol at position `index`. Unlike precedence_at/rule_id_at there is no
// bounds check; callers must pass index < size().
const rules::Symbol &Production::symbol_at(size_t index) const {
  return entries[index].symbol;
}

// Number of symbols in this production.
size_t Production::size() const {
  return entries.size();
}
SyntaxGrammar::SyntaxGrammar() {}
SyntaxGrammar::SyntaxGrammar(const vector<pair<string, rules::rule_ptr>> &rules,
const vector<pair<string, rules::rule_ptr>> &aux_rules)
: rules(rules), aux_rules(aux_rules) {}
SyntaxGrammar::SyntaxGrammar(const vector<pair<string, rules::rule_ptr>> &rules,
const vector<pair<string, rules::rule_ptr>> &aux_rules,
const set<rules::Symbol> &ubiquitous_tokens)
SyntaxGrammar::SyntaxGrammar(
const vector<pair<string, vector<Production>>> &rules,
const vector<pair<string, vector<Production>>> &aux_rules,
const set<rules::Symbol> &ubiquitous_tokens)
: rules(rules), aux_rules(aux_rules), ubiquitous_tokens(ubiquitous_tokens) {}
const rules::rule_ptr &SyntaxGrammar::rule(const rules::Symbol &symbol) const {
return symbol.is_auxiliary() ? aux_rules[symbol.index].second
: rules[symbol.index].second;
}
const string &SyntaxGrammar::rule_name(const rules::Symbol &symbol) const {
return symbol.is_auxiliary() ? aux_rules[symbol.index].first
: rules[symbol.index].first;
}
// Returns the flat productions for `symbol`: START maps to a single synthetic
// production, tokens and built-in symbols have none, and all other symbols
// are looked up in the auxiliary or named rule list by index.
const vector<Production> &SyntaxGrammar::productions(const rules::Symbol &symbol) const {
  if (symbol == rules::START())
    return START_PRODUCTIONS;
  if (symbol.is_built_in() || symbol.is_token())
    return NO_PRODUCTIONS;
  if (symbol.is_auxiliary())
    return aux_rules[symbol.index].second;
  else
    return rules[symbol.index].second;
}
// Debug formatting for a single production entry.
std::ostream &operator<<(std::ostream &stream, const ProductionEntry &entry) {
  stream << "(entry symbol:" << entry.symbol;
  stream << " precedence: " << entry.precedence;
  stream << " id: " << entry.rule_id;
  return stream << ")";
}
// Debug formatting for a whole production: its entries, space-separated,
// followed by its end rule id.
std::ostream &operator<<(std::ostream &stream, const Production &production) {
  stream << "(production entries: (";
  const char *separator = "";
  for (const auto &entry : production.entries) {
    stream << separator << entry;
    separator = " ";
  }
  return stream << ") end_rule_id: " << production.end_rule_id << ")";
}
} // namespace tree_sitter

View file

@@ -10,22 +10,41 @@
namespace tree_sitter {
// One symbol within a flattened production: the symbol itself, the precedence
// that applies at its position, and an id identifying the remainder of the
// production that follows it (shared across productions with equal suffixes).
struct ProductionEntry {
  rules::Symbol symbol;
  int precedence;
  int rule_id;

  bool operator==(const ProductionEntry &) const;
};
// A single flat alternative for a non-terminal: an ordered list of entries
// plus the rule id of the empty suffix at the end of the production.
class Production {
 public:
  std::vector<ProductionEntry> entries;
  int end_rule_id;

  Production(const std::vector<ProductionEntry> &, int);
  // Number of symbols.
  size_t size() const;
  // Unchecked access to the symbol at a position.
  const rules::Symbol &symbol_at(size_t) const;
  // Precedence / rule id at a position; both tolerate out-of-range indices.
  int precedence_at(size_t) const;
  int rule_id_at(size_t) const;
};
std::ostream &operator<<(std::ostream &, const ProductionEntry &);
std::ostream &operator<<(std::ostream &, const Production &);
class SyntaxGrammar {
public:
SyntaxGrammar();
SyntaxGrammar(
const std::vector<std::pair<std::string, rules::rule_ptr>> &rules,
const std::vector<std::pair<std::string, rules::rule_ptr>> &aux_rules);
SyntaxGrammar(
const std::vector<std::pair<std::string, rules::rule_ptr>> &rules,
const std::vector<std::pair<std::string, rules::rule_ptr>> &aux_rules,
const std::vector<std::pair<std::string, std::vector<Production>>> &rules,
const std::vector<std::pair<std::string, std::vector<Production>>> &aux_rules,
const std::set<rules::Symbol> &ubiquitous_tokens);
const std::string &rule_name(const rules::Symbol &symbol) const;
const rules::rule_ptr &rule(const rules::Symbol &symbol) const;
const std::vector<std::pair<std::string, rules::rule_ptr>> rules;
const std::vector<std::pair<std::string, rules::rule_ptr>> aux_rules;
const std::vector<Production> &productions(const rules::Symbol &) const;
std::vector<std::pair<std::string, std::vector<Production>>> rules;
std::vector<std::pair<std::string, std::vector<Production>>> aux_rules;
std::set<rules::Symbol> ubiquitous_tokens;
};