Store ParseItemSets as maps, with core items as keys and lookahead symbol sets as values

ParseItem no longer has a lookahead_sym field; it now represents only
the 'core' of a parse item. The lookahead context is stored separately,
as a set of symbols per core item. This makes iterating, copying, and
merging item sets more efficient: previously, each core item was repeated
once for every distinct lookahead symbol.

Also, the memoization in sym_transitions(ParseItemSet) (the per-rule
transitions_cache) has been removed for now. It may be added back later.
This commit is contained in:
Max Brunsfeld 2014-06-16 08:35:20 -07:00
parent d203c15911
commit 7a2c2c1c90
13 changed files with 121 additions and 147 deletions

View file

@ -19,9 +19,9 @@ describe("computing FOLLOW sets", []() {
ParseItem item(Symbol(2), choice({
seq({ i_sym(0), choice({ i_token(0), i_token(1) }) }),
seq({ i_sym(1), i_token(2) }),
}), 0, Symbol(10, SymbolOptionToken));
}), 0);
AssertThat(follow_sets(item, grammar), Equals(map<Symbol, set<Symbol>>({
AssertThat(follow_sets(item, { Symbol(10, SymbolOptionToken) }, grammar), Equals(map<Symbol, set<Symbol>>({
{ Symbol(0), set<Symbol>({
Symbol(0, SymbolOptionToken),
Symbol(1, SymbolOptionToken) }) },
@ -34,21 +34,21 @@ describe("computing FOLLOW sets", []() {
ParseItem item(Symbol(2), choice({
seq({ i_sym(0), choice({ i_token(0), i_token(1) }) }),
seq({ i_token(2), i_token(3) }),
}), 0, Symbol(10, SymbolOptionToken));
}), 0);
AssertThat(follow_sets(item, grammar), Equals(map<Symbol, set<Symbol>>({
AssertThat(follow_sets(item, { Symbol(10, SymbolOptionToken) }, grammar), Equals(map<Symbol, set<Symbol>>({
{ Symbol(0), set<Symbol>({
Symbol(0, SymbolOptionToken),
Symbol(1, SymbolOptionToken) }) },
})));
});
it("includes the item's lookahead terminal if the rule after the non-terminal might be blank", [&]() {
it("includes the item's lookahead symbol if the rule after the non-terminal might be blank", [&]() {
ParseItem item(Symbol(2), choice({
seq({ i_sym(0), choice({ i_token(0), blank() }) }),
}), 0, Symbol(10, SymbolOptionToken));
}), 0);
AssertThat(follow_sets(item, grammar), Equals(map<Symbol, set<Symbol>>({
AssertThat(follow_sets(item, { Symbol(10, SymbolOptionToken) }, grammar), Equals(map<Symbol, set<Symbol>>({
{ Symbol(0), set<Symbol>({
Symbol(0, SymbolOptionToken),
Symbol(10, SymbolOptionToken) }) },

View file

@ -19,13 +19,13 @@ describe("computing closures of item sets", []() {
}, {});
it("adds items at the beginnings of referenced rules", [&]() {
ParseItemSet item_set = item_set_closure(
ParseItem(Symbol(0), grammar.rule(Symbol(0)), 0, Symbol(10, SymbolOptionToken)),
grammar);
ParseItemSet item_set = item_set_closure(ParseItem(Symbol(0), grammar.rule(Symbol(0)), 0),
{ Symbol(10, SymbolOptionToken) },
grammar);
AssertThat(item_set, Equals(ParseItemSet({
ParseItem(Symbol(1), grammar.rule(Symbol(1)), 0, Symbol(11, SymbolOptionToken)),
ParseItem(Symbol(0), grammar.rule(Symbol(0)), 0, Symbol(10, SymbolOptionToken)),
{ ParseItem(Symbol(1), grammar.rule(Symbol(1)), 0), { Symbol(11, SymbolOptionToken) } },
{ ParseItem(Symbol(0), grammar.rule(Symbol(0)), 0), { Symbol(10, SymbolOptionToken) } },
})));
});
});

View file

@ -37,15 +37,15 @@ describe("syntactic item set transitions", [&]() {
it("computes the closure of the new item sets", [&]() {
ParseItemSet set1({
ParseItem(Symbol(0), seq({ i_token(22), i_sym(1) }), 3, Symbol(23, SymbolOptionToken)),
{ ParseItem(Symbol(0), seq({ i_token(22), i_sym(1) }), 3), { Symbol(23, SymbolOptionToken) } },
});
SymTransitions sym_transitions;
AssertThat(sym_transitions(set1, grammar), Equals(map<Symbol, ParseItemSet>({
{ Symbol(22, SymbolOptionToken), ParseItemSet({
ParseItem(Symbol(0), i_sym(1), 4, Symbol(23, SymbolOptionToken)),
ParseItem(Symbol(1), i_token(21), 0, Symbol(23, SymbolOptionToken))
{ ParseItem(Symbol(0), i_sym(1), 4), { Symbol(23, SymbolOptionToken) } },
{ ParseItem(Symbol(1), i_token(21), 0), { Symbol(23, SymbolOptionToken) } },
}) },
})));
});

View file

@ -46,14 +46,14 @@ namespace tree_sitter {
void add_shift_actions(const ParseItemSet &item_set, ParseStateId state_id) {
for (const auto &transition : sym_transitions(item_set, grammar)) {
const Symbol &symbol = transition.first;
const ParseItemSet &item_set = transition.second;
const ParseItemSet &next_item_set = transition.second;
auto &actions = parse_table.states[state_id].actions;
auto current_action = actions.find(symbol);
set<int> precedence_values = precedence_values_for_item_set(item_set);
set<int> precedence_values = precedence_values_for_item_set(next_item_set);
if (current_action == actions.end() ||
conflict_manager.resolve_parse_action(symbol, current_action->second, ParseAction::Shift(0, precedence_values))) {
ParseStateId new_state_id = add_parse_state(item_set);
ParseStateId new_state_id = add_parse_state(next_item_set);
parse_table.add_action(state_id, symbol, ParseAction::Shift(new_state_id, precedence_values));
}
}
@ -68,17 +68,22 @@ namespace tree_sitter {
}
void add_reduce_actions(const ParseItemSet &item_set, ParseStateId state_id) {
for (const ParseItem &item : item_set) {
for (const auto &pair : item_set) {
const ParseItem &item = pair.first;
const set<Symbol> &lookahead_symbols = pair.second;
if (item.is_done()) {
ParseAction action = (item.lhs == rules::START()) ?
ParseAction::Accept() :
ParseAction::Reduce(item.lhs, item.consumed_symbol_count, item.precedence());
auto current_actions = parse_table.states[state_id].actions;
auto current_action = current_actions.find(item.lookahead_sym);
if (current_action == current_actions.end() ||
conflict_manager.resolve_parse_action(item.lookahead_sym, current_action->second, action)) {
parse_table.add_action(state_id, item.lookahead_sym, action);
for (auto &lookahead_sym : lookahead_symbols) {
auto current_actions = parse_table.states[state_id].actions;
auto current_action = current_actions.find(lookahead_sym);
if (current_action == current_actions.end() ||
conflict_manager.resolve_parse_action(lookahead_sym, current_action->second, action)) {
parse_table.add_action(state_id, lookahead_sym, action);
}
}
}
}
@ -86,9 +91,11 @@ namespace tree_sitter {
set<int> precedence_values_for_item_set(const ParseItemSet &item_set) {
set<int> result;
for (const auto &item : item_set)
for (const auto &pair : item_set) {
const ParseItem &item = pair.first;
if (item.consumed_symbol_count > 0)
result.insert(item.precedence());
}
return result;
}
@ -98,8 +105,8 @@ namespace tree_sitter {
conflict_manager(ParseConflictManager(grammar, lex_grammar)) {}
pair<ParseTable, vector<Conflict>> build() {
ParseItem start_item(rules::START(), make_shared<Symbol>(0), 0, rules::END_OF_INPUT());
add_parse_state(item_set_closure(start_item, grammar));
ParseItem start_item(rules::START(), make_shared<Symbol>(0), 0);
add_parse_state(item_set_closure(start_item, { rules::END_OF_INPUT() }, grammar));
return { parse_table, conflict_manager.conflicts() };
}
};

View file

@ -60,11 +60,13 @@ namespace tree_sitter {
set<Symbol> first_set(const ParseItemSet &item_set, const PreparedGrammar &grammar) {
set<Symbol> result;
for (auto &item : item_set) {
auto &&rule_set = first_set(item.rule, grammar);
for (const auto &pair : item_set) {
const auto &item = pair.first;
const auto &lookahead_symbols = pair.second;
const auto &rule_set = first_set(item.rule, grammar);
result.insert(rule_set.begin(), rule_set.end());
if (rule_can_be_blank(item.rule, grammar))
result.insert(item.lookahead_sym);
result.insert(lookahead_symbols.begin(), lookahead_symbols.end());
}
return result;
}

View file

@ -12,7 +12,8 @@ namespace tree_sitter {
namespace build_tables {
map<Symbol, set<Symbol>> follow_sets(const ParseItem &item,
const PreparedGrammar &grammar) {
const set<Symbol> &lookahead_symbols,
const PreparedGrammar &grammar) {
map<Symbol, set<Symbol>> result;
for (auto &pair : sym_transitions(item.rule)) {
Symbol symbol = pair.first;
@ -20,7 +21,7 @@ namespace tree_sitter {
if (!symbol.is_token() && !symbol.is_built_in()) {
set<Symbol> following_terminals = first_set(next_rule, grammar);
if (rule_can_be_blank(next_rule, grammar))
following_terminals.insert(item.lookahead_sym);
following_terminals.insert(lookahead_symbols.begin(), lookahead_symbols.end());
result.insert({ symbol, following_terminals });
}
}

View file

@ -18,7 +18,7 @@ namespace tree_sitter {
* after the corresponding non-terminals.
*/
std::map<rules::Symbol, std::set<rules::Symbol>>
follow_sets(const ParseItem &item, const PreparedGrammar &grammar);
follow_sets(const ParseItem &item, const std::set<rules::Symbol> &lookahead_symbols, const PreparedGrammar &grammar);
}
}

View file

@ -1,6 +1,7 @@
#include "compiler/build_tables/item_set_closure.h"
#include <set>
#include <vector>
#include <utility>
#include "tree_sitter/compiler.h"
#include "compiler/build_tables/follow_sets.h"
#include "compiler/build_tables/item.h"
@ -10,27 +11,39 @@ namespace tree_sitter {
using std::set;
using rules::Symbol;
using std::vector;
using std::pair;
namespace build_tables {
const ParseItemSet item_set_closure(const ParseItem &item,
const ParseItemSet item_set_closure(const ParseItem &starting_item,
const set<Symbol> &starting_lookahead_symbols,
const PreparedGrammar &grammar) {
ParseItemSet result;
vector<ParseItem> items_to_add = { item };
while (!items_to_add.empty()) {
ParseItem item = items_to_add.back();
items_to_add.pop_back();
auto insertion_result = result.insert(item);
if (insertion_result.second) {
for (const auto &pair : follow_sets(item, grammar)) {
vector<pair<ParseItem, set<Symbol>>> pairs_to_add = { {starting_item, starting_lookahead_symbols} };
while (!pairs_to_add.empty()) {
auto pair = pairs_to_add.back();
pairs_to_add.pop_back();
auto &item = pair.first;
auto &lookahead_symbols = pair.second;
bool new_stuff_added = false;
auto &existing_lookahead_symbols = result[item];
for (auto &sym : lookahead_symbols) {
auto insertion_result = existing_lookahead_symbols.insert(sym);
if (insertion_result.second) new_stuff_added = true;
}
if (new_stuff_added) {
for (const auto &pair : follow_sets(item, lookahead_symbols, grammar)) {
const Symbol &non_terminal = pair.first;
const set<Symbol> &terminals = pair.second;
for (const auto &terminal : terminals) {
ParseItem next_item(non_terminal, grammar.rule(non_terminal), 0, terminal);
items_to_add.push_back(next_item);
}
pairs_to_add.push_back({
ParseItem(non_terminal, grammar.rule(non_terminal), 0),
terminals
});
}
}
}
return result;
}
}

View file

@ -1,6 +1,8 @@
#ifndef COMPILER_BUILD_TABLES_ITEM_SET_CLOSURE_H_
#define COMPILER_BUILD_TABLES_ITEM_SET_CLOSURE_H_
#include <set>
#include "compiler/rules/symbol.h"
#include "compiler/build_tables/parse_item.h"
namespace tree_sitter {
@ -8,6 +10,7 @@ namespace tree_sitter {
namespace build_tables {
const ParseItemSet item_set_closure(const ParseItem &item,
const std::set<rules::Symbol> &lookahead_symbols,
const PreparedGrammar &grammar);
}
}

View file

@ -1,6 +1,5 @@
#include "compiler/build_tables/item_set_transitions.h"
#include <unordered_set>
#include <vector>
#include <set>
#include "compiler/build_tables/item_set_closure.h"
#include "compiler/build_tables/rule_transitions.h"
#include "compiler/build_tables/merge_transitions.h"
@ -8,76 +7,34 @@
namespace tree_sitter {
using std::map;
using std::vector;
using std::unordered_set;
using std::set;
using rules::CharacterSet;
using rules::Symbol;
namespace build_tables {
template<typename T>
static void merge_sets(unordered_set<T> *left, const unordered_set<T> *right) {
left->insert(right->begin(), right->end());
}
const Symbol placeholder_lookahead = Symbol(-100);
const Symbol placeholder_lhs = Symbol(-101);
static map<Symbol, ParseItemSet> sym_transitions_for_rule(SymTransitions *self, const rules::rule_ptr &rule, const PreparedGrammar &grammar) {
auto pair = self->transitions_cache.find(rule);
if (pair != self->transitions_cache.end()) return pair->second;
map<Symbol, ParseItemSet> result;
for (auto &transition : sym_transitions(rule)) {
ParseItem new_item(placeholder_lhs, transition.second, 1, placeholder_lookahead);
result.insert({
transition.first,
item_set_closure(new_item, grammar)
});
}
self->transitions_cache.insert({ rule, result });
return result;
}
static map<Symbol, ParseItemSet> sym_transitions_for_item(SymTransitions *self, const ParseItem &item, const PreparedGrammar &grammar) {
auto result = sym_transitions_for_rule(self, item.rule, grammar);
for (auto &pair : result) {
vector<ParseItem> new_items;
auto &items = pair.second;
for (auto iter = items.begin(), end = items.end(); iter != end;) {
ParseItem new_item(*iter);
bool changed = false;
if (new_item.consumed_symbol_count > 0) {
new_item.consumed_symbol_count = item.consumed_symbol_count + 1;
changed = true;
}
if (new_item.lookahead_sym == placeholder_lookahead) {
new_item.lookahead_sym = item.lookahead_sym;
changed = true;
}
if (new_item.lhs == placeholder_lhs) {
new_item.lhs = item.lhs;
changed = true;
}
if (changed) {
iter = pair.second.erase(iter);
new_items.push_back(new_item);
} else {
++iter;
}
}
pair.second.insert(new_items.begin(), new_items.end());
}
return result;
}
map<Symbol, ParseItemSet>
SymTransitions::operator()(const ParseItemSet &item_set, const PreparedGrammar &grammar) {
map<Symbol, ParseItemSet> result;
for (const ParseItem &item : item_set)
merge_sym_transitions<ParseItemSet>(&result,
sym_transitions_for_item(this, item, grammar),
[](ParseItemSet *l, const ParseItemSet *r) {
merge_sets(l, r);
for (const auto &pair : item_set) {
const ParseItem &item = pair.first;
const set<Symbol> &lookahead_symbols = pair.second;
map<Symbol, ParseItemSet> result_for_item;
for (auto &transition : sym_transitions(item.rule)) {
ParseItem new_item(item.lhs, transition.second, item.consumed_symbol_count + 1);
result_for_item.insert({
transition.first,
item_set_closure(new_item, lookahead_symbols, grammar)
});
}
merge_sym_transitions<ParseItemSet>(&result, result_for_item,
[](ParseItemSet *left, const ParseItemSet *right) {
for (auto &pair : *right)
left->operator[](pair.first).insert(pair.second.begin(), pair.second.end());
});
}
return result;
}
@ -93,9 +50,10 @@ namespace tree_sitter {
LexItemSet({ next_item })
});
}
merge_char_transitions<LexItemSet>(&result, item_transitions, [](LexItemSet *l, const LexItemSet *r) {
merge_sets(l, r);
});
merge_char_transitions<LexItemSet>(&result, item_transitions,
[](LexItemSet *left, const LexItemSet *right) {
left->insert(right->begin(), right->end());
});
}
return result;
}

View file

@ -1,7 +1,10 @@
#include "compiler/build_tables/parse_item.h"
#include <set>
#include "tree_sitter/compiler.h"
namespace tree_sitter {
using std::pair;
using std::set;
using std::string;
using std::to_string;
using std::ostream;
@ -9,31 +12,19 @@ namespace tree_sitter {
namespace build_tables {
ParseItem::ParseItem(const rules::Symbol &lhs,
const rules::rule_ptr rule,
size_t consumed_symbol_count,
const rules::Symbol &lookahead_sym) :
size_t consumed_symbol_count) :
Item(lhs, rule),
consumed_symbol_count(consumed_symbol_count),
lookahead_sym(lookahead_sym) {}
consumed_symbol_count(consumed_symbol_count) {}
bool ParseItem::operator==(const ParseItem &other) const {
return
(other.lhs == lhs) &&
(other.consumed_symbol_count == consumed_symbol_count) &&
(other.lookahead_sym == lookahead_sym) &&
(other.rule == rule || other.rule->operator==(*rule));
(lhs == other.lhs) &&
(consumed_symbol_count == other.consumed_symbol_count) &&
(rule == other.rule || rule->operator==(*other.rule));
}
ostream& operator<<(ostream &stream, const ParseItem &item) {
return stream <<
string("#<item ") <<
item.lhs <<
string(" ") <<
*item.rule <<
string(" ") <<
to_string(item.consumed_symbol_count) <<
string(" ") <<
item.lookahead_sym <<
string(">");
return stream << string("#<item ") << item.lhs << string(" ") << *item.rule << string(">");
}
}
}

View file

@ -1,8 +1,9 @@
#ifndef COMPILER_BUILD_TABLES_PARSE_ITEM_H_
#define COMPILER_BUILD_TABLES_PARSE_ITEM_H_
#include <unordered_set>
#include <set>
#include <string>
#include <unordered_map>
#include "compiler/rules/symbol.h"
#include "compiler/build_tables/item.h"
@ -10,19 +11,14 @@ namespace tree_sitter {
namespace build_tables {
class ParseItem : public Item {
public:
ParseItem(const rules::Symbol &lhs,
rules::rule_ptr rule,
const size_t consumed_symbol_count,
const rules::Symbol &lookahead_sym);
ParseItem(const rules::Symbol &lhs, rules::rule_ptr rule, const size_t consumed_symbol_count);
bool operator==(const ParseItem &other) const;
size_t consumed_symbol_count;
rules::Symbol lookahead_sym;
};
std::ostream& operator<<(std::ostream &stream, const ParseItem &item);
typedef std::unordered_set<ParseItem> ParseItemSet;
typedef std::unordered_map<ParseItem, std::set<rules::Symbol>> ParseItemSet;
}
}
@ -31,10 +27,9 @@ namespace std {
struct hash<tree_sitter::build_tables::ParseItem> {
size_t operator()(const tree_sitter::build_tables::ParseItem &item) const {
return
hash<tree_sitter::rules::Symbol>()(item.lhs) ^
hash<tree_sitter::rules::rule_ptr>()(item.rule) ^
hash<size_t>()(item.consumed_symbol_count) ^
hash<tree_sitter::rules::Symbol>()(item.lookahead_sym);
hash<tree_sitter::rules::Symbol>()(item.lhs) ^
hash<tree_sitter::rules::rule_ptr>()(item.rule) ^
hash<size_t>()(item.consumed_symbol_count);
}
};
@ -42,8 +37,12 @@ namespace std {
struct hash<const tree_sitter::build_tables::ParseItemSet> {
size_t operator()(const tree_sitter::build_tables::ParseItemSet &set) const {
size_t result = hash<size_t>()(set.size());
for (auto item : set)
result ^= hash<tree_sitter::build_tables::ParseItem>()(item);
for (auto &pair : set) {
result ^= hash<tree_sitter::build_tables::ParseItem>()(pair.first);
result ^= hash<size_t>()(pair.second.size());
for (auto &symbol : pair.second)
result ^= hash<tree_sitter::rules::Symbol>()(symbol);
}
return result;
}
};

View file

@ -37,7 +37,7 @@ namespace tree_sitter {
string Symbol::to_string() const {
string name = (options & SymbolOptionAuxiliary) ? "aux_" : "";
name += (options & SymbolOptionToken) ? "token" : "sym";
return "#<" + name + std::to_string(index) + ">";
return "#<" + name + " " + std::to_string(index) + ">";
}
bool Symbol::operator<(const Symbol &other) const {