From 7a2c2c1c904321a1d5e28f3f9298b2356fbc693b Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 16 Jun 2014 08:35:20 -0700 Subject: [PATCH] Store ParseItemSets as maps, w/ core items as keys ParseItem no longer has a lookahead_sym field; it now represents the 'core' of a parse item. The lookahead context is stored separately, as a set per core item. This makes iterating, copying and merging item sets more efficient, because before, the core items were repeated for each different lookahead symbol. Also, the memoization in sym_transitions(ParseItemSet) has been removed. Maybe I'll add it back later. --- .../compiler/build_tables/follow_sets_spec.cc | 14 +-- .../build_tables/item_set_closure_spec.cc | 10 +- .../build_tables/item_set_transitions_spec.cc | 6 +- .../build_tables/build_parse_table.cc | 33 ++++--- src/compiler/build_tables/first_set.cc | 8 +- src/compiler/build_tables/follow_sets.cc | 5 +- src/compiler/build_tables/follow_sets.h | 2 +- src/compiler/build_tables/item_set_closure.cc | 37 +++++--- src/compiler/build_tables/item_set_closure.h | 3 + .../build_tables/item_set_transitions.cc | 92 +++++-------------- src/compiler/build_tables/parse_item.cc | 29 ++---- src/compiler/build_tables/parse_item.h | 27 +++--- src/compiler/rules/symbol.cc | 2 +- 13 files changed, 121 insertions(+), 147 deletions(-) diff --git a/spec/compiler/build_tables/follow_sets_spec.cc b/spec/compiler/build_tables/follow_sets_spec.cc index 02379cff..2c0d21e9 100644 --- a/spec/compiler/build_tables/follow_sets_spec.cc +++ b/spec/compiler/build_tables/follow_sets_spec.cc @@ -19,9 +19,9 @@ describe("computing FOLLOW sets", []() { ParseItem item(Symbol(2), choice({ seq({ i_sym(0), choice({ i_token(0), i_token(1) }) }), seq({ i_sym(1), i_token(2) }), - }), 0, Symbol(10, SymbolOptionToken)); + }), 0); - AssertThat(follow_sets(item, grammar), Equals(map>({ + AssertThat(follow_sets(item, { Symbol(10, SymbolOptionToken) }, grammar), Equals(map>({ { Symbol(0), set({ Symbol(0, SymbolOptionToken), Symbol(1, SymbolOptionToken) }) }, @@ -34,21 +34,21 @@ describe("computing FOLLOW sets", []() { ParseItem item(Symbol(2), choice({ seq({ i_sym(0), choice({ i_token(0), i_token(1) }) }), seq({ i_token(2), i_token(3) }), - }), 0, Symbol(10, SymbolOptionToken)); + }), 0); - AssertThat(follow_sets(item, grammar), Equals(map>({ + AssertThat(follow_sets(item, { Symbol(10, SymbolOptionToken) }, grammar), Equals(map>({ { Symbol(0), set({ Symbol(0, SymbolOptionToken), Symbol(1, SymbolOptionToken) }) }, }))); }); - it("includes the item's lookahead terminal if the rule after the non-terminal might be blank", [&]() { + it("includes the item's lookahead symbol if the rule after the non-terminal might be blank", [&]() { ParseItem item(Symbol(2), choice({ seq({ i_sym(0), choice({ i_token(0), blank() }) }), - }), 0, Symbol(10, SymbolOptionToken)); + }), 0); - AssertThat(follow_sets(item, grammar), Equals(map>({ + AssertThat(follow_sets(item, { Symbol(10, SymbolOptionToken) }, grammar), Equals(map>({ { Symbol(0), set({ Symbol(0, SymbolOptionToken), Symbol(10, SymbolOptionToken) }) }, diff --git a/spec/compiler/build_tables/item_set_closure_spec.cc b/spec/compiler/build_tables/item_set_closure_spec.cc index e8ef8734..de4ce5e0 100644 --- a/spec/compiler/build_tables/item_set_closure_spec.cc +++ b/spec/compiler/build_tables/item_set_closure_spec.cc @@ -19,13 +19,13 @@ describe("computing closures of item sets", []() { }, {}); it("adds items at the beginnings of referenced rules", [&]() { - ParseItemSet item_set = item_set_closure( - ParseItem(Symbol(0), grammar.rule(Symbol(0)), 0, Symbol(10, SymbolOptionToken)), - grammar); + ParseItemSet item_set = item_set_closure(ParseItem(Symbol(0), grammar.rule(Symbol(0)), 0), + { Symbol(10, SymbolOptionToken) }, + grammar); AssertThat(item_set, Equals(ParseItemSet({ - ParseItem(Symbol(1), grammar.rule(Symbol(1)), 0, Symbol(11, SymbolOptionToken)), - ParseItem(Symbol(0), grammar.rule(Symbol(0)), 0, Symbol(10, SymbolOptionToken)), + { ParseItem(Symbol(1), grammar.rule(Symbol(1)), 0), { Symbol(11, SymbolOptionToken) } }, + { ParseItem(Symbol(0), grammar.rule(Symbol(0)), 0), { Symbol(10, SymbolOptionToken) } }, }))); }); }); diff --git a/spec/compiler/build_tables/item_set_transitions_spec.cc b/spec/compiler/build_tables/item_set_transitions_spec.cc index 84c6c1bd..a6262ef1 100644 --- a/spec/compiler/build_tables/item_set_transitions_spec.cc +++ b/spec/compiler/build_tables/item_set_transitions_spec.cc @@ -37,15 +37,15 @@ describe("syntactic item set transitions", [&]() { it("computes the closure of the new item sets", [&]() { ParseItemSet set1({ - ParseItem(Symbol(0), seq({ i_token(22), i_sym(1) }), 3, Symbol(23, SymbolOptionToken)), + { ParseItem(Symbol(0), seq({ i_token(22), i_sym(1) }), 3), { Symbol(23, SymbolOptionToken) } }, }); SymTransitions sym_transitions; AssertThat(sym_transitions(set1, grammar), Equals(map({ { Symbol(22, SymbolOptionToken), ParseItemSet({ - ParseItem(Symbol(0), i_sym(1), 4, Symbol(23, SymbolOptionToken)), - ParseItem(Symbol(1), i_token(21), 0, Symbol(23, SymbolOptionToken)) + { ParseItem(Symbol(0), i_sym(1), 4), { Symbol(23, SymbolOptionToken) } }, + { ParseItem(Symbol(1), i_token(21), 0), { Symbol(23, SymbolOptionToken) } }, }) }, }))); }); diff --git a/src/compiler/build_tables/build_parse_table.cc b/src/compiler/build_tables/build_parse_table.cc index fdca019e..90faf0a1 100644 --- a/src/compiler/build_tables/build_parse_table.cc +++ b/src/compiler/build_tables/build_parse_table.cc @@ -46,14 +46,14 @@ namespace tree_sitter { void add_shift_actions(const ParseItemSet &item_set, ParseStateId state_id) { for (const auto &transition : sym_transitions(item_set, grammar)) { const Symbol &symbol = transition.first; - const ParseItemSet &item_set = transition.second; + const ParseItemSet &next_item_set = transition.second; auto &actions = parse_table.states[state_id].actions; auto current_action = actions.find(symbol); - set precedence_values = precedence_values_for_item_set(item_set); + set precedence_values = precedence_values_for_item_set(next_item_set); if (current_action == actions.end() || conflict_manager.resolve_parse_action(symbol, current_action->second, ParseAction::Shift(0, precedence_values))) { - ParseStateId new_state_id = add_parse_state(item_set); + ParseStateId new_state_id = add_parse_state(next_item_set); parse_table.add_action(state_id, symbol, ParseAction::Shift(new_state_id, precedence_values)); } } @@ -68,17 +68,22 @@ namespace tree_sitter { } void add_reduce_actions(const ParseItemSet &item_set, ParseStateId state_id) { - for (const ParseItem &item : item_set) { + for (const auto &pair : item_set) { + const ParseItem &item = pair.first; + const set &lookahead_symbols = pair.second; + if (item.is_done()) { ParseAction action = (item.lhs == rules::START()) ? ParseAction::Accept() : ParseAction::Reduce(item.lhs, item.consumed_symbol_count, item.precedence()); - auto current_actions = parse_table.states[state_id].actions; - auto current_action = current_actions.find(item.lookahead_sym); - - if (current_action == current_actions.end() || - conflict_manager.resolve_parse_action(item.lookahead_sym, current_action->second, action)) { - parse_table.add_action(state_id, item.lookahead_sym, action); + + for (auto &lookahead_sym : lookahead_symbols) { + auto current_actions = parse_table.states[state_id].actions; + auto current_action = current_actions.find(lookahead_sym); + if (current_action == current_actions.end() || + conflict_manager.resolve_parse_action(lookahead_sym, current_action->second, action)) { + parse_table.add_action(state_id, lookahead_sym, action); + } } } } @@ -86,9 +91,11 @@ namespace tree_sitter { set precedence_values_for_item_set(const ParseItemSet &item_set) { set result; - for (const auto &item : item_set) + for (const auto &pair : item_set) { + const ParseItem &item = pair.first; if (item.consumed_symbol_count > 0) result.insert(item.precedence()); + } return result; } @@ -98,8 +105,8 @@ namespace tree_sitter { conflict_manager(ParseConflictManager(grammar, lex_grammar)) {} pair> build() { - ParseItem start_item(rules::START(), make_shared(0), 0, rules::END_OF_INPUT()); - add_parse_state(item_set_closure(start_item, grammar)); + ParseItem start_item(rules::START(), make_shared(0), 0); + add_parse_state(item_set_closure(start_item, { rules::END_OF_INPUT() }, grammar)); return { parse_table, conflict_manager.conflicts() }; } }; diff --git a/src/compiler/build_tables/first_set.cc b/src/compiler/build_tables/first_set.cc index b7c56609..39558197 100644 --- a/src/compiler/build_tables/first_set.cc +++ b/src/compiler/build_tables/first_set.cc @@ -60,11 +60,13 @@ namespace tree_sitter { set first_set(const ParseItemSet &item_set, const PreparedGrammar &grammar) { set result; - for (auto &item : item_set) { - auto &&rule_set = first_set(item.rule, grammar); + for (const auto &pair : item_set) { + const auto &item = pair.first; + const auto &lookahead_symbols = pair.second; + const auto &rule_set = first_set(item.rule, grammar); result.insert(rule_set.begin(), rule_set.end()); if (rule_can_be_blank(item.rule, grammar)) - result.insert(item.lookahead_sym); + result.insert(lookahead_symbols.begin(), lookahead_symbols.end()); } return result; } diff --git a/src/compiler/build_tables/follow_sets.cc b/src/compiler/build_tables/follow_sets.cc index 1384ab17..76a701b6 100644 --- a/src/compiler/build_tables/follow_sets.cc +++ b/src/compiler/build_tables/follow_sets.cc @@ -12,7 +12,8 @@ namespace tree_sitter { namespace build_tables { map> follow_sets(const ParseItem &item, - const PreparedGrammar &grammar) { + const set &lookahead_symbols, + const PreparedGrammar &grammar) { map> result; for (auto &pair : sym_transitions(item.rule)) { Symbol symbol = pair.first; @@ -20,7 +21,7 @@ namespace tree_sitter { if (!symbol.is_token() && !symbol.is_built_in()) { set following_terminals = first_set(next_rule, grammar); if (rule_can_be_blank(next_rule, grammar)) - following_terminals.insert(item.lookahead_sym); + following_terminals.insert(lookahead_symbols.begin(), lookahead_symbols.end()); result.insert({ symbol, following_terminals }); } } diff --git a/src/compiler/build_tables/follow_sets.h b/src/compiler/build_tables/follow_sets.h index 772b16f8..f55c24ba 100644 --- a/src/compiler/build_tables/follow_sets.h +++ b/src/compiler/build_tables/follow_sets.h @@ -18,7 +18,7 @@ namespace tree_sitter { * after the corresponding non-terminals. */ std::map> - follow_sets(const ParseItem &item, const PreparedGrammar &grammar); + follow_sets(const ParseItem &item, const std::set &lookahead_symbols, const PreparedGrammar &grammar); } } diff --git a/src/compiler/build_tables/item_set_closure.cc b/src/compiler/build_tables/item_set_closure.cc index 0f0c5f77..141b2bb5 100644 --- a/src/compiler/build_tables/item_set_closure.cc +++ b/src/compiler/build_tables/item_set_closure.cc @@ -1,6 +1,7 @@ #include "compiler/build_tables/item_set_closure.h" #include #include +#include #include "tree_sitter/compiler.h" #include "compiler/build_tables/follow_sets.h" #include "compiler/build_tables/item.h" @@ -10,27 +11,39 @@ namespace tree_sitter { using std::set; using rules::Symbol; using std::vector; + using std::pair; namespace build_tables { - const ParseItemSet item_set_closure(const ParseItem &item, + const ParseItemSet item_set_closure(const ParseItem &starting_item, + const set &starting_lookahead_symbols, const PreparedGrammar &grammar) { ParseItemSet result; - vector items_to_add = { item }; - while (!items_to_add.empty()) { - ParseItem item = items_to_add.back(); - items_to_add.pop_back(); - auto insertion_result = result.insert(item); - if (insertion_result.second) { - for (const auto &pair : follow_sets(item, grammar)) { + vector>> pairs_to_add = { {starting_item, starting_lookahead_symbols} }; + while (!pairs_to_add.empty()) { + auto pair = pairs_to_add.back(); + pairs_to_add.pop_back(); + auto &item = pair.first; + auto &lookahead_symbols = pair.second; + + bool new_stuff_added = false; + auto &existing_lookahead_symbols = result[item]; + for (auto &sym : lookahead_symbols) { + auto insertion_result = existing_lookahead_symbols.insert(sym); + if (insertion_result.second) new_stuff_added = true; + } + + if (new_stuff_added) { + for (const auto &pair : follow_sets(item, lookahead_symbols, grammar)) { const Symbol &non_terminal = pair.first; const set &terminals = pair.second; - for (const auto &terminal : terminals) { - ParseItem next_item(non_terminal, grammar.rule(non_terminal), 0, terminal); - items_to_add.push_back(next_item); - } + pairs_to_add.push_back({ + ParseItem(non_terminal, grammar.rule(non_terminal), 0), + terminals + }); } } } + return result; } } diff --git a/src/compiler/build_tables/item_set_closure.h b/src/compiler/build_tables/item_set_closure.h index dde19ff5..b25a2869 100644 --- a/src/compiler/build_tables/item_set_closure.h +++ b/src/compiler/build_tables/item_set_closure.h @@ -1,6 +1,8 @@ #ifndef COMPILER_BUILD_TABLES_ITEM_SET_CLOSURE_H_ #define COMPILER_BUILD_TABLES_ITEM_SET_CLOSURE_H_ +#include +#include "compiler/rules/symbol.h" #include "compiler/build_tables/parse_item.h" namespace tree_sitter { @@ -8,6 +10,7 @@ namespace tree_sitter { namespace build_tables { const ParseItemSet item_set_closure(const ParseItem &item, + const std::set &lookahead_symbols, const PreparedGrammar &grammar); } } diff --git a/src/compiler/build_tables/item_set_transitions.cc b/src/compiler/build_tables/item_set_transitions.cc index 215ee9d7..6c44a23a 100644 --- a/src/compiler/build_tables/item_set_transitions.cc +++ b/src/compiler/build_tables/item_set_transitions.cc @@ -1,6 +1,5 @@ #include "compiler/build_tables/item_set_transitions.h" -#include -#include +#include #include "compiler/build_tables/item_set_closure.h" #include "compiler/build_tables/rule_transitions.h" #include "compiler/build_tables/merge_transitions.h" @@ -8,76 +7,34 @@ namespace tree_sitter { using std::map; - using std::vector; - using std::unordered_set; + using std::set; using rules::CharacterSet; using rules::Symbol; namespace build_tables { - template - static void merge_sets(unordered_set *left, const unordered_set *right) { - left->insert(right->begin(), right->end()); - } - - const Symbol placeholder_lookahead = Symbol(-100); - const Symbol placeholder_lhs = Symbol(-101); - - static map sym_transitions_for_rule(SymTransitions *self, const rules::rule_ptr &rule, const PreparedGrammar &grammar) { - auto pair = self->transitions_cache.find(rule); - if (pair != self->transitions_cache.end()) return pair->second; - map result; - for (auto &transition : sym_transitions(rule)) { - ParseItem new_item(placeholder_lhs, transition.second, 1, placeholder_lookahead); - result.insert({ - transition.first, - item_set_closure(new_item, grammar) - }); - } - self->transitions_cache.insert({ rule, result }); - return result; - } - - static map sym_transitions_for_item(SymTransitions *self, const ParseItem &item, const PreparedGrammar &grammar) { - auto result = sym_transitions_for_rule(self, item.rule, grammar); - for (auto &pair : result) { - vector new_items; - auto &items = pair.second; - for (auto iter = items.begin(), end = items.end(); iter != end;) { - ParseItem new_item(*iter); - bool changed = false; - if (new_item.consumed_symbol_count > 0) { - new_item.consumed_symbol_count = item.consumed_symbol_count + 1; - changed = true; - } - if (new_item.lookahead_sym == placeholder_lookahead) { - new_item.lookahead_sym = item.lookahead_sym; - changed = true; - } - if (new_item.lhs == placeholder_lhs) { - new_item.lhs = item.lhs; - changed = true; - } - if (changed) { - iter = pair.second.erase(iter); - new_items.push_back(new_item); - } else { - ++iter; - } - } - pair.second.insert(new_items.begin(), new_items.end()); - } - return result; - } - map SymTransitions::operator()(const ParseItemSet &item_set, const PreparedGrammar &grammar) { map result; - for (const ParseItem &item : item_set) - merge_sym_transitions(&result, - sym_transitions_for_item(this, item, grammar), - [](ParseItemSet *l, const ParseItemSet *r) { - merge_sets(l, r); + + for (const auto &pair : item_set) { + const ParseItem &item = pair.first; + const set &lookahead_symbols = pair.second; + map result_for_item; + for (auto &transition : sym_transitions(item.rule)) { + ParseItem new_item(item.lhs, transition.second, item.consumed_symbol_count + 1); + result_for_item.insert({ + transition.first, + item_set_closure(new_item, lookahead_symbols, grammar) + }); + } + + merge_sym_transitions(&result, result_for_item, + [](ParseItemSet *left, const ParseItemSet *right) { + for (auto &pair : *right) + left->operator[](pair.first).insert(pair.second.begin(), pair.second.end()); }); + } + return result; } @@ -93,9 +50,10 @@ namespace tree_sitter { LexItemSet({ next_item }) }); } - merge_char_transitions(&result, item_transitions, [](LexItemSet *l, const LexItemSet *r) { - merge_sets(l, r); - }); + merge_char_transitions(&result, item_transitions, + [](LexItemSet *left, const LexItemSet *right) { + left->insert(right->begin(), right->end()); + }); } return result; } diff --git a/src/compiler/build_tables/parse_item.cc b/src/compiler/build_tables/parse_item.cc index 2b92f2be..280bf620 100644 --- a/src/compiler/build_tables/parse_item.cc +++ b/src/compiler/build_tables/parse_item.cc @@ -1,7 +1,10 @@ #include "compiler/build_tables/parse_item.h" +#include #include "tree_sitter/compiler.h" namespace tree_sitter { + using std::pair; + using std::set; using std::string; using std::to_string; using std::ostream; @@ -9,31 +12,19 @@ namespace tree_sitter { namespace build_tables { ParseItem::ParseItem(const rules::Symbol &lhs, const rules::rule_ptr rule, - size_t consumed_symbol_count, - const rules::Symbol &lookahead_sym) : + size_t consumed_symbol_count) : Item(lhs, rule), - consumed_symbol_count(consumed_symbol_count), - lookahead_sym(lookahead_sym) {} - + consumed_symbol_count(consumed_symbol_count) {} + bool ParseItem::operator==(const ParseItem &other) const { return - (other.lhs == lhs) && - (other.consumed_symbol_count == consumed_symbol_count) && - (other.lookahead_sym == lookahead_sym) && - (other.rule == rule || other.rule->operator==(*rule)); + (lhs == other.lhs) && + (consumed_symbol_count == other.consumed_symbol_count) && + (rule == other.rule || rule->operator==(*other.rule)); } ostream& operator<<(ostream &stream, const ParseItem &item) { - return stream << - string("#"); + return stream << string("#"); } } } diff --git a/src/compiler/build_tables/parse_item.h b/src/compiler/build_tables/parse_item.h index 4d0de966..2c638885 100644 --- a/src/compiler/build_tables/parse_item.h +++ b/src/compiler/build_tables/parse_item.h @@ -1,8 +1,9 @@ #ifndef COMPILER_BUILD_TABLES_PARSE_ITEM_H_ #define COMPILER_BUILD_TABLES_PARSE_ITEM_H_ -#include +#include #include +#include #include "compiler/rules/symbol.h" #include "compiler/build_tables/item.h" @@ -10,19 +11,14 @@ namespace tree_sitter { namespace build_tables { class ParseItem : public Item { public: - ParseItem(const rules::Symbol &lhs, - rules::rule_ptr rule, - const size_t consumed_symbol_count, - const rules::Symbol &lookahead_sym); + ParseItem(const rules::Symbol &lhs, rules::rule_ptr rule, const size_t consumed_symbol_count); bool operator==(const ParseItem &other) const; - size_t consumed_symbol_count; - rules::Symbol lookahead_sym; }; std::ostream& operator<<(std::ostream &stream, const ParseItem &item); - typedef std::unordered_set ParseItemSet; + typedef std::unordered_map> ParseItemSet; } } @@ -31,10 +27,9 @@ namespace std { struct hash { size_t operator()(const tree_sitter::build_tables::ParseItem &item) const { return - hash()(item.lhs) ^ - hash()(item.rule) ^ - hash()(item.consumed_symbol_count) ^ - hash()(item.lookahead_sym); + hash()(item.lhs) ^ + hash()(item.rule) ^ + hash()(item.consumed_symbol_count); } }; @@ -42,8 +37,12 @@ namespace std { struct hash { size_t operator()(const tree_sitter::build_tables::ParseItemSet &set) const { size_t result = hash()(set.size()); - for (auto item : set) - result ^= hash()(item); + for (auto &pair : set) { + result ^= hash()(pair.first); + result ^= hash()(pair.second.size()); + for (auto &symbol : pair.second) + result ^= hash()(symbol); + } return result; } }; diff --git a/src/compiler/rules/symbol.cc b/src/compiler/rules/symbol.cc index aa27a33b..3e0337ff 100644 --- a/src/compiler/rules/symbol.cc +++ b/src/compiler/rules/symbol.cc @@ -37,7 +37,7 @@ namespace tree_sitter { string Symbol::to_string() const { string name = (options & SymbolOptionAuxiliary) ? "aux_" : ""; name += (options & SymbolOptionToken) ? "token" : "sym"; - return "#<" + name + std::to_string(index) + ">"; + return "#<" + name + " " + std::to_string(index) + ">"; } bool Symbol::operator<(const Symbol &other) const {