From 06215607d13773fdb4762012ca566b05543b9ba2 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sat, 19 Nov 2016 22:35:23 -0800 Subject: [PATCH 1/2] Precompute transitive closure contributions by grammar symbol --- spec/helpers/stream_methods.cc | 6 +- .../build_tables/parse_item_set_builder.cc | 155 +++++++++++------- .../build_tables/parse_item_set_builder.h | 4 +- 3 files changed, 105 insertions(+), 60 deletions(-) diff --git a/spec/helpers/stream_methods.cc b/spec/helpers/stream_methods.cc index cf35512d..4d411d66 100644 --- a/spec/helpers/stream_methods.cc +++ b/spec/helpers/stream_methods.cc @@ -132,7 +132,11 @@ std::ostream &operator<<(std::ostream &stream, const ParseItemSet &item_set) { } std::ostream &operator<<(std::ostream &stream, const LookaheadSet &set) { - return stream << *set.entries; + if (set.entries.get()) { + return stream << *set.entries; + } else { + return stream << "{}"; + } } } // namespace build_tables diff --git a/src/compiler/build_tables/parse_item_set_builder.cc b/src/compiler/build_tables/parse_item_set_builder.cc index 45406f88..24408e6d 100644 --- a/src/compiler/build_tables/parse_item_set_builder.cc +++ b/src/compiler/build_tables/parse_item_set_builder.cc @@ -13,6 +13,7 @@ using std::vector; using std::set; using std::map; using std::get; +using std::pair; using std::tuple; using std::make_tuple; using std::shared_ptr; @@ -20,28 +21,36 @@ using std::make_shared; using rules::Symbol; using rules::NONE; -ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar, - const LexicalGrammar &lexical_grammar) : - grammar{&grammar} { - vector symbol_stack; - set processed_symbols; +static Symbol::Index PROPAGATE = -5; - for (size_t i = 0; i < grammar.variables.size(); i++) { +ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar, + const LexicalGrammar &lexical_grammar) { + vector> items_to_process; + vector symbols_to_process; + set processed_non_terminals; + + for (size_t i = 0, n = lexical_grammar.variables.size(); i < n; i++) { + Symbol symbol(i, true); + first_sets.insert({symbol, LookaheadSet({ static_cast(i) })}); + } + + for (size_t i = 0, n = grammar.variables.size(); i < n; i++) { Symbol symbol(i); LookaheadSet first_set; - processed_symbols.clear(); - symbol_stack.clear(); - symbol_stack.push_back(symbol); - while (!symbol_stack.empty()) { - Symbol current_symbol = symbol_stack.back(); - symbol_stack.pop_back(); + processed_non_terminals.clear(); + symbols_to_process.clear(); + symbols_to_process.push_back(symbol); + while (!symbols_to_process.empty()) { + Symbol current_symbol = symbols_to_process.back(); + symbols_to_process.pop_back(); + if (current_symbol.is_token) { first_set.insert(current_symbol.index); - } else if (processed_symbols.insert(current_symbol).second) { + } else if (processed_non_terminals.insert(current_symbol.index).second) { for (const Production &production : grammar.productions(current_symbol)) { if (!production.empty()) { - symbol_stack.push_back(production[0].symbol); + symbols_to_process.push_back(production[0].symbol); } } } @@ -50,55 +59,87 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar, first_sets.insert({symbol, first_set}); } - for (size_t i = 0; i < lexical_grammar.variables.size(); i++) { - Symbol symbol(i, true); - first_sets.insert({symbol, LookaheadSet({ static_cast(i) })}); + for (size_t i = 0, n = grammar.variables.size(); i < n; i++) { + Symbol symbol(i); + ParseItemSet item_set; + + items_to_process.clear(); + for (const Production &production : grammar.productions(symbol)) { + items_to_process.push_back({ + ParseItem(symbol, production, 0), + LookaheadSet({ PROPAGATE }), + }); + } + + while (!items_to_process.empty()) { + ParseItem item = items_to_process.back().first; + LookaheadSet lookaheads = items_to_process.back().second; + items_to_process.pop_back(); + + if (item_set.entries[item].insert_all(lookaheads)) { + Symbol next_symbol = item.next_symbol(); + if (next_symbol.is_built_in() || next_symbol.is_token) + continue; + + LookaheadSet next_lookaheads; + size_t next_step = item.step_index + 1; + if (next_step == item.production->size()) { + next_lookaheads = lookaheads; + } else { + Symbol symbol_after_next = item.production->at(next_step).symbol; + next_lookaheads = first_sets.find(symbol_after_next)->second; + } + + for (const Production &production : grammar.productions(next_symbol)) { + items_to_process.push_back({ + ParseItem(next_symbol, production, 0), + next_lookaheads, + }); + } + } + } + + cached_item_sets.insert({symbol.index, item_set}); } } void ParseItemSetBuilder::apply_transitive_closure(ParseItemSet *item_set) { - items_to_process.clear(); - for (const auto &entry : item_set->entries) { - items_to_process.push_back(make_tuple(entry.first, entry.second, true)); + item_set_buffer.clear(); + for (const auto &pair : item_set->entries) { + const ParseItem &item = pair.first; + const LookaheadSet &lookaheads = pair.second; + + const Symbol &next_symbol = item.next_symbol(); + if (!next_symbol.is_token && !next_symbol.is_built_in()) { + LookaheadSet next_lookaheads; + size_t next_step = item.step_index + 1; + if (next_step == item.production->size()) { + next_lookaheads = lookaheads; + } else { + Symbol symbol_after_next = item.production->at(next_step).symbol; + next_lookaheads = first_sets.find(symbol_after_next)->second; + } + + for (const auto &cached_pair : cached_item_sets[next_symbol.index].entries) { + const ParseItem &cached_item = cached_pair.first; + const LookaheadSet &cached_lookaheads = cached_pair.second; + + LookaheadSet new_lookaheads; + for (auto entry : *cached_lookaheads.entries) { + if (entry == PROPAGATE) { + new_lookaheads.insert_all(next_lookaheads); + } else { + new_lookaheads.insert(entry); + } + } + + item_set_buffer.push_back({cached_item, new_lookaheads}); + } + } } - while (!items_to_process.empty()) { - ParseItem item = get<0>(items_to_process.back()); - LookaheadSet lookahead_symbols = get<1>(items_to_process.back()); - bool from_original_set = get<2>(items_to_process.back()); - items_to_process.pop_back(); - - // Add the parse-item and lookahead symbols to the item set. - // If they were already present, skip to the next item. - if (!from_original_set && !item_set->entries[item].insert_all(lookahead_symbols)) - continue; - - // If the next symbol in the production is not a non-terminal, skip to the - // next item. - Symbol next_symbol = item.next_symbol(); - if (next_symbol == NONE() || next_symbol.is_token || - next_symbol.is_built_in()) - continue; - - // If the next symbol is the last symbol in the item's production, then the - // lookahead symbols for the new items are the same as for the current item. - // Otherwise, they are the FOLLOW set of the symbol in this production. - LookaheadSet next_lookahead_symbols; - size_t next_step = item.step_index + 1; - if (next_step == item.production->size()) { - next_lookahead_symbols = lookahead_symbols; - } else { - Symbol symbol_after_next = item.production->at(next_step).symbol; - next_lookahead_symbols = first_sets.find(symbol_after_next)->second; - } - - // Add each of the next symbol's productions to be processed recursively. - for (const Production &production : grammar->productions(next_symbol)) - items_to_process.push_back(make_tuple( - ParseItem(next_symbol, production, 0), - next_lookahead_symbols, - false - )); + for (const auto &pair : item_set_buffer) { + item_set->entries[pair.first].insert_all(pair.second); } } diff --git a/src/compiler/build_tables/parse_item_set_builder.h b/src/compiler/build_tables/parse_item_set_builder.h index db3ca930..6279798a 100644 --- a/src/compiler/build_tables/parse_item_set_builder.h +++ b/src/compiler/build_tables/parse_item_set_builder.h @@ -13,9 +13,9 @@ struct LexicalGrammar; namespace build_tables { class ParseItemSetBuilder { - const SyntaxGrammar *grammar; std::map first_sets; - std::vector> items_to_process; + std::map cached_item_sets; + std::vector> item_set_buffer; public: ParseItemSetBuilder(const SyntaxGrammar &, const LexicalGrammar &); From 101e304a8aa79f4b8d57926167434d267b4dad30 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sun, 20 Nov 2016 20:35:26 -0800 Subject: [PATCH 2/2] Avoid unnecessary lookahead set mutations in ParseItemSetBuilder --- .../build_tables/parse_item_set_builder.cc | 69 +++++++++++-------- .../build_tables/parse_item_set_builder.h | 8 ++- 2 files changed, 47 insertions(+), 30 deletions(-) diff --git a/src/compiler/build_tables/parse_item_set_builder.cc b/src/compiler/build_tables/parse_item_set_builder.cc index 24408e6d..34b347fe 100644 --- a/src/compiler/build_tables/parse_item_set_builder.cc +++ b/src/compiler/build_tables/parse_item_set_builder.cc @@ -21,11 +21,8 @@ using std::make_shared; using rules::Symbol; using rules::NONE; -static Symbol::Index PROPAGATE = -5; - ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar, const LexicalGrammar &lexical_grammar) { - vector> items_to_process; vector symbols_to_process; set processed_non_terminals; @@ -59,52 +56,75 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar, first_sets.insert({symbol, first_set}); } + vector components_to_process; + for (size_t i = 0, n = grammar.variables.size(); i < n; i++) { Symbol symbol(i); - ParseItemSet item_set; + map> cache_entry; - items_to_process.clear(); + components_to_process.clear(); for (const Production &production : grammar.productions(symbol)) { - items_to_process.push_back({ + components_to_process.push_back(ParseItemSetComponent{ ParseItem(symbol, production, 0), - LookaheadSet({ PROPAGATE }), + LookaheadSet(), + true }); } - while (!items_to_process.empty()) { - ParseItem item = items_to_process.back().first; - LookaheadSet lookaheads = items_to_process.back().second; - items_to_process.pop_back(); + while (!components_to_process.empty()) { + ParseItemSetComponent component = components_to_process.back(); + ParseItem &item = component.item; + LookaheadSet &lookaheads = component.lookaheads; + components_to_process.pop_back(); - if (item_set.entries[item].insert_all(lookaheads)) { + bool component_is_new; + if (component.propagates_lookaheads) { + component_is_new = !cache_entry[item].second; + cache_entry[item].second = true; + } else { + component_is_new = cache_entry[item].first.insert_all(lookaheads); + } + + if (component_is_new) { Symbol next_symbol = item.next_symbol(); if (next_symbol.is_built_in() || next_symbol.is_token) continue; LookaheadSet next_lookaheads; + bool propagates_lookaheads; size_t next_step = item.step_index + 1; if (next_step == item.production->size()) { next_lookaheads = lookaheads; + propagates_lookaheads = component.propagates_lookaheads; } else { Symbol symbol_after_next = item.production->at(next_step).symbol; next_lookaheads = first_sets.find(symbol_after_next)->second; + propagates_lookaheads = false; } for (const Production &production : grammar.productions(next_symbol)) { - items_to_process.push_back({ + components_to_process.push_back(ParseItemSetComponent{ ParseItem(next_symbol, production, 0), next_lookaheads, + propagates_lookaheads }); } } } - cached_item_sets.insert({symbol.index, item_set}); + for (auto &pair : cache_entry) { + component_cache[symbol.index].push_back(ParseItemSetComponent{ + pair.first, + pair.second.first, + pair.second.second + }); + } } } void ParseItemSetBuilder::apply_transitive_closure(ParseItemSet *item_set) { item_set_buffer.clear(); + for (const auto &pair : item_set->entries) { const ParseItem &item = pair.first; const LookaheadSet &lookaheads = pair.second; @@ -120,26 +140,17 @@ void ParseItemSetBuilder::apply_transitive_closure(ParseItemSet *item_set) { next_lookaheads = first_sets.find(symbol_after_next)->second; } - for (const auto &cached_pair : cached_item_sets[next_symbol.index].entries) { - const ParseItem &cached_item = cached_pair.first; - const LookaheadSet &cached_lookaheads = cached_pair.second; - - LookaheadSet new_lookaheads; - for (auto entry : *cached_lookaheads.entries) { - if (entry == PROPAGATE) { - new_lookaheads.insert_all(next_lookaheads); - } else { - new_lookaheads.insert(entry); - } + for (const ParseItemSetComponent &component : component_cache[next_symbol.index]) { + item_set_buffer.push_back({component.item, component.lookaheads}); + if (component.propagates_lookaheads) { + item_set_buffer.push_back({component.item, next_lookaheads}); } - - item_set_buffer.push_back({cached_item, new_lookaheads}); } } } - for (const auto &pair : item_set_buffer) { - item_set->entries[pair.first].insert_all(pair.second); + for (const auto &buffer_entry : item_set_buffer) { + item_set->entries[buffer_entry.first].insert_all(buffer_entry.second); } } diff --git a/src/compiler/build_tables/parse_item_set_builder.h b/src/compiler/build_tables/parse_item_set_builder.h index 6279798a..2a0de268 100644 --- a/src/compiler/build_tables/parse_item_set_builder.h +++ b/src/compiler/build_tables/parse_item_set_builder.h @@ -13,8 +13,14 @@ struct LexicalGrammar; namespace build_tables { class ParseItemSetBuilder { + struct ParseItemSetComponent { + ParseItem item; + LookaheadSet lookaheads; + bool propagates_lookaheads; + }; + std::map first_sets; - std::map cached_item_sets; + std::map> component_cache; std::vector> item_set_buffer; public: