diff --git a/src/compiler/build_tables/parse_item_set_builder.cc b/src/compiler/build_tables/parse_item_set_builder.cc index 31bb836b..2e8aa279 100644 --- a/src/compiler/build_tables/parse_item_set_builder.cc +++ b/src/compiler/build_tables/parse_item_set_builder.cc @@ -21,61 +21,58 @@ using std::unordered_map; using std::vector; using rules::Symbol; -static vector inline_production(const ParseItem &item, const SyntaxGrammar &grammar) { - vector result; - auto &inlined_step = item.production->at(item.step_index); - auto &productions_to_insert = grammar.variables[inlined_step.symbol.index].productions; - for (const Production &production_to_insert : productions_to_insert) { - auto begin = item.production->steps.begin(); - auto end = item.production->steps.end(); - auto step = begin + item.step_index; +struct FollowSetInfo { + LookaheadSet lookaheads; + bool propagates_lookaheads; +}; - Production production{{begin, step}, item.production->dynamic_precedence}; - for (auto &step : production_to_insert) { - production.steps.push_back(step); - if (!inlined_step.name_replacement.empty()) { - production.steps.back().name_replacement = inlined_step.name_replacement; - } - } - production.back().precedence = inlined_step.precedence; - production.back().associativity = inlined_step.associativity; - production.steps.insert( - production.steps.end(), - step + 1, - end - ); +struct NonTerminalQueueEntry { + Symbol::Index non_terminal; + LookaheadSet lookaheads; + bool propagates_lookaheads; +}; - if (find(result.begin(), result.end(), production) == result.end()) { - result.push_back(move(production)); - } - } - return result; +bool ParseItemSetBuilder::ParseItemSetComponent::operator==( + const ParseItemSetBuilder::ParseItemSetComponent &other) const { + return item == other.item && + lookaheads == other.lookaheads && + propagates_lookaheads == other.propagates_lookaheads; } -ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar, - const LexicalGrammar 
&lexical_grammar) : grammar{grammar} { - vector symbols_to_process; - set processed_non_terminals; +template +inline void find_or_push(vector &vector, const T &item) { + if (find(vector.begin(), vector.end(), item) == vector.end()) { + vector.push_back(item); + } +} +ParseItemSetBuilder::ParseItemSetBuilder( + const SyntaxGrammar &grammar, + const LexicalGrammar &lexical_grammar +) : grammar{grammar} { + + // Populate the FIRST and LAST set of each terminal, which just contains the terminal itself. for (size_t i = 0, n = lexical_grammar.variables.size(); i < n; i++) { Symbol symbol = Symbol::terminal(i); first_sets.insert({symbol, LookaheadSet({symbol})}); last_sets.insert({symbol, LookaheadSet({symbol})}); } - for (size_t i = 0, n = grammar.external_tokens.size(); i < n; i++) { Symbol symbol = Symbol::external(i); first_sets.insert({symbol, LookaheadSet({symbol})}); last_sets.insert({symbol, LookaheadSet({symbol})}); } + // Populate the FIRST and LAST set of each non-terminal by recursively expanding non-terminals. 
+ vector symbols_to_process; + set processed_non_terminals; for (size_t i = 0, n = grammar.variables.size(); i < n; i++) { Symbol symbol = Symbol::non_terminal(i); + LookaheadSet &first_set = first_sets[symbol]; + LookaheadSet &last_set = last_sets[symbol]; - LookaheadSet first_set; processed_non_terminals.clear(); - symbols_to_process.clear(); - symbols_to_process.push_back(symbol); + symbols_to_process.assign({symbol}); while (!symbols_to_process.empty()) { Symbol current_symbol = symbols_to_process.back(); symbols_to_process.pop_back(); @@ -91,12 +88,8 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar, } } - first_sets.insert({symbol, first_set}); - - LookaheadSet last_set; processed_non_terminals.clear(); - symbols_to_process.clear(); - symbols_to_process.push_back(symbol); + symbols_to_process.assign({symbol}); while (!symbols_to_process.empty()) { Symbol current_symbol = symbols_to_process.back(); symbols_to_process.pop_back(); @@ -111,41 +104,27 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar, } } } - - last_sets.insert({symbol, last_set}); } - struct NonTerminalQueueEntry { - Symbol::Index non_terminal; - LookaheadSet lookaheads; - bool propagates_lookaheads; - }; + // Populate a cache of which ParseItems will be created when a given non-terminal is expanded. 
+ vector non_terminal_queue; + for (Symbol::Index i = 0, n = grammar.variables.size(); i < n; i++) { - vector non_terminal_queue_entry; - - for (size_t i = 0, n = grammar.variables.size(); i < n; i++) { - Symbol symbol = Symbol::non_terminal(i); - - unordered_map> cached_lookaheads_by_non_terminal; - - non_terminal_queue_entry.clear(); - non_terminal_queue_entry.push_back({ - symbol.index, - LookaheadSet(), - true - }); - - while (!non_terminal_queue_entry.empty()) { - NonTerminalQueueEntry queue_entry = non_terminal_queue_entry.back(); - non_terminal_queue_entry.pop_back(); + // Compute the follow set of each *other* non-terminal that the current non-terminal can + // start with. + unordered_map follow_set_info_by_non_terminal; + non_terminal_queue.assign({{i, LookaheadSet(), true}}); + while (!non_terminal_queue.empty()) { + NonTerminalQueueEntry queue_entry = non_terminal_queue.back(); + non_terminal_queue.pop_back(); bool queue_entry_is_new; - auto &cache_entry = cached_lookaheads_by_non_terminal[queue_entry.non_terminal]; + auto &follow_set_info = follow_set_info_by_non_terminal[queue_entry.non_terminal]; if (queue_entry.propagates_lookaheads) { - queue_entry_is_new = !cache_entry.second; - cache_entry.second = true; + queue_entry_is_new = !follow_set_info.propagates_lookaheads; + follow_set_info.propagates_lookaheads = true; } else { - queue_entry_is_new = cache_entry.first.insert_all(queue_entry.lookaheads); + queue_entry_is_new = follow_set_info.lookaheads.insert_all(queue_entry.lookaheads); } if (queue_entry_is_new) { @@ -165,7 +144,7 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar, propagates_lookaheads = false; } - non_terminal_queue_entry.push_back({ + non_terminal_queue.push_back({ next_symbol.index, next_lookaheads, propagates_lookaheads @@ -174,91 +153,135 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar, } } - for (auto &pair : cached_lookaheads_by_non_terminal) { - for (const Production &production : 
grammar.variables[pair.first].productions) { - Symbol lhs = Symbol::non_terminal(pair.first); - ParseItem item(lhs, production, 0); + // Use these follow sets to populate the cache of ParseItems for non-terminal `i`. + for (auto &pair : follow_set_info_by_non_terminal) { + Symbol non_terminal = Symbol::non_terminal(pair.first); + + for (const Production &production : grammar.variables[non_terminal.index].productions) { + ParseItem item(non_terminal, production, 0); if (grammar.variables_to_inline.count(item.next_symbol())) { - vector &inlined_productions = inlined_productions_by_original_production[item]; - if (inlined_productions.empty()) { - inlined_productions = inline_production(item, grammar); - } - - for (const Production &inlined_production : inlined_productions) { - ParseItemSetComponent component{ - ParseItem(lhs, inlined_production, 0), - pair.second.first, - pair.second.second - }; - - if (find(component_cache[i].begin(), component_cache[i].end(), component) == component_cache[i].end()) { - component_cache[i].push_back(component); - } - } - } else if (!grammar.variables_to_inline.count(lhs)) { - ParseItemSetComponent component{ - ParseItem(lhs, production, 0), - pair.second.first, - pair.second.second - }; - - if (find(component_cache[i].begin(), component_cache[i].end(), component) == component_cache[i].end()) { - component_cache[i].push_back(component); + for (const Production &inlined_production : inline_production(item)) { + find_or_push(transitive_closure_component_cache[i], { + ParseItem(non_terminal, inlined_production, 0), + pair.second.lookaheads, + pair.second.propagates_lookaheads + }); } + } else if (!grammar.variables_to_inline.count(non_terminal)) { + find_or_push(transitive_closure_component_cache[i], { + item, + pair.second.lookaheads, + pair.second.propagates_lookaheads + }); } } } } } +const vector &ParseItemSetBuilder::inline_production(const ParseItem &item) { + vector &result = inlined_productions_by_original_production[item]; + if 
(!result.empty()) return result; + + auto &inlined_step = item.production->at(item.step_index); + vector productions_to_insert; + for (auto &production : grammar.variables[inlined_step.symbol.index].productions) { + productions_to_insert.push_back(&production); + } + + for (auto iter = productions_to_insert.begin(); iter != productions_to_insert.end();) { + const Production *production = *iter; + + if (!production->empty() && grammar.variables_to_inline.count(production->steps.front().symbol)) { + iter = productions_to_insert.erase(iter); + for (auto &production : inline_production(ParseItem(inlined_step.symbol, *production, 0))) { + iter = productions_to_insert.insert(iter, &production); + } + } else { + ++iter; + } + } + + for (const Production *production_to_insert : productions_to_insert) { + auto begin = item.production->steps.begin(); + auto end = item.production->steps.end(); + auto step = begin + item.step_index; + + Production production{{begin, step}, item.production->dynamic_precedence}; + for (auto &step : *production_to_insert) { + production.steps.push_back(step); + if (!inlined_step.name_replacement.empty()) { + production.steps.back().name_replacement = inlined_step.name_replacement; + } + } + production.back().precedence = inlined_step.precedence; + production.back().associativity = inlined_step.associativity; + production.steps.insert( + production.steps.end(), + step + 1, + end + ); + + if (find(result.begin(), result.end(), production) == result.end()) { + result.push_back(move(production)); + } + } + + return result; +} + void ParseItemSetBuilder::apply_transitive_closure(ParseItemSet *item_set) { for (auto iter = item_set->entries.begin(), end = item_set->entries.end(); iter != end;) { const ParseItem &item = iter->first; const LookaheadSet &lookaheads = iter->second; - if (item.lhs() != rules::START() && item.step_index == 0) { - ++iter; - continue; - } - const Symbol &next_symbol = item.next_symbol(); - if (!next_symbol.is_non_terminal() || 
next_symbol.is_built_in()) { - ++iter; - continue; - } + // Items whose `step_index` is 0 are not part of the item set's "kernel"; they have been + // added in previous iterations of this loop, and they don't need to be further processed. + if (item.lhs() == rules::START() || item.step_index > 0) { - LookaheadSet next_lookaheads; - size_t next_step = item.step_index + 1; - if (next_step == item.production->size()) { - next_lookaheads = lookaheads; - } else { - Symbol symbol_after_next = item.production->at(next_step).symbol; - next_lookaheads = first_sets.find(symbol_after_next)->second; - } + // Kernel items whose next symbol is a non-terminal are expanded using the pre-computed + // parse item cache. + const Symbol &next_symbol = item.next_symbol(); + if (next_symbol.is_non_terminal() && !next_symbol.is_built_in()) { - for (const ParseItemSetComponent &component : component_cache[next_symbol.index]) { - LookaheadSet &current_lookaheads = item_set->entries[component.item]; - current_lookaheads.insert_all(component.lookaheads); - if (component.propagates_lookaheads) current_lookaheads.insert_all(next_lookaheads); - } + LookaheadSet next_lookaheads; + size_t next_step = item.step_index + 1; + if (next_step == item.production->size()) { + next_lookaheads = lookaheads; + } else { + Symbol symbol_after_next = item.production->at(next_step).symbol; + next_lookaheads = first_sets.find(symbol_after_next)->second; + } - if (grammar.variables_to_inline.count(next_symbol)) { - vector &inlined_productions = inlined_productions_by_original_production[item]; - if (inlined_productions.empty()) { - inlined_productions = inline_production(item, grammar); - } - - for (const Production &inlined_production : inlined_productions) { - item_set->entries.insert({ - ParseItem(item.lhs(), inlined_production, item.step_index), - lookaheads - }); + for (const auto &component : transitive_closure_component_cache[next_symbol.index]) { + LookaheadSet &current_lookaheads = item_set->entries[component.item]; 
+ current_lookaheads.insert_all(component.lookaheads); + if (component.propagates_lookaheads) { + current_lookaheads.insert_all(next_lookaheads); + } + } + + if (grammar.variables_to_inline.count(next_symbol)) { + for (const Production &inlined_production : inline_production(item)) { + item_set->entries.insert({ + ParseItem(item.lhs(), inlined_production, item.step_index), + lookaheads + }); + } + + iter = item_set->entries.erase(iter); + continue; + } } + } + if (grammar.variables_to_inline.count(item.lhs())) { iter = item_set->entries.erase(iter); - } else { - ++iter; + continue; } + + ++iter; } } diff --git a/src/compiler/build_tables/parse_item_set_builder.h b/src/compiler/build_tables/parse_item_set_builder.h index c180f38b..3a8347e8 100644 --- a/src/compiler/build_tables/parse_item_set_builder.h +++ b/src/compiler/build_tables/parse_item_set_builder.h @@ -18,19 +18,15 @@ class ParseItemSetBuilder { ParseItem item; LookaheadSet lookaheads; bool propagates_lookaheads; - - inline bool operator==(const ParseItemSetComponent &other) { - return item == other.item && - lookaheads == other.lookaheads && - propagates_lookaheads == other.propagates_lookaheads; - } + bool operator==(const ParseItemSetComponent &) const; }; const SyntaxGrammar &grammar; std::map first_sets; std::map last_sets; - std::map> component_cache; + std::map> transitive_closure_component_cache; std::map> inlined_productions_by_original_production; + const std::vector &inline_production(const ParseItem &); public: ParseItemSetBuilder(const SyntaxGrammar &, const LexicalGrammar &); diff --git a/src/compiler/parse_table.cc b/src/compiler/parse_table.cc index 35c3daba..1662815e 100644 --- a/src/compiler/parse_table.cc +++ b/src/compiler/parse_table.cc @@ -15,6 +15,7 @@ ParseAction::ParseAction() : production(nullptr), consumed_symbol_count(0), symbol(rules::NONE()), + dynamic_precedence(0), type(ParseActionTypeError), extra(false), fragile(false), @@ -72,6 +73,7 @@ bool ParseAction::operator==(const 
ParseAction &other) const { state_index == other.state_index && production == other.production && consumed_symbol_count == other.consumed_symbol_count && + dynamic_precedence == other.dynamic_precedence && rename_sequence_id == other.rename_sequence_id; } @@ -90,6 +92,8 @@ bool ParseAction::operator<(const ParseAction &other) const { if (other.production < production) return false; if (consumed_symbol_count < other.consumed_symbol_count) return true; if (other.consumed_symbol_count < consumed_symbol_count) return false; + if (dynamic_precedence < other.dynamic_precedence) return true; + if (other.dynamic_precedence < dynamic_precedence) return false; return rename_sequence_id < other.rename_sequence_id; } diff --git a/test/fixtures/test_grammars/nested_inlined_rules/corpus.txt b/test/fixtures/test_grammars/nested_inlined_rules/corpus.txt new file mode 100644 index 00000000..509f3899 --- /dev/null +++ b/test/fixtures/test_grammars/nested_inlined_rules/corpus.txt @@ -0,0 +1,12 @@ +================================== +Statements +================================== + +return 1; +return 2; + +--- + +(program + (return_statement (number)) + (return_statement (number))) diff --git a/test/fixtures/test_grammars/nested_inlined_rules/grammar.json b/test/fixtures/test_grammars/nested_inlined_rules/grammar.json new file mode 100644 index 00000000..f240de1f --- /dev/null +++ b/test/fixtures/test_grammars/nested_inlined_rules/grammar.json @@ -0,0 +1,54 @@ +{ + "name": "nested_inlined_rules", + + "extras": [ + {"type": "PATTERN", "value": "\\s"} + ], + + "inline": [ + "top_level_item", + "statement" + ], + + "rules": { + "program": { + "type": "REPEAT1", + "content": {"type": "SYMBOL", "name": "top_level_item"} + }, + + "top_level_item": { + "type": "CHOICE", + "members": [ + {"type": "SYMBOL", "name": "statement"}, + {"type": "STRING", "value": "!"} + ] + }, + + "statement": { + "type": "CHOICE", + "members": [ + {"type": "SYMBOL", "name": "expression_statement"}, + {"type": 
"SYMBOL", "name": "return_statement"} + ] + }, + + "return_statement": { + "type": "SEQ", + "members": [ + {"type": "STRING", "value": "return"}, + {"type": "SYMBOL", "name": "number"}, + {"type": "STRING", "value": ";"} + ] + }, + + "expression_statement": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "number"}, + {"type": "STRING", "value": ";"} + ] + }, + + "number": {"type": "PATTERN", "value": "\\d+"} + } +} diff --git a/test/fixtures/test_grammars/nested_inlined_rules/readme.md b/test/fixtures/test_grammars/nested_inlined_rules/readme.md new file mode 100644 index 00000000..6836100f --- /dev/null +++ b/test/fixtures/test_grammars/nested_inlined_rules/readme.md @@ -0,0 +1 @@ +This grammar demonstrates that you can have an inlined rule that contains another inlined rule.