Handle inlined rules that contain other inlined rules

This commit is contained in:
Max Brunsfeld 2017-07-20 15:28:55 -07:00
parent f33421c53e
commit 7d9d8bce79
6 changed files with 230 additions and 140 deletions

View file

@@ -21,61 +21,58 @@ using std::unordered_map;
using std::vector;
using rules::Symbol;
static vector<Production> inline_production(const ParseItem &item, const SyntaxGrammar &grammar) {
vector<Production> result;
auto &inlined_step = item.production->at(item.step_index);
auto &productions_to_insert = grammar.variables[inlined_step.symbol.index].productions;
for (const Production &production_to_insert : productions_to_insert) {
auto begin = item.production->steps.begin();
auto end = item.production->steps.end();
auto step = begin + item.step_index;
struct FollowSetInfo {
LookaheadSet lookaheads;
bool propagates_lookaheads;
};
Production production{{begin, step}, item.production->dynamic_precedence};
for (auto &step : production_to_insert) {
production.steps.push_back(step);
if (!inlined_step.name_replacement.empty()) {
production.steps.back().name_replacement = inlined_step.name_replacement;
}
}
production.back().precedence = inlined_step.precedence;
production.back().associativity = inlined_step.associativity;
production.steps.insert(
production.steps.end(),
step + 1,
end
);
struct NonTerminalQueueEntry {
Symbol::Index non_terminal;
LookaheadSet lookaheads;
bool propagates_lookaheads;
};
if (find(result.begin(), result.end(), production) == result.end()) {
result.push_back(move(production));
}
}
return result;
// Two components are equal only when all three fields agree. Guard clauses
// keep the comparison short-circuiting in member declaration order.
bool ParseItemSetBuilder::ParseItemSetComponent::operator==(
  const ParseItemSetBuilder::ParseItemSetComponent &other) const {
  if (!(item == other.item)) return false;
  if (!(lookaheads == other.lookaheads)) return false;
  return propagates_lookaheads == other.propagates_lookaheads;
}
ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar,
const LexicalGrammar &lexical_grammar) : grammar{grammar} {
vector<Symbol> symbols_to_process;
set<Symbol::Index> processed_non_terminals;
// Append `item` to `items` unless an equal element is already present,
// preserving first-insertion order (i.e. treat the vector as an ordered set).
template <typename T>
inline void find_or_push(vector<T> &items, const T &item) {
  auto existing = std::find(items.begin(), items.end(), item);
  if (existing == items.end()) {
    items.push_back(item);
  }
}
ParseItemSetBuilder::ParseItemSetBuilder(
const SyntaxGrammar &grammar,
const LexicalGrammar &lexical_grammar
) : grammar{grammar} {
// Populate the FIRST and LAST set of each terminal, which just contains the terminal itself.
for (size_t i = 0, n = lexical_grammar.variables.size(); i < n; i++) {
Symbol symbol = Symbol::terminal(i);
first_sets.insert({symbol, LookaheadSet({symbol})});
last_sets.insert({symbol, LookaheadSet({symbol})});
}
for (size_t i = 0, n = grammar.external_tokens.size(); i < n; i++) {
Symbol symbol = Symbol::external(i);
first_sets.insert({symbol, LookaheadSet({symbol})});
last_sets.insert({symbol, LookaheadSet({symbol})});
}
// Populate the FIRST and LAST set of each non-terminal by recursively expanding non-terminals.
vector<Symbol> symbols_to_process;
set<Symbol::Index> processed_non_terminals;
for (size_t i = 0, n = grammar.variables.size(); i < n; i++) {
Symbol symbol = Symbol::non_terminal(i);
LookaheadSet &first_set = first_sets[symbol];
LookaheadSet &last_set = last_sets[symbol];
LookaheadSet first_set;
processed_non_terminals.clear();
symbols_to_process.clear();
symbols_to_process.push_back(symbol);
symbols_to_process.assign({symbol});
while (!symbols_to_process.empty()) {
Symbol current_symbol = symbols_to_process.back();
symbols_to_process.pop_back();
@@ -91,12 +88,8 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar,
}
}
first_sets.insert({symbol, first_set});
LookaheadSet last_set;
processed_non_terminals.clear();
symbols_to_process.clear();
symbols_to_process.push_back(symbol);
symbols_to_process.assign({symbol});
while (!symbols_to_process.empty()) {
Symbol current_symbol = symbols_to_process.back();
symbols_to_process.pop_back();
@@ -111,41 +104,27 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar,
}
}
}
last_sets.insert({symbol, last_set});
}
struct NonTerminalQueueEntry {
Symbol::Index non_terminal;
LookaheadSet lookaheads;
bool propagates_lookaheads;
};
// Populate a cache of which ParseItems will be created when a given non-terminal is expanded.
vector<NonTerminalQueueEntry> non_terminal_queue;
for (Symbol::Index i = 0, n = grammar.variables.size(); i < n; i++) {
vector<NonTerminalQueueEntry> non_terminal_queue_entry;
for (size_t i = 0, n = grammar.variables.size(); i < n; i++) {
Symbol symbol = Symbol::non_terminal(i);
unordered_map<Symbol::Index, pair<LookaheadSet, bool>> cached_lookaheads_by_non_terminal;
non_terminal_queue_entry.clear();
non_terminal_queue_entry.push_back({
symbol.index,
LookaheadSet(),
true
});
while (!non_terminal_queue_entry.empty()) {
NonTerminalQueueEntry queue_entry = non_terminal_queue_entry.back();
non_terminal_queue_entry.pop_back();
// Compute the follow set of each *other* non-terminal that the current non-terminal can
// start with.
unordered_map<Symbol::Index, FollowSetInfo> follow_set_info_by_non_terminal;
non_terminal_queue.assign({{i, LookaheadSet(), true}});
while (!non_terminal_queue.empty()) {
NonTerminalQueueEntry queue_entry = non_terminal_queue.back();
non_terminal_queue.pop_back();
bool queue_entry_is_new;
auto &cache_entry = cached_lookaheads_by_non_terminal[queue_entry.non_terminal];
auto &follow_set_info = follow_set_info_by_non_terminal[queue_entry.non_terminal];
if (queue_entry.propagates_lookaheads) {
queue_entry_is_new = !cache_entry.second;
cache_entry.second = true;
queue_entry_is_new = !follow_set_info.propagates_lookaheads;
follow_set_info.propagates_lookaheads = true;
} else {
queue_entry_is_new = cache_entry.first.insert_all(queue_entry.lookaheads);
queue_entry_is_new = follow_set_info.lookaheads.insert_all(queue_entry.lookaheads);
}
if (queue_entry_is_new) {
@@ -165,7 +144,7 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar,
propagates_lookaheads = false;
}
non_terminal_queue_entry.push_back({
non_terminal_queue.push_back({
next_symbol.index,
next_lookaheads,
propagates_lookaheads
@@ -174,91 +153,135 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar,
}
}
for (auto &pair : cached_lookaheads_by_non_terminal) {
for (const Production &production : grammar.variables[pair.first].productions) {
Symbol lhs = Symbol::non_terminal(pair.first);
ParseItem item(lhs, production, 0);
// Use these follow sets to populate the cache of ParseItems for non-terminal `i`.
for (auto &pair : follow_set_info_by_non_terminal) {
Symbol non_terminal = Symbol::non_terminal(pair.first);
for (const Production &production : grammar.variables[non_terminal.index].productions) {
ParseItem item(non_terminal, production, 0);
if (grammar.variables_to_inline.count(item.next_symbol())) {
vector<Production> &inlined_productions = inlined_productions_by_original_production[item];
if (inlined_productions.empty()) {
inlined_productions = inline_production(item, grammar);
}
for (const Production &inlined_production : inlined_productions) {
ParseItemSetComponent component{
ParseItem(lhs, inlined_production, 0),
pair.second.first,
pair.second.second
};
if (find(component_cache[i].begin(), component_cache[i].end(), component) == component_cache[i].end()) {
component_cache[i].push_back(component);
}
}
} else if (!grammar.variables_to_inline.count(lhs)) {
ParseItemSetComponent component{
ParseItem(lhs, production, 0),
pair.second.first,
pair.second.second
};
if (find(component_cache[i].begin(), component_cache[i].end(), component) == component_cache[i].end()) {
component_cache[i].push_back(component);
for (const Production &inlined_production : inline_production(item)) {
find_or_push(transitive_closure_component_cache[i], {
ParseItem(non_terminal, inlined_production, 0),
pair.second.lookaheads,
pair.second.propagates_lookaheads
});
}
} else if (!grammar.variables_to_inline.count(non_terminal)) {
find_or_push(transitive_closure_component_cache[i], {
item,
pair.second.lookaheads,
pair.second.propagates_lookaheads
});
}
}
}
}
}
// Compute the productions obtained by inlining the non-terminal at `item`'s
// current step into `item`'s production. Handles *nested* inlining: if an
// inlined variable's production itself begins with another inlinable
// variable, that production is recursively flattened first. Results are
// memoized per ParseItem in `inlined_productions_by_original_production`.
const vector<Production> &ParseItemSetBuilder::inline_production(const ParseItem &item) {
// Memoization: a non-empty cached entry means this item was already expanded.
// NOTE: the map is a std::map, so this reference stays valid across the
// recursive calls below, which may insert other entries.
vector<Production> &result = inlined_productions_by_original_production[item];
if (!result.empty()) return result;
// The step being replaced; its symbol names the variable to inline.
auto &inlined_step = item.production->at(item.step_index);
// Collect pointers to the candidate replacement productions.
vector<const Production *> productions_to_insert;
for (auto &production : grammar.variables[inlined_step.symbol.index].productions) {
productions_to_insert.push_back(&production);
}
// Flatten nested inlines: replace any candidate whose *first* step is itself
// an inlinable variable with its own (recursively inlined) expansions.
// `erase`/`insert` both return a valid iterator, so the loop re-examines the
// newly inserted productions in place. The recursion terminates because the
// recursive results never start with an inlinable symbol.
for (auto iter = productions_to_insert.begin(); iter != productions_to_insert.end();) {
const Production *production = *iter;
if (!production->empty() && grammar.variables_to_inline.count(production->steps.front().symbol)) {
iter = productions_to_insert.erase(iter);
// The loop variable `production` here shadows the outer pointer; the range
// expression is evaluated before the new name is in scope, so `*production`
// in it still refers to the outer pointer (standard range-for semantics).
// The stored addresses point into the memoized map entries, which remain
// stable for this builder's lifetime.
for (auto &production : inline_production(ParseItem(inlined_step.symbol, *production, 0))) {
iter = productions_to_insert.insert(iter, &production);
}
} else {
++iter;
}
}
// Splice each replacement into a copy of the original production:
// steps before the inlined step + replacement steps + steps after it.
for (const Production *production_to_insert : productions_to_insert) {
auto begin = item.production->steps.begin();
auto end = item.production->steps.end();
auto step = begin + item.step_index;
Production production{{begin, step}, item.production->dynamic_precedence};
for (auto &step : *production_to_insert) {
production.steps.push_back(step);
// Propagate the inlined step's rename onto every spliced-in step,
// presumably so named captures survive inlining — confirm against callers.
if (!inlined_step.name_replacement.empty()) {
production.steps.back().name_replacement = inlined_step.name_replacement;
}
}
// The inlined step's precedence/associativity attach to the *last* spliced
// step, mirroring how the original step would have bound them.
production.back().precedence = inlined_step.precedence;
production.back().associativity = inlined_step.associativity;
production.steps.insert(
production.steps.end(),
step + 1,
end
);
// Deduplicate: distinct replacement productions can flatten identically.
if (find(result.begin(), result.end(), production) == result.end()) {
result.push_back(move(production));
}
}
return result;
}
void ParseItemSetBuilder::apply_transitive_closure(ParseItemSet *item_set) {
for (auto iter = item_set->entries.begin(), end = item_set->entries.end(); iter != end;) {
const ParseItem &item = iter->first;
const LookaheadSet &lookaheads = iter->second;
if (item.lhs() != rules::START() && item.step_index == 0) {
++iter;
continue;
}
const Symbol &next_symbol = item.next_symbol();
if (!next_symbol.is_non_terminal() || next_symbol.is_built_in()) {
++iter;
continue;
}
// Items whose `step_index` is 0 are not part of the item set's "kernel"; they have been
// added in previous iterations of this loop, and they don't need to be further processed.
if (item.lhs() == rules::START() || item.step_index > 0) {
LookaheadSet next_lookaheads;
size_t next_step = item.step_index + 1;
if (next_step == item.production->size()) {
next_lookaheads = lookaheads;
} else {
Symbol symbol_after_next = item.production->at(next_step).symbol;
next_lookaheads = first_sets.find(symbol_after_next)->second;
}
// Kernel items whose next symbol is a non-terminal are expanded using the pre-computed
// parse item cache.
const Symbol &next_symbol = item.next_symbol();
if (next_symbol.is_non_terminal() && !next_symbol.is_built_in()) {
for (const ParseItemSetComponent &component : component_cache[next_symbol.index]) {
LookaheadSet &current_lookaheads = item_set->entries[component.item];
current_lookaheads.insert_all(component.lookaheads);
if (component.propagates_lookaheads) current_lookaheads.insert_all(next_lookaheads);
}
LookaheadSet next_lookaheads;
size_t next_step = item.step_index + 1;
if (next_step == item.production->size()) {
next_lookaheads = lookaheads;
} else {
Symbol symbol_after_next = item.production->at(next_step).symbol;
next_lookaheads = first_sets.find(symbol_after_next)->second;
}
if (grammar.variables_to_inline.count(next_symbol)) {
vector<Production> &inlined_productions = inlined_productions_by_original_production[item];
if (inlined_productions.empty()) {
inlined_productions = inline_production(item, grammar);
}
for (const Production &inlined_production : inlined_productions) {
item_set->entries.insert({
ParseItem(item.lhs(), inlined_production, item.step_index),
lookaheads
});
for (const auto &component : transitive_closure_component_cache[next_symbol.index]) {
LookaheadSet &current_lookaheads = item_set->entries[component.item];
current_lookaheads.insert_all(component.lookaheads);
if (component.propagates_lookaheads) {
current_lookaheads.insert_all(next_lookaheads);
}
}
if (grammar.variables_to_inline.count(next_symbol)) {
for (const Production &inlined_production : inline_production(item)) {
item_set->entries.insert({
ParseItem(item.lhs(), inlined_production, item.step_index),
lookaheads
});
}
iter = item_set->entries.erase(iter);
continue;
}
}
}
if (grammar.variables_to_inline.count(item.lhs())) {
iter = item_set->entries.erase(iter);
} else {
++iter;
continue;
}
++iter;
}
}

View file

@@ -18,19 +18,15 @@ class ParseItemSetBuilder {
ParseItem item;
LookaheadSet lookaheads;
bool propagates_lookaheads;
inline bool operator==(const ParseItemSetComponent &other) {
return item == other.item &&
lookaheads == other.lookaheads &&
propagates_lookaheads == other.propagates_lookaheads;
}
bool operator==(const ParseItemSetComponent &) const;
};
const SyntaxGrammar &grammar;
std::map<rules::Symbol, LookaheadSet> first_sets;
std::map<rules::Symbol, LookaheadSet> last_sets;
std::map<rules::Symbol::Index, std::vector<ParseItemSetComponent>> component_cache;
std::map<rules::Symbol::Index, std::vector<ParseItemSetComponent>> transitive_closure_component_cache;
std::map<ParseItem, std::vector<Production>> inlined_productions_by_original_production;
const std::vector<Production> &inline_production(const ParseItem &);
public:
ParseItemSetBuilder(const SyntaxGrammar &, const LexicalGrammar &);

View file

@@ -15,6 +15,7 @@ ParseAction::ParseAction()
: production(nullptr),
consumed_symbol_count(0),
symbol(rules::NONE()),
dynamic_precedence(0),
type(ParseActionTypeError),
extra(false),
fragile(false),
@@ -72,6 +73,7 @@ bool ParseAction::operator==(const ParseAction &other) const {
state_index == other.state_index &&
production == other.production &&
consumed_symbol_count == other.consumed_symbol_count &&
dynamic_precedence == other.dynamic_precedence &&
rename_sequence_id == other.rename_sequence_id;
}
@@ -90,6 +92,8 @@ bool ParseAction::operator<(const ParseAction &other) const {
if (other.production < production) return false;
if (consumed_symbol_count < other.consumed_symbol_count) return true;
if (other.consumed_symbol_count < consumed_symbol_count) return false;
if (dynamic_precedence < other.dynamic_precedence) return true;
if (other.dynamic_precedence < dynamic_precedence) return false;
return rename_sequence_id < other.rename_sequence_id;
}

View file

@@ -0,0 +1,12 @@
==================================
Statements
==================================
return 1;
return 2;
---
(program
(return_statement (number))
(return_statement (number)))

View file

@@ -0,0 +1,54 @@
{
"name": "nested_inlined_rules",
"extras": [
{"type": "PATTERN", "value": "\\s"}
],
"inline": [
"top_level_item",
"statement"
],
"rules": {
"program": {
"type": "REPEAT1",
"content": {"type": "SYMBOL", "name": "top_level_item"}
},
"top_level_item": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "statement"},
{"type": "STRING", "value": "!"}
]
},
"statement": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "expression_statement"},
{"type": "SYMBOL", "name": "return_statement"}
]
},
"return_statement": {
"type": "SEQ",
"members": [
{"type": "STRING", "value": "return"},
{"type": "SYMBOL", "name": "number"},
{"type": "STRING", "value": ";"}
]
},
"expression_statement": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "number"},
{"type": "STRING", "value": ";"}
]
},
"number": {"type": "PATTERN", "value": "\\d+"}
}
}

View file

@@ -0,0 +1 @@
This grammar demonstrates that you can have an inlined rule that contains another inlined rule.