Restructure parse state merging logic

* Remove remnants of templatized remove_duplicate_states function * Rename recovery_tokens function to get_compatible_tokens and augment it to also compute pairs of tokens that could potentially be incompatible
2017-02-26 12:23:35 -08:00 · 2017-02-26 12:23:35 -08:00 · 3c8e6f9987
commit 3c8e6f9987
parent 8d3b72e1d9
13 changed files with 274 additions and 252 deletions
--- a/project.gyp
+++ b/project.gyp
@ -14,7 +14,7 @@
        'src/compiler/build_tables/build_lex_table.cc',
        'src/compiler/build_tables/build_parse_table.cc',
        'src/compiler/build_tables/build_tables.cc',
-        'src/compiler/build_tables/recovery_tokens.cc',
+        'src/compiler/build_tables/compatible_tokens.cc',
        'src/compiler/build_tables/lex_item.cc',
        'src/compiler/build_tables/lex_item_transitions.cc',
        'src/compiler/build_tables/lex_conflict_manager.cc',
--- a/spec/compiler/build_tables/distinctive_tokens_spec.cc
+++ b/spec/compiler/build_tables/distinctive_tokens_spec.cc
@ -1,6 +1,6 @@
 #include "spec_helper.h"
 #include "compiler/rules/character_set.h"
-#include "compiler/build_tables/recovery_tokens.h"
+#include "compiler/build_tables/compatible_tokens.h"
 #include "compiler/lexical_grammar.h"
 #include "helpers/rule_helpers.h"
 #include "helpers/stream_methods.h"
@ -27,7 +27,7 @@ describe("recovery_tokens(rule)", []() {
      })),
    };

-    AssertThat(recovery_tokens(grammar), Equals<set<Symbol>>({ Symbol(1, Symbol::Terminal) }));
+    AssertThat(get_compatible_tokens(grammar).recovery_tokens, Equals<set<Symbol>>({ Symbol(1, Symbol::Terminal) }));
  });
 });

--- a/src/compiler/build_tables/build_lex_table.cc
+++ b/src/compiler/build_tables/build_lex_table.cc
@ -7,7 +7,6 @@
 #include <utility>
 #include <vector>
 #include "compiler/build_tables/lex_conflict_manager.h"
-#include "compiler/build_tables/remove_duplicate_states.h"
 #include "compiler/build_tables/lex_item.h"
 #include "compiler/parse_table.h"
 #include "compiler/lexical_grammar.h"
@ -143,13 +142,64 @@ class LexTableBuilder {
      state.accept_action.precedence = 0;
    }

-    auto replacements =
-      remove_duplicate_states<LexTable>(&lex_table);
+    map<LexStateId, LexStateId> replacements;
+
+    while (true) {
+      map<LexStateId, LexStateId> duplicates;
+      for (LexStateId i = 0, size = lex_table.states.size(); i < size; i++) {
+        for (LexStateId j = 0; j < i; j++) {
+          if (!duplicates.count(j) && lex_table.states[j] == lex_table.states[i]) {
+            duplicates.insert({ i, j });
+            break;
+          }
+        }
+      }
+
+      if (duplicates.empty()) break;
+
+      map<size_t, size_t> new_replacements;
+      for (LexStateId i = 0, size = lex_table.states.size(); i < size; i++) {
+        LexStateId new_state_index = i;
+        auto duplicate = duplicates.find(i);
+        if (duplicate != duplicates.end()) {
+          new_state_index = duplicate->second;
+        }
+
+        size_t prior_removed = 0;
+        for (const auto &duplicate : duplicates) {
+          if (duplicate.first >= new_state_index) break;
+          prior_removed++;
+        }
+
+        new_state_index -= prior_removed;
+        new_replacements.insert({ i, new_state_index });
+        replacements.insert({ i, new_state_index });
+        for (auto &replacement : replacements) {
+          if (replacement.second == i) {
+            replacement.second = new_state_index;
+          }
+        }
+      }
+
+      for (auto &state : lex_table.states) {
+        for (auto &entry : state.advance_actions) {
+          auto new_replacement = new_replacements.find(entry.second.state_index);
+          if (new_replacement != new_replacements.end()) {
+            entry.second.state_index = new_replacement->second;
+          }
+        }
+      }
+
+      for (auto i = duplicates.rbegin(); i != duplicates.rend(); ++i) {
+        lex_table.states.erase(lex_table.states.begin() + i->first);
+      }
+    }

    for (ParseState &parse_state : parse_table->states) {
      auto replacement = replacements.find(parse_state.lex_state_id);
-      if (replacement != replacements.end())
+      if (replacement != replacements.end()) {
        parse_state.lex_state_id = replacement->second;
+      }
    }
  }

--- a/src/compiler/build_tables/build_parse_table.cc
+++ b/src/compiler/build_tables/build_parse_table.cc
@ -6,14 +6,13 @@
 #include <unordered_map>
 #include <utility>
 #include "compiler/parse_table.h"
-#include "compiler/build_tables/remove_duplicate_states.h"
 #include "compiler/build_tables/parse_item.h"
 #include "compiler/build_tables/parse_item_set_builder.h"
 #include "compiler/lexical_grammar.h"
 #include "compiler/syntax_grammar.h"
 #include "compiler/rules/symbol.h"
 #include "compiler/rules/built_in_symbols.h"
-#include "compiler/build_tables/recovery_tokens.h"
+#include "compiler/build_tables/compatible_tokens.h"

 namespace tree_sitter {
 namespace build_tables {
@ -41,6 +40,7 @@ class ParseTableBuilder {
  set<string> conflicts;
  ParseItemSetBuilder item_set_builder;
  set<const Production *> fragile_productions;
+  CompatibleTokensResult compatible_tokens;
  bool allow_any_conflict;

 public:
@ -49,6 +49,7 @@ class ParseTableBuilder {
      : grammar(grammar),
        lexical_grammar(lex_grammar),
        item_set_builder(grammar, lex_grammar),
+        compatible_tokens(get_compatible_tokens(lex_grammar)),
        allow_any_conflict(false) {}

  pair<ParseTable, CompileError> build() {
@ -74,7 +75,7 @@ class ParseTableBuilder {
    if (error.type != TSCompileErrorTypeNone)
      return { parse_table, error };

-    parse_table.mergeable_symbols = recovery_tokens(lexical_grammar);
+    parse_table.mergeable_symbols = compatible_tokens.recovery_tokens;

    build_error_parse_state();

@ -302,7 +303,7 @@ class ParseTableBuilder {
    set<ParseStateId> deleted_states;

    while (true) {
-      std::map<ParseStateId, ParseStateId> state_replacements;
+      map<ParseStateId, ParseStateId> state_replacements;

      for (auto &pair : state_indices_by_signature) {
        auto &state_group = pair.second;
@ -310,7 +311,7 @@ class ParseTableBuilder {
        for (ParseStateId i : state_group) {
          for (ParseStateId j : state_group) {
            if (j == i) break;
-            if (!state_replacements.count(j) && parse_table.merge_state(j, i)) {
+            if (!state_replacements.count(j) && merge_parse_state(j, i)) {
              state_replacements.insert({ i, j });
              deleted_states.insert(i);
              break;
@ -364,6 +365,60 @@ class ParseTableBuilder {
    }
  }

+  static bool has_entry(const ParseState &state, const ParseTableEntry &entry) {
+    for (const auto &pair : state.terminal_entries)
+      if (pair.second == entry)
+        return true;
+    return false;
+  }
+
+  bool merge_parse_state(size_t i, size_t j) {
+    ParseState &state = parse_table.states[i];
+    ParseState &other = parse_table.states[j];
+
+    if (state.nonterminal_entries != other.nonterminal_entries)
+      return false;
+
+    for (auto &entry : state.terminal_entries) {
+      Symbol lookahead = entry.first;
+      const vector<ParseAction> &actions = entry.second.actions;
+
+      const auto &other_entry = other.terminal_entries.find(lookahead);
+      if (other_entry == other.terminal_entries.end()) {
+        if (compatible_tokens.recovery_tokens.count(lookahead) == 0 && !lookahead.is_built_in())
+          return false;
+        if (actions.back().type != ParseActionTypeReduce)
+          return false;
+        if (!has_entry(other, entry.second))
+          return false;
+      } else if (entry.second != other_entry->second) {
+        return false;
+      }
+    }
+
+    set<Symbol> symbols_to_merge;
+
+    for (auto &entry : other.terminal_entries) {
+      Symbol lookahead = entry.first;
+      const vector<ParseAction> &actions = entry.second.actions;
+
+      if (!state.terminal_entries.count(lookahead)) {
+        if (compatible_tokens.recovery_tokens.count(lookahead) == 0 && !lookahead.is_built_in())
+          return false;
+        if (actions.back().type != ParseActionTypeReduce)
+          return false;
+        if (!has_entry(state, entry.second))
+          return false;
+        symbols_to_merge.insert(lookahead);
+      }
+    }
+
+    for (const Symbol &lookahead : symbols_to_merge)
+      state.terminal_entries[lookahead] = other.terminal_entries.find(lookahead)->second;
+
+    return true;
+  }
+
  string handle_conflict(const ParseItemSet &item_set, ParseStateId state_id,
                         Symbol lookahead) {
    ParseTableEntry &entry = parse_table.states[state_id].terminal_entries[lookahead];
--- a/src/compiler/build_tables/compatible_tokens.cc
+++ b/src/compiler/build_tables/compatible_tokens.cc
@ -0,0 +1,132 @@
+#include "compiler/build_tables/compatible_tokens.h"
+#include "compiler/lexical_grammar.h"
+#include "compiler/rules/choice.h"
+#include "compiler/rules/character_set.h"
+#include "compiler/rules/repeat.h"
+#include "compiler/rules/visitor.h"
+#include "compiler/rules/seq.h"
+#include "compiler/rules/metadata.h"
+
+namespace tree_sitter {
+namespace build_tables {
+
+using rules::Symbol;
+using std::set;
+
+template <bool left, bool right>
+class CharacterAggregator : public rules::RuleFn<void> {
+  void apply_to(const rules::Seq *rule) {
+    if (left) apply(rule->left);
+    if (right) apply(rule->right);
+  }
+
+  void apply_to(const rules::Choice *rule) {
+    for (const rule_ptr &element : rule->elements) {
+      apply(element);
+    }
+  }
+
+  void apply_to(const rules::Repeat *rule) {
+    apply(rule->content);
+  }
+
+  void apply_to(const rules::Metadata *rule) {
+    apply(rule->rule);
+  }
+
+  void apply_to(const rules::CharacterSet *rule) {
+    result.add_set(*rule);
+  }
+
+ public:
+  rules::CharacterSet result;
+};
+
+template <bool left, bool right>
+class CharacterIntersector : public rules::RuleFn<bool> {
+  bool apply_to(const rules::Seq *rule) {
+    bool result = false;
+    if (left) result = apply(rule->left);
+    if (right && !result) result = apply(rule->right);
+    return result;
+  }
+
+  bool apply_to(const rules::Choice *rule) {
+    for (const rule_ptr &element : rule->elements) {
+      if (apply(element)) return true;
+    }
+    return false;
+  }
+
+  bool apply_to(const rules::Repeat *rule) {
+    return apply(rule->content);
+  }
+
+  bool apply_to(const rules::Metadata *rule) {
+    return apply(rule->rule);
+  }
+
+  bool apply_to(const rules::CharacterSet *rule) {
+    return character_set->intersects(*rule);
+  }
+
+ public:
+  rules::CharacterSet *character_set;
+
+  CharacterIntersector(rules::CharacterSet *set) : character_set {set} {}
+};
+
+using FirstCharacters = CharacterAggregator<true, false>;
+using LastCharacters = CharacterAggregator<false, true>;
+using AllCharacters = CharacterAggregator<true, true>;
+using FirstCharactersIntersector = CharacterIntersector<true, false>;
+
+CompatibleTokensResult get_compatible_tokens(const LexicalGrammar &grammar) {
+  CompatibleTokensResult result;
+
+  AllCharacters all_separator_characters;
+  for (const rule_ptr &separator : grammar.separators)
+    all_separator_characters.apply(separator);
+
+  for (size_t i = 0; i < grammar.variables.size(); i++) {
+    Symbol symbol(i, Symbol::Terminal);
+    rule_ptr rule = grammar.variables[i].rule;
+
+    FirstCharacters first_characters;
+    first_characters.apply(rule);
+
+    LastCharacters last_characters;
+    last_characters.apply(rule);
+
+    AllCharacters all_characters;
+    all_characters.apply(rule);
+
+    bool has_distinct_start =
+      !first_characters.result.includes_all &&
+      !first_characters.result.intersects(all_separator_characters.result);
+
+    bool has_distinct_end =
+      !last_characters.result.includes_all &&
+      !last_characters.result.intersects(all_separator_characters.result);
+
+    bool has_no_separators =
+      !all_characters.result.intersects(all_separator_characters.result);
+
+    if ((has_distinct_start && has_distinct_end) || has_no_separators)
+      result.recovery_tokens.insert(symbol);
+
+    for (size_t j = 0; j < grammar.variables.size(); j++) {
+      if (j == i) continue;
+      Symbol other_symbol(j, Symbol::Terminal);
+      FirstCharactersIntersector intersector(&first_characters.result);
+      if (intersector.apply(grammar.variables[j].rule)) {
+        result.unmergeable_pairs[symbol].insert(other_symbol);
+      }
+    }
+  }
+
+  return result;
+}
+
+}  // namespace build_tables
+}  // namespace tree_sitter
--- a/src/compiler/build_tables/compatible_tokens.h
+++ b/src/compiler/build_tables/compatible_tokens.h
@ -0,0 +1,25 @@
+#ifndef COMPILER_BUILD_TABLES_COMPATIBLE_TOKENS_H_
+#define COMPILER_BUILD_TABLES_COMPATIBLE_TOKENS_H_
+
+#include "compiler/rule.h"
+#include "compiler/rules/symbol.h"
+#include <map>
+#include <set>
+
+namespace tree_sitter {
+
+struct LexicalGrammar;
+
+namespace build_tables {
+
+struct CompatibleTokensResult {
+  std::set<rules::Symbol> recovery_tokens;
+  std::map<rules::Symbol, std::set<rules::Symbol>> unmergeable_pairs;
+};
+
+CompatibleTokensResult get_compatible_tokens(const LexicalGrammar &);
+
+}  // namespace build_tables
+}  // namespace tree_sitter
+
+#endif  // COMPILER_BUILD_TABLES_COMPATIBLE_TOKENS_H_
--- a/src/compiler/build_tables/recovery_tokens.cc
+++ b/src/compiler/build_tables/recovery_tokens.cc
@ -1,89 +0,0 @@
-#include "compiler/build_tables/recovery_tokens.h"
-#include "compiler/lexical_grammar.h"
-#include "compiler/rules/choice.h"
-#include "compiler/rules/character_set.h"
-#include "compiler/rules/repeat.h"
-#include "compiler/rules/visitor.h"
-#include "compiler/rules/seq.h"
-#include "compiler/rules/metadata.h"
-
-namespace tree_sitter {
-namespace build_tables {
-
-using rules::Symbol;
-using std::set;
-
-template <bool left, bool right>
-class CharacterAggregator : public rules::RuleFn<void> {
-  void apply_to(const rules::Seq *rule) {
-    if (left)
-      apply(rule->left);
-    if (right)
-      apply(rule->right);
-  }
-
-  void apply_to(const rules::Choice *rule) {
-    for (const rule_ptr &element : rule->elements)
-      apply(element);
-  }
-
-  void apply_to(const rules::Repeat *rule) {
-    apply(rule->content);
-  }
-
-  void apply_to(const rules::Metadata *rule) {
-    apply(rule->rule);
-  }
-
-  void apply_to(const rules::CharacterSet *rule) {
-    result.add_set(*rule);
-  }
-
- public:
-  rules::CharacterSet result;
-};
-
-class FirstCharacters : public CharacterAggregator<true, false> {};
-class LastCharacters : public CharacterAggregator<false, true> {};
-class AllCharacters : public CharacterAggregator<true, true> {};
-
-set<Symbol> recovery_tokens(const LexicalGrammar &grammar) {
-  set<Symbol> result;
-
-  AllCharacters all_separator_characters;
-  for (const rule_ptr &separator : grammar.separators)
-    all_separator_characters.apply(separator);
-
-  for (size_t i = 0; i < grammar.variables.size(); i++) {
-    const Variable &variable = grammar.variables[i];
-    rule_ptr rule = variable.rule;
-
-    FirstCharacters first_characters;
-    first_characters.apply(variable.rule);
-
-    LastCharacters last_characters;
-    last_characters.apply(variable.rule);
-
-    AllCharacters all_characters;
-    all_characters.apply(variable.rule);
-
-    bool has_distinct_start =
-      !first_characters.result.includes_all &&
-      !first_characters.result.intersects(all_separator_characters.result);
-
-    bool has_distinct_end =
-      !last_characters.result.includes_all &&
-      !last_characters.result.intersects(all_separator_characters.result);
-
-    bool has_no_separators =
-      !all_characters.result.intersects(all_separator_characters.result);
-
-    if ((has_distinct_start && has_distinct_end) || has_no_separators)
-      result.insert(Symbol(i, Symbol::Terminal));
-  }
-
-  return result;
-}
-
-}  // namespace build_tables
-}  // namespace tree_sitter
--- a/src/compiler/build_tables/recovery_tokens.h
+++ b/src/compiler/build_tables/recovery_tokens.h
@ -1,19 +0,0 @@
-#ifndef COMPILER_BUILD_TABLES_DOES_MATCH_ANY_LINE_H_
-#define COMPILER_BUILD_TABLES_DOES_MATCH_ANY_LINE_H_
-
-#include "compiler/rule.h"
-#include "compiler/rules/symbol.h"
-#include <set>
-
-namespace tree_sitter {
-
-struct LexicalGrammar;
-
-namespace build_tables {
-
-std::set<rules::Symbol> recovery_tokens(const LexicalGrammar &);
-
-}  // namespace build_tables
-}  // namespace tree_sitter
-
-#endif  // COMPILER_BUILD_TABLES_DOES_MATCH_ANY_LINE_H_
--- a/src/compiler/build_tables/remove_duplicate_states.h
+++ b/src/compiler/build_tables/remove_duplicate_states.h
@ -1,65 +0,0 @@
-#ifndef COMPILER_BUILD_TABLES_REMOVE_DUPLICATE_STATES_H_
-#define COMPILER_BUILD_TABLES_REMOVE_DUPLICATE_STATES_H_
-
-#include <map>
-#include <vector>
-
-namespace tree_sitter {
-namespace build_tables {
-
-template <typename TableType>
-std::map<size_t, size_t> remove_duplicate_states(TableType *table) {
-  std::map<size_t, size_t> replacements;
-
-  while (true) {
-    std::map<size_t, size_t> duplicates;
-    for (size_t i = 0, size = table->states.size(); i < size; i++)
-      for (size_t j = 0; j < i; j++)
-        if (!duplicates.count(j) && table->merge_state(j, i)) {
-          duplicates.insert({ i, j });
-          break;
-        }
-
-    if (duplicates.empty())
-      break;
-
-    std::map<size_t, size_t> new_replacements;
-    for (size_t i = 0, size = table->states.size(); i < size; i++) {
-      size_t new_state_index = i;
-      auto duplicate = duplicates.find(i);
-      if (duplicate != duplicates.end())
-        new_state_index = duplicate->second;
-
-      size_t prior_removed = 0;
-      for (const auto &duplicate : duplicates) {
-        if (duplicate.first >= new_state_index)
-          break;
-        prior_removed++;
-      }
-
-      new_state_index -= prior_removed;
-      new_replacements.insert({ i, new_state_index });
-      replacements.insert({ i, new_state_index });
-      for (auto &replacement : replacements)
-        if (replacement.second == i)
-          replacement.second = new_state_index;
-    }
-
-    for (auto &state : table->states)
-      state.each_referenced_state([&new_replacements](int64_t *state_index) {
-        auto new_replacement = new_replacements.find(*state_index);
-        if (new_replacement != new_replacements.end())
-          *state_index = new_replacement->second;
-      });
-
-    for (auto i = duplicates.rbegin(); i != duplicates.rend(); ++i)
-      table->states.erase(table->states.begin() + i->first);
-  }
-
-  return replacements;
-}
-
-}  // namespace build_tables
-}  // namespace tree_sitter
-
-#endif  // COMPILER_BUILD_TABLES_REMOVE_DUPLICATE_STATES_H_
--- a/src/compiler/lex_table.cc
+++ b/src/compiler/lex_table.cc
@ -57,11 +57,6 @@ bool LexState::operator==(const LexState &other) const {
         is_token_start == other.is_token_start;
 }

-void LexState::each_referenced_state(function<void(LexStateId *)> fn) {
-  for (auto &entry : advance_actions)
-    fn(&entry.second.state_index);
-}
-
 LexStateId LexTable::add_state() {
  states.push_back(LexState());
  return states.size() - 1;
@ -71,8 +66,4 @@ LexState &LexTable::state(LexStateId id) {
  return states[id];
 }

-bool LexTable::merge_state(size_t i, size_t j) {
-  return states[i] == states[j];
-}
-
 }  // namespace tree_sitter
--- a/src/compiler/lex_table.h
+++ b/src/compiler/lex_table.h
@ -54,7 +54,6 @@ class LexState {
  LexState();
  std::set<rules::CharacterSet> expected_inputs() const;
  bool operator==(const LexState &) const;
-  void each_referenced_state(std::function<void(LexStateId *)>);

  std::map<rules::CharacterSet, AdvanceAction> advance_actions;
  AcceptTokenAction accept_action;
@ -66,8 +65,6 @@ class LexTable {
  LexStateId add_state();
  LexState &state(LexStateId state_id);
  std::vector<LexState> states;
-
-  bool merge_state(size_t i, size_t j);
 };

 }  // namespace tree_sitter
--- a/src/compiler/parse_table.cc
+++ b/src/compiler/parse_table.cc
@ -201,58 +201,4 @@ void ParseTable::set_nonterminal_action(ParseStateId state_id,
  states[state_id].nonterminal_entries[lookahead] = next_state_id;
 }

-static bool has_entry(const ParseState &state, const ParseTableEntry &entry) {
-  for (const auto &pair : state.terminal_entries)
-    if (pair.second == entry)
-      return true;
-  return false;
-}
-
-bool ParseTable::merge_state(size_t i, size_t j) {
-  ParseState &state = states[i];
-  ParseState &other = states[j];
-
-  if (state.nonterminal_entries != other.nonterminal_entries)
-    return false;
-
-  for (auto &entry : state.terminal_entries) {
-    Symbol lookahead = entry.first;
-    const vector<ParseAction> &actions = entry.second.actions;
-
-    const auto &other_entry = other.terminal_entries.find(lookahead);
-    if (other_entry == other.terminal_entries.end()) {
-      if (mergeable_symbols.count(lookahead) == 0 && !lookahead.is_built_in())
-        return false;
-      if (actions.back().type != ParseActionTypeReduce)
-        return false;
-      if (!has_entry(other, entry.second))
-        return false;
-    } else if (entry.second != other_entry->second) {
-      return false;
-    }
-  }
-
-  set<Symbol> symbols_to_merge;
-
-  for (auto &entry : other.terminal_entries) {
-    Symbol lookahead = entry.first;
-    const vector<ParseAction> &actions = entry.second.actions;
-
-    if (!state.terminal_entries.count(lookahead)) {
-      if (mergeable_symbols.count(lookahead) == 0 && !lookahead.is_built_in())
-        return false;
-      if (actions.back().type != ParseActionTypeReduce)
-        return false;
-      if (!has_entry(state, entry.second))
-        return false;
-      symbols_to_merge.insert(lookahead);
-    }
-  }
-
-  for (const Symbol &lookahead : symbols_to_merge)
-    state.terminal_entries[lookahead] = other.terminal_entries.find(lookahead)->second;
-
-  return true;
-}
-
 }  // namespace tree_sitter
--- a/src/compiler/parse_table.h
+++ b/src/compiler/parse_table.h
@ -93,7 +93,6 @@ class ParseTable {
  ParseStateId add_state();
  ParseAction &add_terminal_action(ParseStateId state_id, rules::Symbol, ParseAction);
  void set_nonterminal_action(ParseStateId, rules::Symbol::Index, ParseStateId);
-  bool merge_state(size_t i, size_t j);

  std::vector<ParseState> states;
  std::map<rules::Symbol, ParseTableSymbolMetadata> symbols;