Avoid introducing certain lexical conflicts during parse state merging
The current, fairly conservative approach is to avoid merging parse states that would cause a pair of tokens to co-exist for the first time in any parse state, where the two tokens can start with the same character and at least one of the tokens can contain a character that is part of the grammar's separators.
This commit is contained in:
parent
3c8e6f9987
commit
686dc0997c
24 changed files with 305 additions and 158 deletions
|
|
@ -25,6 +25,7 @@
|
|||
'src/compiler/compile.cc',
|
||||
'src/compiler/generate_code/c_code.cc',
|
||||
'src/compiler/lex_table.cc',
|
||||
'src/compiler/lexical_grammar.cc',
|
||||
'src/compiler/parse_grammar.cc',
|
||||
'src/compiler/parse_table.cc',
|
||||
'src/compiler/precedence_range.cc',
|
||||
|
|
|
|||
|
|
@ -14,17 +14,18 @@ START_TEST
|
|||
describe("recovery_tokens(rule)", []() {
|
||||
it("includes rules that can only begin and end with an explicit set of characters", [&]() {
|
||||
LexicalGrammar grammar;
|
||||
|
||||
grammar.separators = {
|
||||
character({ ' ' }),
|
||||
};
|
||||
|
||||
grammar.variables = {
|
||||
Variable("var0", VariableTypeNamed, character({}, false)),
|
||||
Variable("var1", VariableTypeNamed, seq({
|
||||
LexicalVariable("var0", VariableTypeNamed, character({}, false), false),
|
||||
LexicalVariable("var1", VariableTypeNamed, seq({
|
||||
character({ 'a', 'b' }),
|
||||
character({}, false),
|
||||
character({ 'c', 'd' }),
|
||||
})),
|
||||
}), false),
|
||||
};
|
||||
|
||||
AssertThat(get_compatible_tokens(grammar).recovery_tokens, Equals<set<Symbol>>({ Symbol(1, Symbol::Terminal) }));
|
||||
|
|
|
|||
|
|
@ -13,11 +13,10 @@ START_TEST
|
|||
|
||||
describe("LexItem", []() {
|
||||
describe("completion_status()", [&]() {
|
||||
it("indicates whether the item is done, its precedence, and whether it is a string", [&]() {
|
||||
it("indicates whether the item is done and its precedence", [&]() {
|
||||
LexItem item1(Symbol(0, Symbol::Terminal), character({ 'a', 'b', 'c' }));
|
||||
AssertThat(item1.completion_status().is_done, IsFalse());
|
||||
AssertThat(item1.completion_status().precedence, Equals(PrecedenceRange()));
|
||||
AssertThat(item1.completion_status().is_string, IsFalse());
|
||||
|
||||
MetadataParams params;
|
||||
params.precedence = 3;
|
||||
|
|
@ -30,12 +29,10 @@ describe("LexItem", []() {
|
|||
|
||||
AssertThat(item2.completion_status().is_done, IsTrue());
|
||||
AssertThat(item2.completion_status().precedence, Equals(PrecedenceRange(3)));
|
||||
AssertThat(item2.completion_status().is_string, IsTrue());
|
||||
|
||||
LexItem item3(Symbol(0, Symbol::Terminal), repeat(character({ ' ', '\t' })));
|
||||
AssertThat(item3.completion_status().is_done, IsTrue());
|
||||
AssertThat(item3.completion_status().precedence, Equals(PrecedenceRange()));
|
||||
AssertThat(item3.completion_status().is_string, IsFalse());
|
||||
});
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -12,12 +12,13 @@ using namespace rules;
|
|||
START_TEST
|
||||
|
||||
describe("ParseItemSetBuilder", []() {
|
||||
vector<Variable> lexical_variables;
|
||||
vector<LexicalVariable> lexical_variables;
|
||||
for (size_t i = 0; i < 20; i++) {
|
||||
lexical_variables.push_back(Variable{
|
||||
lexical_variables.push_back({
|
||||
"token_" + to_string(i),
|
||||
VariableTypeNamed,
|
||||
blank(),
|
||||
false
|
||||
});
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -15,89 +15,149 @@ describe("expand_tokens", []() {
|
|||
|
||||
describe("string rules", [&]() {
|
||||
it("replaces strings with sequences of character sets", [&]() {
|
||||
LexicalGrammar grammar{{
|
||||
Variable("rule_A", VariableTypeNamed, seq({
|
||||
i_sym(10),
|
||||
str("xyz"),
|
||||
i_sym(11),
|
||||
})),
|
||||
}, {}};
|
||||
LexicalGrammar grammar {
|
||||
{
|
||||
LexicalVariable {
|
||||
"rule_A",
|
||||
VariableTypeNamed,
|
||||
seq({
|
||||
i_sym(10),
|
||||
str("xyz"),
|
||||
i_sym(11),
|
||||
}),
|
||||
false
|
||||
}
|
||||
},
|
||||
{}
|
||||
};
|
||||
|
||||
auto result = expand_tokens(grammar);
|
||||
|
||||
AssertThat(result.second, Equals(CompileError::none()));
|
||||
AssertThat(result.first.variables, Equals(vector<Variable>({
|
||||
Variable("rule_A", VariableTypeNamed, seq({
|
||||
i_sym(10),
|
||||
metadata(seq({
|
||||
character({ 'x' }),
|
||||
character({ 'y' }),
|
||||
character({ 'z' }),
|
||||
}), string_token_params),
|
||||
i_sym(11),
|
||||
})),
|
||||
})));
|
||||
AssertThat(result.first.variables, Equals(vector<LexicalVariable> {
|
||||
LexicalVariable {
|
||||
"rule_A",
|
||||
VariableTypeNamed,
|
||||
seq({
|
||||
i_sym(10),
|
||||
metadata(seq({
|
||||
character({ 'x' }),
|
||||
character({ 'y' }),
|
||||
character({ 'z' }),
|
||||
}), string_token_params),
|
||||
i_sym(11),
|
||||
}),
|
||||
false
|
||||
}
|
||||
}));
|
||||
});
|
||||
|
||||
it("handles strings containing non-ASCII UTF8 characters", [&]() {
|
||||
LexicalGrammar grammar{{
|
||||
Variable("rule_A", VariableTypeNamed, str("\u03B1 \u03B2")),
|
||||
}, {}};
|
||||
LexicalGrammar grammar {
|
||||
{
|
||||
LexicalVariable {
|
||||
"rule_A",
|
||||
VariableTypeNamed,
|
||||
str("\u03B1 \u03B2"),
|
||||
false
|
||||
},
|
||||
},
|
||||
{}
|
||||
};
|
||||
|
||||
auto result = expand_tokens(grammar);
|
||||
|
||||
AssertThat(result.first.variables, Equals(vector<Variable>({
|
||||
Variable("rule_A", VariableTypeNamed, metadata(seq({
|
||||
character({ 945 }),
|
||||
character({ ' ' }),
|
||||
character({ 946 }),
|
||||
}), string_token_params)),
|
||||
})));
|
||||
AssertThat(result.first.variables, Equals(vector<LexicalVariable> {
|
||||
LexicalVariable {
|
||||
"rule_A",
|
||||
VariableTypeNamed,
|
||||
metadata(seq({
|
||||
character({ 945 }),
|
||||
character({ ' ' }),
|
||||
character({ 946 }),
|
||||
}), string_token_params),
|
||||
false
|
||||
}
|
||||
}));
|
||||
});
|
||||
});
|
||||
|
||||
describe("regexp rules", [&]() {
|
||||
it("replaces regexps with the equivalent rule tree", [&]() {
|
||||
LexicalGrammar grammar{{
|
||||
Variable("rule_A", VariableTypeNamed, seq({
|
||||
i_sym(10),
|
||||
pattern("x*"),
|
||||
i_sym(11),
|
||||
})),
|
||||
}, {}};
|
||||
LexicalGrammar grammar {
|
||||
{
|
||||
LexicalVariable {
|
||||
"rule_A",
|
||||
VariableTypeNamed,
|
||||
seq({
|
||||
i_sym(10),
|
||||
pattern("x*"),
|
||||
i_sym(11),
|
||||
}),
|
||||
false
|
||||
}
|
||||
},
|
||||
{}
|
||||
};
|
||||
|
||||
auto result = expand_tokens(grammar);
|
||||
|
||||
AssertThat(result.second, Equals(CompileError::none()));
|
||||
AssertThat(result.first.variables, Equals(vector<Variable>({
|
||||
Variable("rule_A", VariableTypeNamed, seq({
|
||||
i_sym(10),
|
||||
repeat(character({ 'x' })),
|
||||
i_sym(11),
|
||||
})),
|
||||
})));
|
||||
AssertThat(result.first.variables, Equals(vector<LexicalVariable> {
|
||||
LexicalVariable {
|
||||
"rule_A",
|
||||
VariableTypeNamed,
|
||||
seq({
|
||||
i_sym(10),
|
||||
repeat(character({ 'x' })),
|
||||
i_sym(11),
|
||||
}),
|
||||
false
|
||||
}
|
||||
}));
|
||||
});
|
||||
|
||||
it("handles regexps containing non-ASCII UTF8 characters", [&]() {
|
||||
LexicalGrammar grammar{{
|
||||
Variable("rule_A", VariableTypeNamed, pattern("[^\u03B1-\u03B4]*")),
|
||||
}, {}};
|
||||
LexicalGrammar grammar {
|
||||
{
|
||||
LexicalVariable {
|
||||
"rule_A",
|
||||
VariableTypeNamed,
|
||||
pattern("[^\u03B1-\u03B4]*"),
|
||||
false
|
||||
}
|
||||
},
|
||||
{}
|
||||
};
|
||||
|
||||
auto result = expand_tokens(grammar);
|
||||
|
||||
AssertThat(result.first.variables, Equals(vector<Variable>({
|
||||
Variable("rule_A", VariableTypeNamed, repeat(character({ 945, 946, 947, 948 }, false))),
|
||||
})));
|
||||
AssertThat(result.first.variables, Equals(vector<LexicalVariable> {
|
||||
LexicalVariable {
|
||||
"rule_A",
|
||||
VariableTypeNamed,
|
||||
repeat(character({ 945, 946, 947, 948 }, false)),
|
||||
false
|
||||
}
|
||||
}));
|
||||
});
|
||||
|
||||
it("returns an error when the grammar contains an invalid regex", [&]() {
|
||||
LexicalGrammar grammar{{
|
||||
Variable("rule_A", VariableTypeNamed, seq({
|
||||
pattern("("),
|
||||
str("xyz"),
|
||||
pattern("["),
|
||||
}))
|
||||
}, {}};
|
||||
LexicalGrammar grammar {
|
||||
{
|
||||
LexicalVariable {
|
||||
"rule_A",
|
||||
VariableTypeNamed,
|
||||
seq({
|
||||
pattern("("),
|
||||
str("xyz"),
|
||||
pattern("["),
|
||||
}),
|
||||
false
|
||||
},
|
||||
},
|
||||
{}
|
||||
};
|
||||
|
||||
auto result = expand_tokens(grammar);
|
||||
|
||||
|
|
|
|||
|
|
@ -16,20 +16,25 @@ using prepare_grammar::InitialSyntaxGrammar;
|
|||
|
||||
describe("extract_tokens", []() {
|
||||
it("moves strings, patterns, and sub-rules marked as tokens into the lexical grammar", [&]() {
|
||||
auto result = extract_tokens(InternedGrammar{{
|
||||
Variable("rule_A", VariableTypeNamed, repeat1(seq({
|
||||
str("ab"),
|
||||
pattern("cd*"),
|
||||
choice({
|
||||
i_sym(1),
|
||||
i_sym(2),
|
||||
token(repeat1(choice({ str("ef"), str("gh") }))),
|
||||
}),
|
||||
}))),
|
||||
Variable("rule_B", VariableTypeNamed, pattern("ij+")),
|
||||
Variable("rule_C", VariableTypeNamed, choice({ str("kl"), blank() })),
|
||||
Variable("rule_D", VariableTypeNamed, repeat1(i_sym(3)))
|
||||
}, {}, {}, {}});
|
||||
auto result = extract_tokens(InternedGrammar {
|
||||
{
|
||||
Variable("rule_A", VariableTypeNamed, repeat1(seq({
|
||||
str("ab"),
|
||||
pattern("cd*"),
|
||||
choice({
|
||||
i_sym(1),
|
||||
i_sym(2),
|
||||
token(repeat1(choice({ str("ef"), str("gh") }))),
|
||||
}),
|
||||
}))),
|
||||
Variable("rule_B", VariableTypeNamed, pattern("ij+")),
|
||||
Variable("rule_C", VariableTypeNamed, choice({ str("kl"), blank() })),
|
||||
Variable("rule_D", VariableTypeNamed, repeat1(i_sym(3)))
|
||||
},
|
||||
{},
|
||||
{},
|
||||
{}
|
||||
});
|
||||
|
||||
InitialSyntaxGrammar &syntax_grammar = get<0>(result);
|
||||
LexicalGrammar &lexical_grammar = get<1>(result);
|
||||
|
|
@ -64,46 +69,51 @@ describe("extract_tokens", []() {
|
|||
Variable("rule_D", VariableTypeNamed, repeat1(i_sym(2))),
|
||||
})));
|
||||
|
||||
AssertThat(lexical_grammar.variables, Equals(vector<Variable>({
|
||||
AssertThat(lexical_grammar.variables, Equals(vector<LexicalVariable>({
|
||||
// Strings become anonymous rules.
|
||||
Variable("ab", VariableTypeAnonymous, str("ab")),
|
||||
LexicalVariable("ab", VariableTypeAnonymous, str("ab"), true),
|
||||
|
||||
// Patterns become hidden rules.
|
||||
Variable("/cd*/", VariableTypeAuxiliary, pattern("cd*")),
|
||||
LexicalVariable("/cd*/", VariableTypeAuxiliary, pattern("cd*"), false),
|
||||
|
||||
// Rules marked as tokens become hidden rules.
|
||||
Variable("/(ef|gh)*/", VariableTypeAuxiliary, repeat1(choice({
|
||||
LexicalVariable("/(ef|gh)*/", VariableTypeAuxiliary, repeat1(choice({
|
||||
str("ef"),
|
||||
str("gh")
|
||||
}))),
|
||||
})), false),
|
||||
|
||||
// This named rule was moved wholesale to the lexical grammar.
|
||||
Variable("rule_B", VariableTypeNamed, pattern("ij+")),
|
||||
LexicalVariable("rule_B", VariableTypeNamed, pattern("ij+"), false),
|
||||
|
||||
// Strings become anonymous rules.
|
||||
Variable("kl", VariableTypeAnonymous, str("kl")),
|
||||
LexicalVariable("kl", VariableTypeAnonymous, str("kl"), true),
|
||||
})));
|
||||
});
|
||||
|
||||
it("does not create duplicate tokens in the lexical grammar", [&]() {
|
||||
auto result = extract_tokens(InternedGrammar{{
|
||||
Variable("rule_A", VariableTypeNamed, seq({
|
||||
str("ab"),
|
||||
i_sym(0),
|
||||
str("ab"),
|
||||
})),
|
||||
}, {}, {}, {}});
|
||||
auto result = extract_tokens(InternedGrammar {
|
||||
{
|
||||
Variable("rule_A", VariableTypeNamed, seq({
|
||||
str("ab"),
|
||||
i_sym(0),
|
||||
str("ab"),
|
||||
})),
|
||||
},
|
||||
{},
|
||||
{},
|
||||
{}
|
||||
});
|
||||
|
||||
InitialSyntaxGrammar &syntax_grammar = get<0>(result);
|
||||
LexicalGrammar &lexical_grammar = get<1>(result);
|
||||
|
||||
AssertThat(syntax_grammar.variables, Equals(vector<Variable>({
|
||||
Variable("rule_A", VariableTypeNamed, seq({ i_token(0), i_sym(0), i_token(0) })),
|
||||
})));
|
||||
AssertThat(syntax_grammar.variables, Equals(vector<Variable> {
|
||||
Variable {"rule_A", VariableTypeNamed, seq({ i_token(0), i_sym(0), i_token(0) })},
|
||||
}));
|
||||
|
||||
AssertThat(lexical_grammar.variables, Equals(vector<Variable>({
|
||||
Variable("ab", VariableTypeAnonymous, str("ab")),
|
||||
})))
|
||||
AssertThat(lexical_grammar.variables, Equals(vector<LexicalVariable> {
|
||||
LexicalVariable {"ab", VariableTypeAnonymous, str("ab"), true},
|
||||
}))
|
||||
});
|
||||
|
||||
it("does not move entire rules into the lexical grammar if their content is used elsewhere in the grammar", [&]() {
|
||||
|
|
@ -122,11 +132,11 @@ describe("extract_tokens", []() {
|
|||
Variable("rule_C", VariableTypeNamed, seq({ i_token(2), i_token(1) })),
|
||||
})));
|
||||
|
||||
AssertThat(lexical_grammar.variables, Equals(vector<Variable>({
|
||||
Variable("ab", VariableTypeAnonymous, str("ab")),
|
||||
Variable("cd", VariableTypeAnonymous, str("cd")),
|
||||
Variable("ef", VariableTypeAnonymous, str("ef")),
|
||||
})));
|
||||
AssertThat(lexical_grammar.variables, Equals(vector<LexicalVariable> {
|
||||
LexicalVariable {"ab", VariableTypeAnonymous, str("ab"), true},
|
||||
LexicalVariable {"cd", VariableTypeAnonymous, str("cd"), true},
|
||||
LexicalVariable {"ef", VariableTypeAnonymous, str("ef"), true},
|
||||
}));
|
||||
});
|
||||
|
||||
it("renumbers the grammar's expected conflict symbols based on any moved rules", [&]() {
|
||||
|
|
|
|||
|
|
@ -1,6 +1,8 @@
|
|||
#include "rule_helpers.h"
|
||||
#include <memory>
|
||||
#include "compiler/rules/symbol.h"
|
||||
#include "compiler/variable.h"
|
||||
#include "compiler/lexical_grammar.h"
|
||||
|
||||
namespace tree_sitter {
|
||||
using std::make_shared;
|
||||
|
|
@ -52,4 +54,9 @@ namespace tree_sitter {
|
|||
return left.name == right.name && left.rule->operator==(*right.rule) &&
|
||||
left.type == right.type;
|
||||
}
|
||||
|
||||
bool operator==(const LexicalVariable &left, const LexicalVariable &right) {
|
||||
return left.name == right.name && left.rule->operator==(*right.rule) &&
|
||||
left.type == right.type && left.is_string == right.is_string;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -15,7 +15,11 @@ namespace tree_sitter {
|
|||
rule_ptr i_token(size_t index);
|
||||
rule_ptr active_prec(int precedence, rule_ptr);
|
||||
|
||||
struct Variable;
|
||||
struct LexicalVariable;
|
||||
|
||||
bool operator==(const Variable &left, const Variable &right);
|
||||
bool operator==(const LexicalVariable &left, const LexicalVariable &right);
|
||||
}
|
||||
|
||||
#endif // HELPERS_RULE_HELPERS_H_
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@
|
|||
#include "tree_sitter/compiler.h"
|
||||
#include "compiler/parse_table.h"
|
||||
#include "compiler/syntax_grammar.h"
|
||||
#include "compiler/lexical_grammar.h"
|
||||
#include "compiler/build_tables/parse_item.h"
|
||||
#include "compiler/build_tables/lex_item.h"
|
||||
|
||||
|
|
@ -41,6 +42,11 @@ ostream &operator<<(ostream &stream, const SyntaxVariable &variable) {
|
|||
return stream << string("{") << variable.name << string(", ") << variable.productions << string(", ") << to_string(variable.type) << string("}");
|
||||
}
|
||||
|
||||
ostream &operator<<(ostream &stream, const LexicalVariable &variable) {
|
||||
return stream << "{" << variable.name << ", " << variable.rule << ", " <<
|
||||
to_string(variable.type) << ", " << to_string(variable.is_string) << "}";
|
||||
}
|
||||
|
||||
std::ostream &operator<<(std::ostream &stream, const AdvanceAction &action) {
|
||||
return stream << string("#<advance ") + to_string(action.state_index) + ">";
|
||||
}
|
||||
|
|
|
|||
|
|
@ -93,6 +93,7 @@ using std::string;
|
|||
using std::to_string;
|
||||
struct Variable;
|
||||
struct SyntaxVariable;
|
||||
struct LexicalVariable;
|
||||
struct AdvanceAction;
|
||||
struct AcceptTokenAction;
|
||||
class ParseAction;
|
||||
|
|
@ -107,6 +108,7 @@ ostream &operator<<(ostream &, const Rule &);
|
|||
ostream &operator<<(ostream &, const rule_ptr &);
|
||||
ostream &operator<<(ostream &, const Variable &);
|
||||
ostream &operator<<(ostream &, const SyntaxVariable &);
|
||||
ostream &operator<<(ostream &, const LexicalVariable &);
|
||||
ostream &operator<<(ostream &, const AdvanceAction &);
|
||||
ostream &operator<<(ostream &, const AcceptTokenAction &);
|
||||
ostream &operator<<(ostream &, const ParseAction &);
|
||||
|
|
|
|||
|
|
@ -99,7 +99,8 @@ class LexTableBuilder {
|
|||
LexItem::CompletionStatus completion_status = item.completion_status();
|
||||
if (completion_status.is_done) {
|
||||
AcceptTokenAction action(item.lhs, completion_status.precedence.max,
|
||||
completion_status.is_string);
|
||||
item.lhs.is_built_in() ||
|
||||
lex_grammar.variables[item.lhs.index].is_string);
|
||||
|
||||
auto current_action = lex_table.state(state_id).accept_action;
|
||||
if (conflict_manager.resolve(action, current_action))
|
||||
|
|
|
|||
|
|
@ -72,10 +72,11 @@ class ParseTableBuilder {
|
|||
}));
|
||||
|
||||
CompileError error = process_part_state_queue();
|
||||
if (error.type != TSCompileErrorTypeNone)
|
||||
if (error.type != TSCompileErrorTypeNone) {
|
||||
return { parse_table, error };
|
||||
}
|
||||
|
||||
parse_table.mergeable_symbols = compatible_tokens.recovery_tokens;
|
||||
update_unmergable_token_pairs();
|
||||
|
||||
build_error_parse_state();
|
||||
|
||||
|
|
@ -111,7 +112,7 @@ class ParseTableBuilder {
|
|||
void build_error_parse_state() {
|
||||
ParseState error_state;
|
||||
|
||||
for (const Symbol symbol : parse_table.mergeable_symbols) {
|
||||
for (const Symbol symbol : compatible_tokens.recovery_tokens) {
|
||||
add_out_of_context_parse_state(&error_state, symbol);
|
||||
}
|
||||
|
||||
|
|
@ -292,6 +293,25 @@ class ParseTableBuilder {
|
|||
}
|
||||
}
|
||||
|
||||
void update_unmergable_token_pairs() {
|
||||
for (const ParseState &state : parse_table.states) {
|
||||
for (Symbol::Index token_index = 0, token_count = lexical_grammar.variables.size(); token_index < token_count; token_index++) {
|
||||
Symbol token(token_index, Symbol::Terminal);
|
||||
if (state.terminal_entries.count(token)) {
|
||||
auto &incompatible_token_indices = compatible_tokens.unmergeable_pairs[token_index];
|
||||
auto iter = incompatible_token_indices.begin();
|
||||
while (iter != incompatible_token_indices.end()) {
|
||||
if (state.terminal_entries.count(Symbol(*iter, Symbol::NonTerminal))) {
|
||||
iter = incompatible_token_indices.erase(iter);
|
||||
} else {
|
||||
++iter;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void remove_duplicate_parse_states() {
|
||||
map<size_t, set<ParseStateId>> state_indices_by_signature;
|
||||
|
||||
|
|
@ -382,11 +402,19 @@ class ParseTableBuilder {
|
|||
for (auto &entry : state.terminal_entries) {
|
||||
Symbol lookahead = entry.first;
|
||||
const vector<ParseAction> &actions = entry.second.actions;
|
||||
auto &incompatible_token_indices = compatible_tokens.unmergeable_pairs[lookahead.index];
|
||||
|
||||
const auto &other_entry = other.terminal_entries.find(lookahead);
|
||||
if (other_entry == other.terminal_entries.end()) {
|
||||
if (compatible_tokens.recovery_tokens.count(lookahead) == 0 && !lookahead.is_built_in())
|
||||
return false;
|
||||
if (!lookahead.is_built_in()) {
|
||||
if (!compatible_tokens.recovery_tokens.count(lookahead))
|
||||
return false;
|
||||
for (Symbol::Index incompatible_index : incompatible_token_indices) {
|
||||
if (other.terminal_entries.count(Symbol(incompatible_index, Symbol::Terminal))) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (actions.back().type != ParseActionTypeReduce)
|
||||
return false;
|
||||
if (!has_entry(other, entry.second))
|
||||
|
|
@ -401,10 +429,18 @@ class ParseTableBuilder {
|
|||
for (auto &entry : other.terminal_entries) {
|
||||
Symbol lookahead = entry.first;
|
||||
const vector<ParseAction> &actions = entry.second.actions;
|
||||
auto &incompatible_token_indices = compatible_tokens.unmergeable_pairs[lookahead.index];
|
||||
|
||||
if (!state.terminal_entries.count(lookahead)) {
|
||||
if (compatible_tokens.recovery_tokens.count(lookahead) == 0 && !lookahead.is_built_in())
|
||||
return false;
|
||||
if (!lookahead.is_built_in()) {
|
||||
if (!compatible_tokens.recovery_tokens.count(lookahead))
|
||||
return false;
|
||||
for (Symbol::Index incompatible_index : incompatible_token_indices) {
|
||||
if (state.terminal_entries.count(Symbol(incompatible_index, Symbol::Terminal))) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (actions.back().type != ParseActionTypeReduce)
|
||||
return false;
|
||||
if (!has_entry(state, entry.second))
|
||||
|
|
@ -629,7 +665,7 @@ class ParseTableBuilder {
|
|||
|
||||
switch (symbol.type) {
|
||||
case Symbol::Terminal: {
|
||||
const Variable &variable = lexical_grammar.variables[symbol.index];
|
||||
const LexicalVariable &variable = lexical_grammar.variables[symbol.index];
|
||||
if (variable.type == VariableTypeNamed)
|
||||
return variable.name;
|
||||
else
|
||||
|
|
|
|||
|
|
@ -83,6 +83,7 @@ using FirstCharactersIntersector = CharacterIntersector<true, false>;
|
|||
|
||||
CompatibleTokensResult get_compatible_tokens(const LexicalGrammar &grammar) {
|
||||
CompatibleTokensResult result;
|
||||
result.unmergeable_pairs.resize(grammar.variables.size());
|
||||
|
||||
AllCharacters all_separator_characters;
|
||||
for (const rule_ptr &separator : grammar.separators)
|
||||
|
|
@ -90,7 +91,8 @@ CompatibleTokensResult get_compatible_tokens(const LexicalGrammar &grammar) {
|
|||
|
||||
for (size_t i = 0; i < grammar.variables.size(); i++) {
|
||||
Symbol symbol(i, Symbol::Terminal);
|
||||
rule_ptr rule = grammar.variables[i].rule;
|
||||
const LexicalVariable &variable = grammar.variables[i];
|
||||
rule_ptr rule = variable.rule;
|
||||
|
||||
FirstCharacters first_characters;
|
||||
first_characters.apply(rule);
|
||||
|
|
@ -109,18 +111,20 @@ CompatibleTokensResult get_compatible_tokens(const LexicalGrammar &grammar) {
|
|||
!last_characters.result.includes_all &&
|
||||
!last_characters.result.intersects(all_separator_characters.result);
|
||||
|
||||
bool has_no_separators =
|
||||
!all_characters.result.intersects(all_separator_characters.result);
|
||||
bool has_separators =
|
||||
all_characters.result.intersects(all_separator_characters.result);
|
||||
|
||||
if ((has_distinct_start && has_distinct_end) || has_no_separators)
|
||||
if ((has_distinct_start && has_distinct_end) || !has_separators)
|
||||
result.recovery_tokens.insert(symbol);
|
||||
|
||||
for (size_t j = 0; j < grammar.variables.size(); j++) {
|
||||
if (j == i) continue;
|
||||
Symbol other_symbol(j, Symbol::Terminal);
|
||||
FirstCharactersIntersector intersector(&first_characters.result);
|
||||
if (intersector.apply(grammar.variables[j].rule)) {
|
||||
result.unmergeable_pairs[symbol].insert(other_symbol);
|
||||
for (size_t j = 0; j < i; j++) {
|
||||
const LexicalVariable &other_variable = grammar.variables[j];
|
||||
if (has_separators) {
|
||||
FirstCharactersIntersector intersector(&first_characters.result);
|
||||
if (intersector.apply(other_variable.rule)) {
|
||||
result.unmergeable_pairs[i].insert(j);
|
||||
result.unmergeable_pairs[j].insert(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -3,8 +3,9 @@
|
|||
|
||||
#include "compiler/rule.h"
|
||||
#include "compiler/rules/symbol.h"
|
||||
#include <map>
|
||||
#include <vector>
|
||||
#include <set>
|
||||
#include <unordered_set>
|
||||
|
||||
namespace tree_sitter {
|
||||
|
||||
|
|
@ -14,7 +15,7 @@ namespace build_tables {
|
|||
|
||||
struct CompatibleTokensResult {
|
||||
std::set<rules::Symbol> recovery_tokens;
|
||||
std::map<rules::Symbol, std::set<rules::Symbol>> unmergeable_pairs;
|
||||
std::vector<std::unordered_set<rules::Symbol::Index>> unmergeable_pairs;
|
||||
};
|
||||
|
||||
CompatibleTokensResult get_compatible_tokens(const LexicalGrammar &);
|
||||
|
|
|
|||
|
|
@ -32,19 +32,15 @@ LexItem::CompletionStatus LexItem::completion_status() const {
|
|||
CompletionStatus apply_to(const rules::Choice *rule) {
|
||||
for (const auto &element : rule->elements) {
|
||||
CompletionStatus status = apply(element);
|
||||
if (status.is_done)
|
||||
return status;
|
||||
if (status.is_done) return status;
|
||||
}
|
||||
return { false, PrecedenceRange(), false };
|
||||
return { false, PrecedenceRange() };
|
||||
}
|
||||
|
||||
CompletionStatus apply_to(const rules::Metadata *rule) {
|
||||
CompletionStatus result = apply(rule->rule);
|
||||
if (result.is_done) {
|
||||
if (result.precedence.empty && rule->params.has_precedence)
|
||||
result.precedence.add(rule->params.precedence);
|
||||
if (rule->params.is_string)
|
||||
result.is_string = true;
|
||||
if (result.is_done && result.precedence.empty && rule->params.has_precedence) {
|
||||
result.precedence.add(rule->params.precedence);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
|
@ -54,15 +50,16 @@ LexItem::CompletionStatus LexItem::completion_status() const {
|
|||
}
|
||||
|
||||
CompletionStatus apply_to(const rules::Blank *rule) {
|
||||
return { true, PrecedenceRange(), false };
|
||||
return { true, PrecedenceRange() };
|
||||
}
|
||||
|
||||
CompletionStatus apply_to(const rules::Seq *rule) {
|
||||
CompletionStatus left_status = apply(rule->left);
|
||||
if (left_status.is_done)
|
||||
if (left_status.is_done) {
|
||||
return apply(rule->right);
|
||||
else
|
||||
return { false, PrecedenceRange(), false };
|
||||
} else {
|
||||
return { false, PrecedenceRange() };
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
|
@ -80,8 +77,9 @@ bool LexItemSet::operator==(const LexItemSet &other) const {
|
|||
|
||||
LexItemSet::TransitionMap LexItemSet::transitions() const {
|
||||
TransitionMap result;
|
||||
for (const LexItem &item : entries)
|
||||
for (const LexItem &item : entries) {
|
||||
lex_item_transitions(&result, item);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -19,7 +19,6 @@ class LexItem {
|
|||
struct CompletionStatus {
|
||||
bool is_done;
|
||||
PrecedenceRange precedence;
|
||||
bool is_string;
|
||||
};
|
||||
|
||||
bool operator==(const LexItem &other) const;
|
||||
|
|
|
|||
|
|
@ -561,7 +561,7 @@ class CCodeGenerator {
|
|||
return { variable.name, variable.type };
|
||||
}
|
||||
case Symbol::Terminal: {
|
||||
const Variable &variable = lexical_grammar.variables[symbol.index];
|
||||
const LexicalVariable &variable = lexical_grammar.variables[symbol.index];
|
||||
return { variable.name, variable.type };
|
||||
}
|
||||
case Symbol::External:
|
||||
|
|
|
|||
11
src/compiler/lexical_grammar.cc
Normal file
11
src/compiler/lexical_grammar.cc
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
#include "compiler/lexical_grammar.h"
|
||||
|
||||
namespace tree_sitter {
|
||||
|
||||
using std::string;
|
||||
|
||||
LexicalVariable::LexicalVariable(
|
||||
const string &name, VariableType type, const rule_ptr &rule, bool is_string)
|
||||
: name(name), rule(rule), type(type), is_string(is_string) {}
|
||||
|
||||
} // namespace tree_sitter
|
||||
|
|
@ -9,8 +9,17 @@
|
|||
|
||||
namespace tree_sitter {
|
||||
|
||||
struct LexicalVariable {
|
||||
LexicalVariable(const std::string &, VariableType, const rule_ptr &, bool);
|
||||
|
||||
std::string name;
|
||||
rule_ptr rule;
|
||||
VariableType type;
|
||||
bool is_string;
|
||||
};
|
||||
|
||||
struct LexicalGrammar {
|
||||
std::vector<Variable> variables;
|
||||
std::vector<LexicalVariable> variables;
|
||||
std::vector<rule_ptr> separators;
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -96,8 +96,6 @@ class ParseTable {
|
|||
|
||||
std::vector<ParseState> states;
|
||||
std::map<rules::Symbol, ParseTableSymbolMetadata> symbols;
|
||||
|
||||
std::set<rules::Symbol> mergeable_symbols;
|
||||
};
|
||||
|
||||
} // namespace tree_sitter
|
||||
|
|
|
|||
|
|
@ -67,11 +67,11 @@ pair<LexicalGrammar, CompileError> expand_tokens(const LexicalGrammar &grammar)
|
|||
LexicalGrammar result;
|
||||
ExpandTokens expander;
|
||||
|
||||
for (const Variable &variable : grammar.variables) {
|
||||
for (const LexicalVariable &variable : grammar.variables) {
|
||||
auto rule = expander.apply(variable.rule);
|
||||
if (expander.error.type)
|
||||
return { result, expander.error };
|
||||
result.variables.push_back(Variable(variable.name, variable.type, rule));
|
||||
result.variables.push_back({variable.name, variable.type, rule, variable.is_string});
|
||||
}
|
||||
|
||||
for (auto &sep : grammar.separators) {
|
||||
|
|
|
|||
|
|
@ -56,7 +56,7 @@ class SymbolReplacer : public rules::IdentityRuleFn {
|
|||
class TokenExtractor : public rules::IdentityRuleFn {
|
||||
using rules::IdentityRuleFn::apply_to;
|
||||
|
||||
rule_ptr apply_to_token(const Rule *input, VariableType entry_type) {
|
||||
rule_ptr apply_to_token(const Rule *input, VariableType entry_type, bool is_string) {
|
||||
for (size_t i = 0; i < tokens.size(); i++)
|
||||
if (tokens[i].rule->operator==(*input)) {
|
||||
token_usage_counts[i]++;
|
||||
|
|
@ -65,29 +65,30 @@ class TokenExtractor : public rules::IdentityRuleFn {
|
|||
|
||||
rule_ptr rule = input->copy();
|
||||
size_t index = tokens.size();
|
||||
tokens.push_back(Variable(token_description(rule), entry_type, rule));
|
||||
tokens.push_back({token_description(rule), entry_type, rule, is_string});
|
||||
token_usage_counts.push_back(1);
|
||||
return make_shared<Symbol>(index, Symbol::Terminal);
|
||||
}
|
||||
|
||||
rule_ptr apply_to(const rules::String *rule) {
|
||||
return apply_to_token(rule, VariableTypeAnonymous);
|
||||
return apply_to_token(rule, VariableTypeAnonymous, true);
|
||||
}
|
||||
|
||||
rule_ptr apply_to(const rules::Pattern *rule) {
|
||||
return apply_to_token(rule, VariableTypeAuxiliary);
|
||||
return apply_to_token(rule, VariableTypeAuxiliary, false);
|
||||
}
|
||||
|
||||
rule_ptr apply_to(const rules::Metadata *rule) {
|
||||
if (rule->params.is_token)
|
||||
return apply_to_token(rule->rule.get(), VariableTypeAuxiliary);
|
||||
else
|
||||
if (rule->params.is_token) {
|
||||
return apply_to_token(rule->rule.get(), VariableTypeAuxiliary, false);
|
||||
} else {
|
||||
return rules::IdentityRuleFn::apply_to(rule);
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
vector<size_t> token_usage_counts;
|
||||
vector<Variable> tokens;
|
||||
vector<LexicalVariable> tokens;
|
||||
};
|
||||
|
||||
static CompileError extra_token_error(const string &message) {
|
||||
|
|
@ -139,8 +140,9 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
|
|||
|
||||
for (const ConflictSet &conflict_set : grammar.expected_conflicts) {
|
||||
ConflictSet new_conflict_set;
|
||||
for (const Symbol &symbol : conflict_set)
|
||||
for (const Symbol &symbol : conflict_set) {
|
||||
new_conflict_set.insert(symbol_replacer.replace_symbol(symbol));
|
||||
}
|
||||
syntax_grammar.expected_conflicts.insert(new_conflict_set);
|
||||
}
|
||||
|
||||
|
|
@ -154,7 +156,7 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
|
|||
for (const rule_ptr &rule : grammar.extra_tokens) {
|
||||
int i = 0;
|
||||
bool used_elsewhere_in_grammar = false;
|
||||
for (const Variable &variable : lexical_grammar.variables) {
|
||||
for (const LexicalVariable &variable : lexical_grammar.variables) {
|
||||
if (variable.rule->operator==(*rule)) {
|
||||
syntax_grammar.extra_tokens.insert(Symbol(i, Symbol::Terminal));
|
||||
used_elsewhere_in_grammar = true;
|
||||
|
|
@ -171,9 +173,10 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
|
|||
}
|
||||
|
||||
auto symbol = rule->as<Symbol>();
|
||||
if (!symbol)
|
||||
if (!symbol) {
|
||||
return make_tuple(syntax_grammar, lexical_grammar,
|
||||
extra_token_error(rule->to_string()));
|
||||
}
|
||||
|
||||
Symbol new_symbol = symbol_replacer.replace_symbol(*symbol);
|
||||
if (new_symbol.is_non_terminal()) {
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ namespace prepare_grammar {
|
|||
LexicalGrammar normalize_rules(const LexicalGrammar &input_grammar) {
|
||||
LexicalGrammar result(input_grammar);
|
||||
|
||||
for (Variable &variable : result.variables) {
|
||||
for (LexicalVariable &variable : result.variables) {
|
||||
variable.rule = rules::Choice::build(extract_choices(variable.rule));
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -8,10 +8,8 @@
|
|||
namespace tree_sitter {
|
||||
|
||||
using std::string;
|
||||
using std::to_string;
|
||||
using std::pair;
|
||||
using std::vector;
|
||||
using std::set;
|
||||
|
||||
SyntaxVariable::SyntaxVariable(const string &name, VariableType type,
|
||||
const vector<Production> &productions)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue