Merge pull request #63 from tree-sitter/avoid-lexing-issues-when-merging-states

Avoid introducing new lexical conflicts when merging parse states
This commit is contained in:
Max Brunsfeld 2017-03-09 12:19:46 -08:00 committed by GitHub
commit 352e678c12
100 changed files with 2517 additions and 2168 deletions

View file

@ -11,13 +11,12 @@
'externals/json-parser',
],
'sources': [
'src/compiler/build_tables/build_lex_table.cc',
'src/compiler/build_tables/build_parse_table.cc',
'src/compiler/build_tables/build_tables.cc',
'src/compiler/build_tables/recovery_tokens.cc',
'src/compiler/build_tables/lex_item.cc',
'src/compiler/build_tables/lex_item_transitions.cc',
'src/compiler/build_tables/lex_conflict_manager.cc',
'src/compiler/build_tables/lex_table_builder.cc',
'src/compiler/build_tables/lookahead_set.cc',
'src/compiler/build_tables/parse_item.cc',
'src/compiler/build_tables/parse_item_set_builder.cc',
@ -41,7 +40,6 @@
'src/compiler/prepare_grammar/token_description.cc',
'src/compiler/rule.cc',
'src/compiler/syntax_grammar.cc',
'src/compiler/variable.cc',
'src/compiler/rules/blank.cc',
'src/compiler/rules/built_in_symbols.cc',
'src/compiler/rules/character_range.cc',

View file

@ -1,34 +0,0 @@
#include "spec_helper.h"
#include "compiler/rules/character_set.h"
#include "compiler/build_tables/recovery_tokens.h"
#include "compiler/lexical_grammar.h"
#include "helpers/rule_helpers.h"
#include "helpers/stream_methods.h"
#include "compiler/rules.h"
using namespace rules;
using namespace build_tables;
START_TEST
describe("recovery_tokens(rule)", []() {
it("includes rules that can only begin and end with an explicit set of characters", [&]() {
LexicalGrammar grammar;
grammar.separators = {
character({ ' ' }),
};
grammar.variables = {
Variable("var0", VariableTypeNamed, character({}, false)),
Variable("var1", VariableTypeNamed, seq({
character({ 'a', 'b' }),
character({}, false),
character({ 'c', 'd' }),
})),
};
AssertThat(recovery_tokens(grammar), Equals<set<Symbol>>({ Symbol(1, Symbol::Terminal) }));
});
});
END_TEST

View file

@ -20,6 +20,10 @@ describe("LexConflictManager::resolve(new_action, old_action)", []() {
Symbol sym4(3, Symbol::Terminal);
LexItemSet item_set({ LexItem(sym4, blank() )});
before_each([&]() {
conflict_manager = LexConflictManager();
});
it("favors advance actions over empty accept token actions", [&]() {
update = conflict_manager.resolve(item_set, AdvanceAction(2, {0, 0}, true), AcceptTokenAction());
AssertThat(update, IsTrue());
@ -65,6 +69,7 @@ describe("LexConflictManager::resolve(new_action, old_action)", []() {
describe("advance/accept-token conflicts", [&]() {
describe("when the token to accept has higher precedence", [&]() {
it("prefers the accept-token action", [&]() {
AssertThat(conflict_manager.possible_extensions, IsEmpty());
update = conflict_manager.resolve(item_set, AdvanceAction(1, { 1, 2 }, true), AcceptTokenAction(sym3, 3, true));
AssertThat(update, IsFalse());
AssertThat(conflict_manager.possible_extensions, IsEmpty());
@ -72,13 +77,9 @@ describe("LexConflictManager::resolve(new_action, old_action)", []() {
});
describe("when the token to accept does not have a higher precedence", [&]() {
it("favors the advance action", [&]() {
it("favors the advance action and adds the in-progress tokens as possible extensions of the discarded token", [&]() {
update = conflict_manager.resolve(item_set, AdvanceAction(1, { 1, 2 }, true), AcceptTokenAction(sym3, 2, true));
AssertThat(update, IsTrue());
});
it("adds the in-progress tokens as possible extensions of the discarded token", [&]() {
conflict_manager.resolve(item_set, AdvanceAction(1, { 1, 2 }, true), AcceptTokenAction(sym3, 3, true));
AssertThat(conflict_manager.possible_extensions[sym3.index], Contains(sym4.index));
});
});

View file

@ -13,11 +13,10 @@ START_TEST
describe("LexItem", []() {
describe("completion_status()", [&]() {
it("indicates whether the item is done, its precedence, and whether it is a string", [&]() {
it("indicates whether the item is done and its precedence", [&]() {
LexItem item1(Symbol(0, Symbol::Terminal), character({ 'a', 'b', 'c' }));
AssertThat(item1.completion_status().is_done, IsFalse());
AssertThat(item1.completion_status().precedence, Equals(PrecedenceRange()));
AssertThat(item1.completion_status().is_string, IsFalse());
MetadataParams params;
params.precedence = 3;
@ -30,12 +29,10 @@ describe("LexItem", []() {
AssertThat(item2.completion_status().is_done, IsTrue());
AssertThat(item2.completion_status().precedence, Equals(PrecedenceRange(3)));
AssertThat(item2.completion_status().is_string, IsTrue());
LexItem item3(Symbol(0, Symbol::Terminal), repeat(character({ ' ', '\t' })));
AssertThat(item3.completion_status().is_done, IsTrue());
AssertThat(item3.completion_status().precedence, Equals(PrecedenceRange()));
AssertThat(item3.completion_status().is_string, IsFalse());
});
});
});

View file

@ -12,12 +12,13 @@ using namespace rules;
START_TEST
describe("ParseItemSetBuilder", []() {
vector<Variable> lexical_variables;
vector<LexicalVariable> lexical_variables;
for (size_t i = 0; i < 20; i++) {
lexical_variables.push_back(Variable{
lexical_variables.push_back({
"token_" + to_string(i),
VariableTypeNamed,
blank(),
false
});
}
@ -25,13 +26,13 @@ describe("ParseItemSetBuilder", []() {
it("adds items at the beginnings of referenced rules", [&]() {
SyntaxGrammar grammar{{
SyntaxVariable("rule0", VariableTypeNamed, {
SyntaxVariable{"rule0", VariableTypeNamed, {
Production({
{Symbol(1, Symbol::NonTerminal), 0, AssociativityNone},
{Symbol(11, Symbol::Terminal), 0, AssociativityNone},
}),
}),
SyntaxVariable("rule1", VariableTypeNamed, {
}},
SyntaxVariable{"rule1", VariableTypeNamed, {
Production({
{Symbol(12, Symbol::Terminal), 0, AssociativityNone},
{Symbol(13, Symbol::Terminal), 0, AssociativityNone},
@ -39,13 +40,13 @@ describe("ParseItemSetBuilder", []() {
Production({
{Symbol(2, Symbol::NonTerminal), 0, AssociativityNone},
})
}),
SyntaxVariable("rule2", VariableTypeNamed, {
}},
SyntaxVariable{"rule2", VariableTypeNamed, {
Production({
{Symbol(14, Symbol::Terminal), 0, AssociativityNone},
{Symbol(15, Symbol::Terminal), 0, AssociativityNone},
})
}),
}},
}, {}, {}, {}};
auto production = [&](int variable_index, int production_index) -> const Production & {
@ -84,19 +85,19 @@ describe("ParseItemSetBuilder", []() {
it("handles rules with empty productions", [&]() {
SyntaxGrammar grammar{{
SyntaxVariable("rule0", VariableTypeNamed, {
SyntaxVariable{"rule0", VariableTypeNamed, {
Production({
{Symbol(1, Symbol::NonTerminal), 0, AssociativityNone},
{Symbol(11, Symbol::Terminal), 0, AssociativityNone},
}),
}),
SyntaxVariable("rule1", VariableTypeNamed, {
}},
SyntaxVariable{"rule1", VariableTypeNamed, {
Production({
{Symbol(12, Symbol::Terminal), 0, AssociativityNone},
{Symbol(13, Symbol::Terminal), 0, AssociativityNone},
}),
Production({})
}),
}},
}, {}, {}, {}};
auto production = [&](int variable_index, int production_index) -> const Production & {

View file

@ -2,6 +2,7 @@
#include "compiler/prepare_grammar/initial_syntax_grammar.h"
#include "compiler/prepare_grammar/expand_repeats.h"
#include "helpers/rule_helpers.h"
#include "helpers/stream_methods.h"
START_TEST
@ -11,141 +12,159 @@ using prepare_grammar::expand_repeats;
describe("expand_repeats", []() {
it("replaces repeat rules with pairs of recursive rules", [&]() {
InitialSyntaxGrammar grammar{{
Variable("rule0", VariableTypeNamed, repeat1(i_token(0))),
}, {}, {}, {}};
InitialSyntaxGrammar grammar{
{
Variable{"rule0", VariableTypeNamed, repeat1(i_token(0))},
},
{}, {}, {}
};
auto result = expand_repeats(grammar);
AssertThat(result.variables, Equals(vector<Variable>({
Variable("rule0", VariableTypeNamed, i_sym(1)),
Variable("rule0_repeat1", VariableTypeAuxiliary, choice({
AssertThat(result.variables, Equals(vector<Variable>{
Variable{"rule0", VariableTypeNamed, i_sym(1)},
Variable{"rule0_repeat1", VariableTypeAuxiliary, choice({
seq({ i_sym(1), i_token(0) }),
i_token(0),
})),
})));
})},
}));
});
it("replaces repeats inside of sequences", [&]() {
InitialSyntaxGrammar grammar{{
Variable("rule0", VariableTypeNamed, seq({
i_token(10),
repeat1(i_token(11)),
})),
}, {}, {}, {}};
InitialSyntaxGrammar grammar{
{
Variable{"rule0", VariableTypeNamed, seq({
i_token(10),
repeat1(i_token(11)),
})},
},
{}, {}, {}
};
auto result = expand_repeats(grammar);
AssertThat(result.variables, Equals(vector<Variable>({
Variable("rule0", VariableTypeNamed, seq({
AssertThat(result.variables, Equals(vector<Variable>{
Variable{"rule0", VariableTypeNamed, seq({
i_token(10),
i_sym(1),
})),
Variable("rule0_repeat1", VariableTypeAuxiliary, choice({
})},
Variable{"rule0_repeat1", VariableTypeAuxiliary, choice({
seq({ i_sym(1), i_token(11) }),
i_token(11)
})),
})));
})},
}));
});
it("replaces repeats inside of choices", [&]() {
InitialSyntaxGrammar grammar{{
Variable("rule0", VariableTypeNamed, choice({
i_token(10),
repeat1(i_token(11))
})),
}, {}, {}, {}};
InitialSyntaxGrammar grammar{
{
Variable{"rule0", VariableTypeNamed, choice({
i_token(10),
repeat1(i_token(11))
})},
},
{}, {}, {}
};
auto result = expand_repeats(grammar);
AssertThat(result.variables, Equals(vector<Variable>({
Variable("rule0", VariableTypeNamed, choice({
AssertThat(result.variables, Equals(vector<Variable>{
Variable{"rule0", VariableTypeNamed, choice({
i_token(10),
i_sym(1),
})),
Variable("rule0_repeat1", VariableTypeAuxiliary, choice({
})},
Variable{"rule0_repeat1", VariableTypeAuxiliary, choice({
seq({ i_sym(1), i_token(11) }),
i_token(11),
})),
})));
})},
}));
});
it("does not create redundant auxiliary rules", [&]() {
InitialSyntaxGrammar grammar{{
Variable("rule0", VariableTypeNamed, choice({
seq({ i_token(1), repeat1(i_token(4)) }),
seq({ i_token(2), repeat1(i_token(4)) }),
})),
Variable("rule1", VariableTypeNamed, seq({
i_token(3),
repeat1(i_token(4))
})),
}, {}, {}, {}};
InitialSyntaxGrammar grammar{
{
Variable{"rule0", VariableTypeNamed, choice({
seq({ i_token(1), repeat1(i_token(4)) }),
seq({ i_token(2), repeat1(i_token(4)) }),
})},
Variable{"rule1", VariableTypeNamed, seq({
i_token(3),
repeat1(i_token(4))
})},
},
{}, {}, {}
};
auto result = expand_repeats(grammar);
AssertThat(result.variables, Equals(vector<Variable>({
Variable("rule0", VariableTypeNamed, choice({
AssertThat(result.variables, Equals(vector<Variable>{
Variable{"rule0", VariableTypeNamed, choice({
seq({ i_token(1), i_sym(2) }),
seq({ i_token(2), i_sym(2) }),
})),
Variable("rule1", VariableTypeNamed, seq({
})},
Variable{"rule1", VariableTypeNamed, seq({
i_token(3),
i_sym(2),
})),
Variable("rule0_repeat1", VariableTypeAuxiliary, choice({
})},
Variable{"rule0_repeat1", VariableTypeAuxiliary, choice({
seq({ i_sym(2), i_token(4) }),
i_token(4),
})),
})));
})},
}));
});
it("can replace multiple repeats in the same rule", [&]() {
InitialSyntaxGrammar grammar{{
Variable("rule0", VariableTypeNamed, seq({
repeat1(i_token(10)),
repeat1(i_token(11)),
})),
}, {}, {}, {}};
InitialSyntaxGrammar grammar{
{
Variable{"rule0", VariableTypeNamed, seq({
repeat1(i_token(10)),
repeat1(i_token(11)),
})},
},
{}, {}, {}
};
auto result = expand_repeats(grammar);
AssertThat(result.variables, Equals(vector<Variable>({
Variable("rule0", VariableTypeNamed, seq({
AssertThat(result.variables, Equals(vector<Variable>{
Variable{"rule0", VariableTypeNamed, seq({
i_sym(1),
i_sym(2),
})),
Variable("rule0_repeat1", VariableTypeAuxiliary, choice({
})},
Variable{"rule0_repeat1", VariableTypeAuxiliary, choice({
seq({ i_sym(1), i_token(10) }),
i_token(10),
})),
Variable("rule0_repeat2", VariableTypeAuxiliary, choice({
})},
Variable{"rule0_repeat2", VariableTypeAuxiliary, choice({
seq({ i_sym(2), i_token(11) }),
i_token(11),
})),
})));
})},
}));
});
it("can replace repeats in multiple rules", [&]() {
InitialSyntaxGrammar grammar{{
Variable("rule0", VariableTypeNamed, repeat1(i_token(10))),
Variable("rule1", VariableTypeNamed, repeat1(i_token(11))),
}, {}, {}, {}};
InitialSyntaxGrammar grammar{
{
Variable{"rule0", VariableTypeNamed, repeat1(i_token(10))},
Variable{"rule1", VariableTypeNamed, repeat1(i_token(11))},
},
{}, {}, {}
};
auto result = expand_repeats(grammar);
AssertThat(result.variables, Equals(vector<Variable>({
Variable("rule0", VariableTypeNamed, i_sym(2)),
Variable("rule1", VariableTypeNamed, i_sym(3)),
Variable("rule0_repeat1", VariableTypeAuxiliary, choice({
AssertThat(result.variables, Equals(vector<Variable>{
Variable{"rule0", VariableTypeNamed, i_sym(2)},
Variable{"rule1", VariableTypeNamed, i_sym(3)},
Variable{"rule0_repeat1", VariableTypeAuxiliary, choice({
seq({ i_sym(2), i_token(10) }),
i_token(10),
})),
Variable("rule1_repeat1", VariableTypeAuxiliary, choice({
})},
Variable{"rule1_repeat1", VariableTypeAuxiliary, choice({
seq({ i_sym(3), i_token(11) }),
i_token(11),
})),
})));
})},
}));
});
});

View file

@ -15,89 +15,149 @@ describe("expand_tokens", []() {
describe("string rules", [&]() {
it("replaces strings with sequences of character sets", [&]() {
LexicalGrammar grammar{{
Variable("rule_A", VariableTypeNamed, seq({
i_sym(10),
str("xyz"),
i_sym(11),
})),
}, {}};
LexicalGrammar grammar{
{
LexicalVariable{
"rule_A",
VariableTypeNamed,
seq({
i_sym(10),
str("xyz"),
i_sym(11),
}),
false
}
},
{}
};
auto result = expand_tokens(grammar);
AssertThat(result.second, Equals(CompileError::none()));
AssertThat(result.first.variables, Equals(vector<Variable>({
Variable("rule_A", VariableTypeNamed, seq({
i_sym(10),
metadata(seq({
character({ 'x' }),
character({ 'y' }),
character({ 'z' }),
}), string_token_params),
i_sym(11),
})),
})));
AssertThat(result.first.variables, Equals(vector<LexicalVariable>{
LexicalVariable{
"rule_A",
VariableTypeNamed,
seq({
i_sym(10),
metadata(seq({
character({ 'x' }),
character({ 'y' }),
character({ 'z' }),
}), string_token_params),
i_sym(11),
}),
false
}
}));
});
it("handles strings containing non-ASCII UTF8 characters", [&]() {
LexicalGrammar grammar{{
Variable("rule_A", VariableTypeNamed, str("\u03B1 \u03B2")),
}, {}};
LexicalGrammar grammar{
{
LexicalVariable{
"rule_A",
VariableTypeNamed,
str("\u03B1 \u03B2"),
false
},
},
{}
};
auto result = expand_tokens(grammar);
AssertThat(result.first.variables, Equals(vector<Variable>({
Variable("rule_A", VariableTypeNamed, metadata(seq({
character({ 945 }),
character({ ' ' }),
character({ 946 }),
}), string_token_params)),
})));
AssertThat(result.first.variables, Equals(vector<LexicalVariable>{
LexicalVariable{
"rule_A",
VariableTypeNamed,
metadata(seq({
character({ 945 }),
character({ ' ' }),
character({ 946 }),
}), string_token_params),
false
}
}));
});
});
describe("regexp rules", [&]() {
it("replaces regexps with the equivalent rule tree", [&]() {
LexicalGrammar grammar{{
Variable("rule_A", VariableTypeNamed, seq({
i_sym(10),
pattern("x*"),
i_sym(11),
})),
}, {}};
LexicalGrammar grammar{
{
LexicalVariable{
"rule_A",
VariableTypeNamed,
seq({
i_sym(10),
pattern("x*"),
i_sym(11),
}),
false
}
},
{}
};
auto result = expand_tokens(grammar);
AssertThat(result.second, Equals(CompileError::none()));
AssertThat(result.first.variables, Equals(vector<Variable>({
Variable("rule_A", VariableTypeNamed, seq({
i_sym(10),
repeat(character({ 'x' })),
i_sym(11),
})),
})));
AssertThat(result.first.variables, Equals(vector<LexicalVariable>{
LexicalVariable{
"rule_A",
VariableTypeNamed,
seq({
i_sym(10),
repeat(character({ 'x' })),
i_sym(11),
}),
false
}
}));
});
it("handles regexps containing non-ASCII UTF8 characters", [&]() {
LexicalGrammar grammar{{
Variable("rule_A", VariableTypeNamed, pattern("[^\u03B1-\u03B4]*")),
}, {}};
LexicalGrammar grammar{
{
LexicalVariable{
"rule_A",
VariableTypeNamed,
pattern("[^\u03B1-\u03B4]*"),
false
}
},
{}
};
auto result = expand_tokens(grammar);
AssertThat(result.first.variables, Equals(vector<Variable>({
Variable("rule_A", VariableTypeNamed, repeat(character({ 945, 946, 947, 948 }, false))),
})));
AssertThat(result.first.variables, Equals(vector<LexicalVariable>{
LexicalVariable{
"rule_A",
VariableTypeNamed,
repeat(character({ 945, 946, 947, 948 }, false)),
false
}
}));
});
it("returns an error when the grammar contains an invalid regex", [&]() {
LexicalGrammar grammar{{
Variable("rule_A", VariableTypeNamed, seq({
pattern("("),
str("xyz"),
pattern("["),
}))
}, {}};
LexicalGrammar grammar{
{
LexicalVariable{
"rule_A",
VariableTypeNamed,
seq({
pattern("("),
str("xyz"),
pattern("["),
}),
false
},
},
{}
};
auto result = expand_tokens(grammar);

View file

@ -16,20 +16,25 @@ using prepare_grammar::InitialSyntaxGrammar;
describe("extract_tokens", []() {
it("moves strings, patterns, and sub-rules marked as tokens into the lexical grammar", [&]() {
auto result = extract_tokens(InternedGrammar{{
Variable("rule_A", VariableTypeNamed, repeat1(seq({
str("ab"),
pattern("cd*"),
choice({
i_sym(1),
i_sym(2),
token(repeat1(choice({ str("ef"), str("gh") }))),
}),
}))),
Variable("rule_B", VariableTypeNamed, pattern("ij+")),
Variable("rule_C", VariableTypeNamed, choice({ str("kl"), blank() })),
Variable("rule_D", VariableTypeNamed, repeat1(i_sym(3)))
}, {}, {}, {}});
auto result = extract_tokens(InternedGrammar{
{
Variable{"rule_A", VariableTypeNamed, repeat1(seq({
str("ab"),
pattern("cd*"),
choice({
i_sym(1),
i_sym(2),
token(repeat1(choice({ str("ef"), str("gh") }))),
}),
}))},
Variable{"rule_B", VariableTypeNamed, pattern("ij+")},
Variable{"rule_C", VariableTypeNamed, choice({ str("kl"), blank() })},
Variable{"rule_D", VariableTypeNamed, repeat1(i_sym(3))},
},
{},
{},
{}
});
InitialSyntaxGrammar &syntax_grammar = get<0>(result);
LexicalGrammar &lexical_grammar = get<1>(result);
@ -37,8 +42,8 @@ describe("extract_tokens", []() {
AssertThat(error, Equals(CompileError::none()));
AssertThat(syntax_grammar.variables, Equals(vector<Variable>({
Variable("rule_A", VariableTypeNamed, repeat1(seq({
AssertThat(syntax_grammar.variables, Equals(vector<Variable>{
Variable{"rule_A", VariableTypeNamed, repeat1(seq({
// This string is now the first token in the lexical grammar.
i_token(0),
@ -58,83 +63,88 @@ describe("extract_tokens", []() {
// This token rule is now the third rule in the lexical grammar.
i_token(2),
}),
}))),
}))},
Variable("rule_C", VariableTypeNamed, choice({ i_token(4), blank() })),
Variable("rule_D", VariableTypeNamed, repeat1(i_sym(2))),
})));
Variable{"rule_C", VariableTypeNamed, choice({ i_token(4), blank() })},
Variable{"rule_D", VariableTypeNamed, repeat1(i_sym(2))},
}));
AssertThat(lexical_grammar.variables, Equals(vector<Variable>({
AssertThat(lexical_grammar.variables, Equals(vector<LexicalVariable>({
// Strings become anonymous rules.
Variable("ab", VariableTypeAnonymous, str("ab")),
LexicalVariable{"ab", VariableTypeAnonymous, str("ab"), true},
// Patterns become hidden rules.
Variable("/cd*/", VariableTypeAuxiliary, pattern("cd*")),
LexicalVariable{"/cd*/", VariableTypeAuxiliary, pattern("cd*"), false},
// Rules marked as tokens become hidden rules.
Variable("/(ef|gh)*/", VariableTypeAuxiliary, repeat1(choice({
LexicalVariable{"/(ef|gh)*/", VariableTypeAuxiliary, repeat1(choice({
str("ef"),
str("gh")
}))),
})), false},
// This named rule was moved wholesale to the lexical grammar.
Variable("rule_B", VariableTypeNamed, pattern("ij+")),
LexicalVariable{"rule_B", VariableTypeNamed, pattern("ij+"), false},
// Strings become anonymous rules.
Variable("kl", VariableTypeAnonymous, str("kl")),
LexicalVariable{"kl", VariableTypeAnonymous, str("kl"), true},
})));
});
it("does not create duplicate tokens in the lexical grammar", [&]() {
auto result = extract_tokens(InternedGrammar{{
Variable("rule_A", VariableTypeNamed, seq({
str("ab"),
i_sym(0),
str("ab"),
})),
}, {}, {}, {}});
auto result = extract_tokens(InternedGrammar{
{
Variable{"rule_A", VariableTypeNamed, seq({
str("ab"),
i_sym(0),
str("ab"),
})},
},
{},
{},
{}
});
InitialSyntaxGrammar &syntax_grammar = get<0>(result);
LexicalGrammar &lexical_grammar = get<1>(result);
AssertThat(syntax_grammar.variables, Equals(vector<Variable>({
Variable("rule_A", VariableTypeNamed, seq({ i_token(0), i_sym(0), i_token(0) })),
})));
AssertThat(syntax_grammar.variables, Equals(vector<Variable> {
Variable {"rule_A", VariableTypeNamed, seq({ i_token(0), i_sym(0), i_token(0) })},
}));
AssertThat(lexical_grammar.variables, Equals(vector<Variable>({
Variable("ab", VariableTypeAnonymous, str("ab")),
})))
AssertThat(lexical_grammar.variables, Equals(vector<LexicalVariable> {
LexicalVariable {"ab", VariableTypeAnonymous, str("ab"), true},
}))
});
it("does not move entire rules into the lexical grammar if their content is used elsewhere in the grammar", [&]() {
auto result = extract_tokens(InternedGrammar{{
Variable("rule_A", VariableTypeNamed, seq({ i_sym(1), str("ab") })),
Variable("rule_B", VariableTypeNamed, str("cd")),
Variable("rule_C", VariableTypeNamed, seq({ str("ef"), str("cd") })),
Variable{"rule_A", VariableTypeNamed, seq({ i_sym(1), str("ab") })},
Variable{"rule_B", VariableTypeNamed, str("cd")},
Variable{"rule_C", VariableTypeNamed, seq({ str("ef"), str("cd") })},
}, {}, {}, {}});
InitialSyntaxGrammar &syntax_grammar = get<0>(result);
LexicalGrammar &lexical_grammar = get<1>(result);
AssertThat(syntax_grammar.variables, Equals(vector<Variable>({
Variable("rule_A", VariableTypeNamed, seq({ i_sym(1), i_token(0) })),
Variable("rule_B", VariableTypeNamed, i_token(1)),
Variable("rule_C", VariableTypeNamed, seq({ i_token(2), i_token(1) })),
Variable{"rule_A", VariableTypeNamed, seq({ i_sym(1), i_token(0) })},
Variable{"rule_B", VariableTypeNamed, i_token(1)},
Variable{"rule_C", VariableTypeNamed, seq({ i_token(2), i_token(1) })},
})));
AssertThat(lexical_grammar.variables, Equals(vector<Variable>({
Variable("ab", VariableTypeAnonymous, str("ab")),
Variable("cd", VariableTypeAnonymous, str("cd")),
Variable("ef", VariableTypeAnonymous, str("ef")),
})));
AssertThat(lexical_grammar.variables, Equals(vector<LexicalVariable> {
LexicalVariable {"ab", VariableTypeAnonymous, str("ab"), true},
LexicalVariable {"cd", VariableTypeAnonymous, str("cd"), true},
LexicalVariable {"ef", VariableTypeAnonymous, str("ef"), true},
}));
});
it("renumbers the grammar's expected conflict symbols based on any moved rules", [&]() {
auto result = extract_tokens(InternedGrammar{
{
Variable("rule_A", VariableTypeNamed, str("ok")),
Variable("rule_B", VariableTypeNamed, repeat(i_sym(0))),
Variable("rule_C", VariableTypeNamed, repeat(seq({ i_sym(0), i_sym(0) }))),
Variable{"rule_A", VariableTypeNamed, str("ok")},
Variable{"rule_B", VariableTypeNamed, repeat(i_sym(0))},
Variable{"rule_C", VariableTypeNamed, repeat(seq({ i_sym(0), i_sym(0) }))},
},
{
str(" ")
@ -155,12 +165,17 @@ describe("extract_tokens", []() {
describe("handling extra tokens", [&]() {
it("adds inline extra tokens to the lexical grammar's separators", [&]() {
auto result = extract_tokens(InternedGrammar{{
Variable("rule_A", VariableTypeNamed, str("x")),
}, {
str("y"),
pattern("\\s+"),
}, {}, {}});
auto result = extract_tokens(InternedGrammar{
{
Variable{"rule_A", VariableTypeNamed, str("x")},
},
{
str("y"),
pattern("\\s+"),
},
{},
{}
});
AssertThat(get<2>(result), Equals(CompileError::none()));
@ -172,12 +187,17 @@ describe("extract_tokens", []() {
});
it("handles inline extra tokens that match tokens in the grammar", [&]() {
auto result = extract_tokens(InternedGrammar{{
Variable("rule_A", VariableTypeNamed, str("x")),
Variable("rule_B", VariableTypeNamed, str("y")),
}, {
str("y"),
}, {}, {}});
auto result = extract_tokens(InternedGrammar{
{
Variable{"rule_A", VariableTypeNamed, str("x")},
Variable{"rule_B", VariableTypeNamed, str("y")},
},
{
str("y"),
},
{},
{}
});
AssertThat(get<2>(result), Equals(CompileError::none()));
AssertThat(get<1>(result).separators.size(), Equals<size_t>(0));
@ -185,13 +205,18 @@ describe("extract_tokens", []() {
});
it("updates extra symbols according to the new symbol numbers", [&]() {
auto result = extract_tokens(InternedGrammar{{
Variable("rule_A", VariableTypeNamed, seq({ str("w"), str("x"), i_sym(1) })),
Variable("rule_B", VariableTypeNamed, str("y")),
Variable("rule_C", VariableTypeNamed, str("z")),
}, {
i_sym(2),
}, {}, {}});
auto result = extract_tokens(InternedGrammar{
{
Variable{"rule_A", VariableTypeNamed, seq({ str("w"), str("x"), i_sym(1) })},
Variable{"rule_B", VariableTypeNamed, str("y")},
Variable{"rule_C", VariableTypeNamed, str("z")},
},
{
i_sym(2),
},
{},
{}
});
AssertThat(get<2>(result), Equals(CompileError::none()));
@ -204,8 +229,8 @@ describe("extract_tokens", []() {
it("returns an error if any extra tokens are non-token symbols", [&]() {
auto result = extract_tokens(InternedGrammar{{
Variable("rule_A", VariableTypeNamed, seq({ str("x"), i_sym(1) })),
Variable("rule_B", VariableTypeNamed, seq({ str("y"), str("z") })),
Variable{"rule_A", VariableTypeNamed, seq({ str("x"), i_sym(1) })},
Variable{"rule_B", VariableTypeNamed, seq({ str("y"), str("z") })},
}, { i_sym(1) }, {}, {}});
AssertThat(get<2>(result), !Equals(CompileError::none()));
@ -216,8 +241,8 @@ describe("extract_tokens", []() {
it("returns an error if any extra tokens are non-token rules", [&]() {
auto result = extract_tokens(InternedGrammar{{
Variable("rule_A", VariableTypeNamed, str("x")),
Variable("rule_B", VariableTypeNamed, str("y")),
Variable{"rule_A", VariableTypeNamed, str("x")},
Variable{"rule_B", VariableTypeNamed, str("y")},
}, { choice({ i_sym(1), blank() }) }, {}, {}});
AssertThat(get<2>(result), !Equals(CompileError::none()));
@ -231,8 +256,8 @@ describe("extract_tokens", []() {
it("returns an error if an external token has the same name as a non-terminal rule", [&]() {
auto result = extract_tokens(InternedGrammar{
{
Variable("rule_A", VariableTypeNamed, seq({ str("x"), i_sym(1) })),
Variable("rule_B", VariableTypeNamed, seq({ str("y"), str("z") })),
Variable{"rule_A", VariableTypeNamed, seq({ str("x"), i_sym(1) })},
Variable{"rule_B", VariableTypeNamed, seq({ str("y"), str("z") })},
},
{},
{},

View file

@ -12,7 +12,7 @@ using prepare_grammar::flatten_rule;
describe("flatten_grammar", []() {
it("associates each symbol with the precedence and associativity binding it to its successor", [&]() {
SyntaxVariable result = flatten_rule(Variable(
SyntaxVariable result = flatten_rule(Variable{
"test",
VariableTypeNamed,
seq({
@ -30,7 +30,7 @@ describe("flatten_grammar", []() {
})),
i_sym(7),
})
));
});
AssertThat(result.name, Equals("test"));
AssertThat(result.type, Equals(VariableTypeNamed));
@ -54,14 +54,14 @@ describe("flatten_grammar", []() {
});
it("uses the last assigned precedence", [&]() {
SyntaxVariable result = flatten_rule(Variable(
SyntaxVariable result = flatten_rule(Variable{
"test1",
VariableTypeNamed,
prec_left(101, seq({
i_sym(1),
i_sym(2),
}))
));
});
AssertThat(result.productions, Equals(vector<Production>({
Production({
@ -70,13 +70,13 @@ describe("flatten_grammar", []() {
})
})))
result = flatten_rule(Variable(
result = flatten_rule(Variable{
"test2",
VariableTypeNamed,
prec_left(101, seq({
i_sym(1),
}))
));
});
AssertThat(result.productions, Equals(vector<Production>({
Production({

View file

@ -15,27 +15,32 @@ using prepare_grammar::intern_symbols;
describe("intern_symbols", []() {
it("replaces named symbols with numerically-indexed symbols", [&]() {
Grammar grammar{{
{ "x", choice({ sym("y"), sym("_z") }) },
{ "y", sym("_z") },
{ "_z", str("stuff") }
}, {}, {}, {}};
Grammar grammar{
{
{"x", choice({ sym("y"), sym("_z") })},
{"y", sym("_z")},
{"_z", str("stuff")}
}, {}, {}, {}
};
auto result = intern_symbols(grammar);
AssertThat(result.second, Equals(CompileError::none()));
AssertThat(result.first.variables, Equals(vector<Variable>({
Variable("x", VariableTypeNamed, choice({ i_sym(1), i_sym(2) })),
Variable("y", VariableTypeNamed, i_sym(2)),
Variable("_z", VariableTypeHidden, str("stuff")),
})));
AssertThat(result.first.variables, Equals(vector<Variable>{
Variable{"x", VariableTypeNamed, choice({ i_sym(1), i_sym(2) })},
Variable{"y", VariableTypeNamed, i_sym(2)},
Variable{"_z", VariableTypeHidden, str("stuff")},
}));
});
describe("when there are symbols that reference undefined rules", [&]() {
it("returns an error", []() {
Grammar grammar{{
{ "x", sym("y") },
}, {}, {}, {}};
Grammar grammar{
{
{"x", sym("y")},
},
{}, {}, {}
};
auto result = intern_symbols(grammar);
@ -44,13 +49,17 @@ describe("intern_symbols", []() {
});
it("translates the grammar's optional 'extra_tokens' to numerical symbols", [&]() {
Grammar grammar{{
{ "x", choice({ sym("y"), sym("z") }) },
{ "y", sym("z") },
{ "z", str("stuff") }
}, {
sym("z")
}, {}, {}};
Grammar grammar{
{
{"x", choice({ sym("y"), sym("z") })},
{"y", sym("z")},
{"z", str("stuff")}
},
{
sym("z")
},
{}, {}
};
auto result = intern_symbols(grammar);
@ -60,29 +69,34 @@ describe("intern_symbols", []() {
});
it("records any rule names that match external token names", [&]() {
Grammar grammar{{
{ "x", choice({ sym("y"), sym("z") }) },
{ "y", sym("z") },
{ "z", str("stuff") }
}, {}, {}, {
"w",
"z"
}};
Grammar grammar{
{
{"x", choice({ sym("y"), sym("z") })},
{"y", sym("z")},
{"z", str("stuff")},
},
{},
{},
{
"w",
"z"
}
};
auto result = intern_symbols(grammar);
AssertThat(result.first.external_tokens, Equals(vector<ExternalToken>({
{
AssertThat(result.first.external_tokens, Equals(vector<ExternalToken>{
ExternalToken{
"w",
VariableTypeNamed,
rules::NONE()
},
{
ExternalToken{
"z",
VariableTypeNamed,
Symbol(2, Symbol::NonTerminal)
}
})))
},
}))
});
});

View file

@ -1,42 +0,0 @@
#include <tree_sitter/parser.h>
enum {
COMMENT,
};
void *tree_sitter_extra_external_tokens_external_scanner_create() {
return NULL;
}
void tree_sitter_extra_external_tokens_external_scanner_reset(void *payload) {
}
bool tree_sitter_extra_external_tokens_external_scanner_serialize(void *payload, TSExternalTokenState state) {
return true;
}
void tree_sitter_extra_external_tokens_external_scanner_deserialize(void *payload, TSExternalTokenState state) {
}
bool tree_sitter_extra_external_tokens_external_scanner_scan(
void *payload, TSLexer *lexer, const bool *whitelist) {
while (lexer->lookahead == ' ') {
lexer->advance(lexer, true);
}
if (lexer->lookahead == '#') {
lexer->advance(lexer, false);
while (lexer->lookahead != '\n') {
lexer->advance(lexer, false);
}
lexer->result_symbol = COMMENT;
return true;
}
return false;
}
void tree_sitter_extra_external_tokens_external_scanner_destroy(void *payload) {
}

View file

@ -0,0 +1,32 @@
================================================
anonymous tokens defined with character classes
================================================
1234
---
(first_rule)
=================================================
anonymous tokens defined with LF escape sequence
=================================================
---
(first_rule)
=================================================
anonymous tokens defined with CR escape sequence
=================================================
---
(first_rule)
================================================
anonymous tokens with quotes
================================================
'hello'
---
(first_rule)

View file

@ -0,0 +1,14 @@
{
"name": "anonymous_tokens_with_escaped_chars",
"rules": {
"first_rule": {
"type": "CHOICE",
"members": [
{"type": "STRING", "value": "\n"},
{"type": "STRING", "value": "\r"},
{"type": "STRING", "value": "'hello'"},
{"type": "PATTERN", "value": "\\d+"}
]
}
}
}

View file

@ -0,0 +1,8 @@
===================
chained operations
===================
x+y+z
---
(expression (math_operation
(expression (math_operation (expression (identifier)) (expression (identifier))))
(expression (identifier))))

View file

@ -0,0 +1,31 @@
{
"name": "associativity_left",
"rules": {
"expression": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "math_operation"},
{"type": "SYMBOL", "name": "identifier"}
]
},
"math_operation": {
"type": "PREC_LEFT",
"value": 0,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "+"},
{"type": "SYMBOL", "name": "expression"}
]
}
},
"identifier": {
"type": "PATTERN",
"value": "[a-zA-Z]+"
}
}
}

View file

@ -0,0 +1,13 @@
Unresolved conflict for symbol sequence:
expression '+' expression • '+' …
Possible interpretations:
1: (math_operation expression '+' expression) • '+' …
2: expression '+' (math_operation expression • '+' expression)
Possible resolutions:
1: Specify a left or right associativity in `math_operation`
2: Add a conflict for these rules: `math_operation`

View file

@ -0,0 +1,27 @@
{
"name": "associativity_missing",
"rules": {
"expression": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "math_operation"},
{"type": "SYMBOL", "name": "identifier"}
]
},
"math_operation": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "+"},
{"type": "SYMBOL", "name": "expression"}
]
},
"identifier": {
"type": "PATTERN",
"value": "[a-zA-Z]+"
}
}
}

View file

@ -0,0 +1,8 @@
===================
chained operations
===================
x+y+z
---
(expression (math_operation
(expression (identifier))
(expression (math_operation (expression (identifier)) (expression (identifier))))))

View file

@ -0,0 +1,31 @@
{
"name": "associativity_right",
"rules": {
"expression": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "math_operation"},
{"type": "SYMBOL", "name": "identifier"}
]
},
"math_operation": {
"type": "PREC_RIGHT",
"value": 0,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "+"},
{"type": "SYMBOL", "name": "expression"}
]
}
},
"identifier": {
"type": "PATTERN",
"value": "[a-zA-Z]+"
}
}
}

View file

@ -0,0 +1,15 @@
Unresolved conflict for symbol sequence:
expression '+' expression • '*' …
Possible interpretations:
1: (sum expression '+' expression) • '*' …
2: expression '+' (product expression • '*' expression)
3: expression '+' (other_thing expression • '*' '*')
Possible resolutions:
1: Specify a higher precedence in `product` and `other_thing` than in the other rules.
2: Specify a higher precedence in `sum` than in the other rules.
3: Add a conflict for these rules: `sum` `product` `other_thing`

View file

@ -0,0 +1,58 @@
{
"name": "conflicting_precedence",
"rules": {
"expression": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "sum"},
{"type": "SYMBOL", "name": "product"},
{"type": "SYMBOL", "name": "other_thing"}
]
},
"sum": {
"type": "PREC_LEFT",
"value": 0,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "+"},
{"type": "SYMBOL", "name": "expression"}
]
}
},
"product": {
"type": "PREC_LEFT",
"value": 1,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "*"},
{"type": "SYMBOL", "name": "expression"}
]
}
},
"other_thing": {
"type": "PREC_LEFT",
"value": -1,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "*"},
{"type": "STRING", "value": "*"}
]
}
},
"identifier": {
"type": "PATTERN",
"value": "[a-zA-Z]+"
}
}
}

View file

@ -0,0 +1,2 @@
The rule `rule_2` matches the empty string.
Tree-sitter currently does not support syntactic rules that match the empty string.

View file

@ -0,0 +1,15 @@
{
"name": "epsilon_rules",
"rules": {
"rule_1": {"type": "SYMBOL", "name": "rule_2"},
"rule_2": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "rule_1"},
{"type": "BLANK"}
]
}
}
}

View file

@ -0,0 +1,41 @@
=========================================
single-line statements - internal tokens
=========================================
a b
---
(statement (variable) (variable) (line_break))
=========================================
multi-line statements - internal tokens
=========================================
a
b
---
(statement (variable) (variable) (line_break))
=========================================
single-line statements - external tokens
=========================================
'hello' 'world'
---
(statement (string) (string) (line_break))
=========================================
multi-line statements - external tokens
=========================================
'hello'
'world'
---
(statement (string) (string) (line_break))

View file

@ -0,0 +1,36 @@
{
"name": "external_and_internal_tokens",
"externals": [
"string",
"line_break"
],
"extras": [
{"type": "PATTERN", "value": "\\s"}
],
"rules": {
"statement": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "_expression"},
{"type": "SYMBOL", "name": "_expression"},
{"type": "SYMBOL", "name": "line_break"}
]
},
"_expression": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "string"},
{"type": "SYMBOL", "name": "variable"},
{"type": "SYMBOL", "name": "number"}
]
},
"variable": {"type": "PATTERN", "value": "\\a+"},
"number": {"type": "PATTERN", "value": "\\d+"},
"line_break": {"type": "STRING", "value": "\n"}
}
}

View file

@ -0,0 +1 @@
This grammar has an external scanner whose `scan` method needs to be able to check for the validity of an *internal* token. This is done by including the names of that internal token (`_line_break`) in the grammar's `externals` field.

View file

@ -1,4 +1,3 @@
#include <stdbool.h>
#include <tree_sitter/parser.h>
enum {
@ -6,21 +5,17 @@ enum {
LINE_BREAK
};
void *tree_sitter_shared_external_tokens_external_scanner_create() {
return NULL;
}
void *tree_sitter_external_and_internal_tokens_external_scanner_create() { return NULL; }
void tree_sitter_shared_external_tokens_external_scanner_reset(void *payload) {
}
void tree_sitter_external_and_internal_tokens_external_scanner_destroy(void *payload) {}
bool tree_sitter_shared_external_tokens_external_scanner_serialize(void *payload, TSExternalTokenState state) {
return true;
}
void tree_sitter_external_and_internal_tokens_external_scanner_reset(void *payload) {}
void tree_sitter_shared_external_tokens_external_scanner_deserialize(void *payload, TSExternalTokenState state) {
}
bool tree_sitter_external_and_internal_tokens_external_scanner_serialize(void *payload, TSExternalTokenState state) { return true; }
bool tree_sitter_shared_external_tokens_external_scanner_scan(
void tree_sitter_external_and_internal_tokens_external_scanner_deserialize(void *payload, TSExternalTokenState state) {}
bool tree_sitter_external_and_internal_tokens_external_scanner_scan(
void *payload, TSLexer *lexer, const bool *whitelist) {
// If a line-break is a valid lookahead token, only skip spaces.
@ -58,6 +53,3 @@ bool tree_sitter_shared_external_tokens_external_scanner_scan(
return false;
}
void tree_sitter_shared_external_tokens_external_scanner_destroy(void *payload) {
}

View file

@ -0,0 +1,10 @@
========================
extra external tokens
========================
x = # a comment
y
---
(assignment (variable) (comment) (variable))

View file

@ -0,0 +1,25 @@
{
"name": "external_extra_tokens",
"externals": [
"comment"
],
"extras": [
{"type": "PATTERN", "value": "\\s"},
{"type": "SYMBOL", "name": "comment"}
],
"rules": {
"assignment": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "variable"},
{"type": "STRING", "value": "="},
{"type": "SYMBOL", "name": "variable"}
]
},
"variable": {"type": "PATTERN", "value": "\\a+"}
}
}

View file

@ -0,0 +1,36 @@
#include <tree_sitter/parser.h>
enum {
COMMENT,
};
// Stateless scanner: `create` allocates nothing, so destroy/reset are no-ops.
void *tree_sitter_external_extra_tokens_external_scanner_create() { return NULL; }
void tree_sitter_external_extra_tokens_external_scanner_destroy(void *payload) {}
void tree_sitter_external_extra_tokens_external_scanner_reset(void *payload) {}
// There is no state to persist; returning true reports a successful (empty) serialization.
bool tree_sitter_external_extra_tokens_external_scanner_serialize(void *payload, TSExternalTokenState state) { return true; }
void tree_sitter_external_extra_tokens_external_scanner_deserialize(void *payload, TSExternalTokenState state) {}
// Scans a `#`-to-end-of-line comment, skipping any leading spaces first.
// Returns true (with result_symbol set to COMMENT) when a comment was
// consumed, false otherwise. `payload` and `whitelist` are unused.
bool tree_sitter_external_extra_tokens_external_scanner_scan(
  void *payload, TSLexer *lexer, const bool *whitelist) {
  // Skip leading spaces without including them in the token.
  while (lexer->lookahead == ' ') {
    lexer->advance(lexer, true);
  }

  if (lexer->lookahead == '#') {
    lexer->advance(lexer, false);

    // Consume up to the newline, but also stop at end-of-input
    // (lookahead == 0): the original loop only tested for '\n' and would
    // spin forever on an unterminated comment at the end of the file.
    while (lexer->lookahead != '\n' && lexer->lookahead != 0) {
      lexer->advance(lexer, false);
    }

    lexer->result_symbol = COMMENT;
    return true;
  }

  return false;
}

View file

@ -0,0 +1,22 @@
========================
simple external tokens
=========================
x + %(sup (external) scanner?)
---
(expression (sum (expression (identifier)) (expression (string))))
==================================
external tokens that require state
==================================
%{sup {} #{x + y} {} scanner?}
---
(expression (string
(expression (sum
(expression (identifier))
(expression (identifier))))))

View file

@ -0,0 +1,57 @@
{
"name": "external_tokens",
"externals": [
"_percent_string",
"_percent_string_start",
"_percent_string_end"
],
"extras": [
{"type": "PATTERN", "value": "\\s"}
],
"rules": {
"expression": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "string"},
{"type": "SYMBOL", "name": "sum"},
{"type": "SYMBOL", "name": "identifier"}
]
},
"sum": {
"type": "PREC_LEFT",
"value": 0,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "+"},
{"type": "SYMBOL", "name": "expression"}
]
}
},
"string": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "_percent_string"},
{
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "_percent_string_start"},
{"type": "SYMBOL", "name": "expression"},
{"type": "SYMBOL", "name": "_percent_string_end"}
]
        }
      ]
},
"identifier": {
"type": "PATTERN",
"value": "\\a+"
}
}
}

View file

@ -1,4 +1,3 @@
#include <stdbool.h>
#include <tree_sitter/parser.h>
enum {
@ -13,7 +12,7 @@ typedef struct {
uint32_t depth;
} Scanner;
void *tree_sitter_external_scanner_example_external_scanner_create() {
void *tree_sitter_external_tokens_external_scanner_create() {
Scanner *scanner = malloc(sizeof(Scanner));
*scanner = (Scanner){
.open_delimiter = 0,
@ -23,7 +22,17 @@ void *tree_sitter_external_scanner_example_external_scanner_create() {
return scanner;
}
bool tree_sitter_external_scanner_example_external_scanner_scan(
void tree_sitter_external_tokens_external_scanner_destroy(void *payload) {
free(payload);
}
void tree_sitter_external_tokens_external_scanner_reset(void *payload) {}
bool tree_sitter_external_tokens_external_scanner_serialize(void *payload, TSExternalTokenState state) { return true; }
void tree_sitter_external_tokens_external_scanner_deserialize(void *payload, TSExternalTokenState state) {}
bool tree_sitter_external_tokens_external_scanner_scan(
void *payload, TSLexer *lexer, const bool *whitelist) {
Scanner *scanner = payload;
@ -103,16 +112,3 @@ bool tree_sitter_external_scanner_example_external_scanner_scan(
return false;
}
void tree_sitter_external_scanner_example_external_scanner_reset(void *payload) {
}
bool tree_sitter_external_scanner_example_external_scanner_serialize(void *payload, TSExternalTokenState state) {
return true;
}
void tree_sitter_external_scanner_example_external_scanner_deserialize(void *payload, TSExternalTokenState state) {
}
void tree_sitter_external_scanner_example_external_scanner_destroy(void *payload) {
free(payload);
}

View file

@ -0,0 +1,33 @@
========================
regexes
========================
/a+/
---
(expression (regex))
========================
conditionals
========================
(if (1) /a+/)
---
(expression (parenthesized (expression (conditional
(parenthesized (expression (number)))
(expression (regex))))))
========================
quotients
========================
((1) / 2)
---
(expression (parenthesized (expression (quotient
(expression (parenthesized (expression (number))))
(expression (number))))))

View file

@ -0,0 +1,65 @@
{
"name": "lexical_conflicts_due_to_state_merging",
"extras": [
{"type": "PATTERN", "value": "\\s"}
],
"rules": {
"expression": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "conditional"},
{"type": "SYMBOL", "name": "regex"},
{"type": "SYMBOL", "name": "quotient"},
{"type": "SYMBOL", "name": "number"},
{"type": "SYMBOL", "name": "parenthesized"}
]
},
"conditional": {
"type": "PREC_LEFT",
"value": 1,
"content": {
"type": "SEQ",
"members": [
{"type": "STRING", "value": "if"},
{"type": "SYMBOL", "name": "parenthesized"},
{"type": "SYMBOL", "name": "expression"}
]
}
},
"quotient": {
"type": "PREC_LEFT",
"value": 0,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "/"},
{"type": "SYMBOL", "name": "expression"}
]
}
},
"regex": {
"type": "PATTERN",
"value": "/[^/\n]+/"
},
"number": {
"type": "PATTERN",
"value": "\\d+"
},
"parenthesized": {
"type": "SEQ",
"members": [
{"type": "STRING", "value": "("},
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": ")"}
]
}
}
}

View file

@ -0,0 +1,20 @@
This grammar has two tokens, `regex` and `/`, which conflict: when a `/` character is encountered, the lexer can't tell if it is part of a `/` token or a `regex` by looking ahead only one character. But because these tokens are never valid in the same position, this doesn't cause any problem.
When merging similar parse states in order to reduce the size of the parse table, it is important that we avoid merging states in a way that causes these two tokens to both appear as valid lookahead symbols in a given state.
If we weren't careful, this grammar would cause that to happen, because a `regex` is valid in this state:
```
(if (1) /\w+/)
^
```
and a `/` is valid in this state:
```
((1) / 2)
^
```
And these two states would otherwise be candidates for merging, because they both contain only the action `reduce(parenthesized, 3)`.

View file

@ -0,0 +1,15 @@
Unresolved conflict for symbol sequence:
identifier • '{' …
Possible interpretations:
1: (expression identifier) • '{' …
2: (function_call identifier • block)
Possible resolutions:
1: Specify a higher precedence in `function_call` than in the other rules.
2: Specify a higher precedence in `expression` than in the other rules.
3: Specify a left or right associativity in `expression`
4: Add a conflict for these rules: `expression` `function_call`

View file

@ -0,0 +1,63 @@
{
"name": "precedence_on_single_child_missing",
"extras": [
{"type": "PATTERN", "value": "\\s"}
],
"rules": {
"expression": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "function_call"},
{"type": "SYMBOL", "name": "identifier"}
]
},
"function_call": {
"type": "PREC_RIGHT",
"value": 0,
"content": {
"type": "CHOICE",
"members": [
{
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "identifier"},
{"type": "SYMBOL", "name": "expression"}
]
},
{
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "identifier"},
{"type": "SYMBOL", "name": "block"}
]
},
{
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "identifier"},
{"type": "SYMBOL", "name": "expression"},
{"type": "SYMBOL", "name": "block"}
]
}
]
}
},
"block": {
"type": "SEQ",
"members": [
{"type": "STRING", "value": "{"},
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "}"}
]
},
"identifier": {
"type": "PATTERN",
"value": "[a-zA-Z]+"
}
}
}

View file

@ -0,0 +1,14 @@
This language has function calls similar to Ruby's, with no parentheses required, and optional blocks.
There is a shift/reduce conflict here:
```
foo bar { baz }
^
```
The possible actions are:
1. `reduce(expression, 1)` - `bar` is an expression being passed to the `foo` function.
2. `shift` - `bar` is a function being called with the block `{ baz }`
The grammars `precedence_on_single_child_negative` and `precedence_on_single_child_positive` show possible resolutions to this conflict.

View file

@ -0,0 +1,12 @@
===========================
function calls with blocks
===========================
foo bar { baz }
---
(expression (function_call
(identifier)
(expression (identifier))
(block (expression (identifier)))))

View file

@ -0,0 +1,63 @@
{
"name": "precedence_on_single_child_negative",
"extras": [
{"type": "PATTERN", "value": "\\s"}
],
"rules": {
"expression": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "function_call"},
{"type": "SYMBOL", "name": "identifier"}
]
},
"function_call": {
"type": "PREC_RIGHT",
"value": -1,
"content": {
"type": "CHOICE",
"members": [
{
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "identifier"},
{"type": "SYMBOL", "name": "expression"}
]
},
{
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "identifier"},
{"type": "SYMBOL", "name": "block"}
]
},
{
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "identifier"},
{"type": "SYMBOL", "name": "expression"},
{"type": "SYMBOL", "name": "block"}
]
}
]
}
},
"block": {
"type": "SEQ",
"members": [
{"type": "STRING", "value": "{"},
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "}"}
]
},
"identifier": {
"type": "PATTERN",
"value": "[a-zA-Z]+"
}
}
}

View file

@ -0,0 +1 @@
This grammar resolves the conflict shown in the `precedence_on_single_child_missing` grammar by giving `function_call` a negative precedence. This causes reducing the `bar` variable to an expression to be preferred over shifting the `{` token as part of `function_call`.

View file

@ -0,0 +1,13 @@
===========================
function calls with blocks
===========================
foo bar { baz }
---
(expression (function_call
(identifier)
(expression (function_call
(identifier)
(block (expression (identifier)))))))

View file

@ -0,0 +1,63 @@
{
"name": "precedence_on_single_child_positive",
"extras": [
{"type": "PATTERN", "value": "\\s"}
],
"rules": {
"expression": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "function_call"},
{"type": "SYMBOL", "name": "identifier"}
]
},
"function_call": {
"type": "PREC_RIGHT",
"value": 1,
"content": {
"type": "CHOICE",
"members": [
{
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "identifier"},
{"type": "SYMBOL", "name": "expression"}
]
},
{
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "identifier"},
{"type": "SYMBOL", "name": "block"}
]
},
{
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "identifier"},
{"type": "SYMBOL", "name": "expression"},
{"type": "SYMBOL", "name": "block"}
]
}
]
}
},
"block": {
"type": "SEQ",
"members": [
{"type": "STRING", "value": "{"},
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "}"}
]
},
"identifier": {
"type": "PATTERN",
"value": "[a-zA-Z]+"
}
}
}

View file

@ -0,0 +1 @@
This grammar resolves the conflict shown in the `precedence_on_single_child_missing` grammar by giving `function_call` a positive precedence. This causes shifting the `{` token as part of `function_call` to be preferred over reducing the `bar` variable to an expression.

View file

@ -0,0 +1,24 @@
==========================================
curly brace blocks with high precedence
==========================================
a b {}
---
(expression (function_call
(identifier)
(expression (function_call (identifier) (block)))))
==========================================
do blocks with low precedence
==========================================
a b do end
---
(expression (function_call
(identifier)
(expression (identifier))
(do_block)))

View file

@ -0,0 +1,135 @@
{
"name": "precedence_on_subsequence",
"extras": [
{"type": "PATTERN", "value": "\\s"}
],
"rules": {
"expression": {
"type": "PREC_LEFT",
"value": 0,
"content": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "function_call"},
{"type": "SYMBOL", "name": "identifier"},
{"type": "SYMBOL", "name": "scope_resolution"}
]
}
},
"function_call": {
"type": "CHOICE",
"members": [
{
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "identifier"},
{"type": "SYMBOL", "name": "expression"}
]
},
{
"type": "PREC",
"value": 1,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "identifier"},
{"type": "SYMBOL", "name": "block"}
]
}
},
{
"type": "PREC",
"value": -1,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "identifier"},
{"type": "SYMBOL", "name": "do_block"}
]
}
},
{
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "identifier"},
{
"type": "PREC",
"value": 1,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "SYMBOL", "name": "block"}
]
}
}
]
},
{
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "identifier"},
{
"type": "PREC",
"value": -1,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "SYMBOL", "name": "do_block"}
]
}
}
]
}
]
},
"scope_resolution": {
"type": "PREC_LEFT",
"value": 1,
"content": {
"type": "CHOICE",
"members": [
{
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "::"},
{"type": "SYMBOL", "name": "expression"}
]
},
{
"type": "SEQ",
"members": [
{"type": "STRING", "value": "::"},
              {"type": "SYMBOL", "name": "expression"}
            ]
}
]
}
},
"block": {
"type": "STRING",
"value": "{}"
},
"do_block": {
"type": "STRING",
"value": "do end"
},
"identifier": {
"type": "PATTERN",
"value": "[a-zA-Z]+"
}
}
}

3
spec/fixtures/test_grammars/readme.md vendored Normal file
View file

@ -0,0 +1,3 @@
These small grammars demonstrate specific features or test for certain specific regressions.
For some of them, compilation is expected to fail with a given error message. For others, the resulting parser is expected to produce certain trees.

View file

@ -0,0 +1,13 @@
==================================
the readme example
==================================
a + b * c
---
(expression (sum
(expression (variable))
(expression (product
(expression (variable))
(expression (variable))))))

View file

@ -0,0 +1,67 @@
{
"name": "readme_grammar",
// Things that can appear anywhere in the language, like comments
// and whitespace, are expressed as 'extras'.
"extras": [
{"type": "PATTERN", "value": "\\s"},
{"type": "SYMBOL", "name": "comment"}
],
"rules": {
// The first rule listed in the grammar becomes the 'start rule'.
"expression": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "sum"},
{"type": "SYMBOL", "name": "product"},
{"type": "SYMBOL", "name": "number"},
{"type": "SYMBOL", "name": "variable"},
{
"type": "SEQ",
"members": [
{"type": "STRING", "value": "("},
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": ")"}
]
}
]
},
// Tokens like '+' and '*' are described directly within the
  // grammar's rules, as opposed to in a separate lexer description.
"sum": {
"type": "PREC_LEFT",
"value": 1,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "+"},
{"type": "SYMBOL", "name": "expression"}
]
}
},
// Ambiguities can be resolved at compile time by assigning precedence
// values to rule subtrees.
"product": {
"type": "PREC_LEFT",
"value": 2,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "*"},
{"type": "SYMBOL", "name": "expression"}
]
}
},
// Tokens can be specified using ECMAScript regexps.
"number": {"type": "PATTERN", "value": "\\d+"},
"comment": {"type": "PATTERN", "value": "#.*"},
"variable": {"type": "PATTERN", "value": "[a-zA-Z]\\w*"}
}
}

View file

@ -0,0 +1,7 @@
========================
the empty string
=======================
---
(first_rule)

View file

@ -0,0 +1,6 @@
{
"name": "start_rule_is_blank",
"rules": {
"first_rule": {"type": "BLANK"}
}
}

View file

@ -0,0 +1,6 @@
===========================
the single token
==========================
the-value
---
(first_rule)

View file

@ -0,0 +1,6 @@
{
"name": "start_rule_is_token",
"rules": {
"first_rule": {"type": "STRING", "value": "the-value"}
}
}

View file

@ -0,0 +1,61 @@
#include "helpers/file_helpers.h"
#include <sys/stat.h>
#include <errno.h>
#include <fstream>
#include <dirent.h>
using std::string;
using std::ifstream;
using std::istreambuf_iterator;
using std::ofstream;
using std::vector;
// True when `path` names an existing filesystem entry, i.e. stat() succeeds.
bool file_exists(const string &path) {
  struct stat info;
  const int status = stat(path.c_str(), &info);
  return status == 0;
}
// Returns the last-modified time of `path` (as a unix timestamp), or 0 when
// the path cannot be stat'ed. A plain missing file (ENOENT) is silent; any
// other stat() failure is reported on stderr.
int get_modified_time(const string &path) {
  struct stat file_stat;
  if (stat(path.c_str(), &file_stat) != 0) {
    if (errno != ENOENT) {
      // The original passed `+ path.c_str()` here — a stray unary plus left
      // over from string concatenation; the argument is now passed directly.
      fprintf(stderr, "Error in stat() for path: %s\n", path.c_str());
    }
    return 0;
  }
  return file_stat.st_mtime;
}
// Slurps the entire file at `path` into a string (empty if it can't be read).
string read_file(const string &path) {
  ifstream stream(path);
  string text{istreambuf_iterator<char>(stream), istreambuf_iterator<char>()};
  stream.close();
  return text;
}
// Replaces the contents of the file at `path` with `content`, creating the
// file if needed.
void write_file(const string &path, const string &content) {
  ofstream out(path);
  out << content;
  out.close();
}
// Returns the names of the entries in directory `path`, excluding the "."
// and ".." pseudo-entries. On a missing directory, prints a test error and
// returns an empty list.
vector<string> list_directory(const string &path) {
  vector<string> entries;

  DIR *handle = opendir(path.c_str());
  if (handle == nullptr) {
    printf("\nTest error - no such directory '%s'", path.c_str());
    return entries;
  }

  for (struct dirent *entry = readdir(handle); entry != nullptr;
       entry = readdir(handle)) {
    const string name{entry->d_name};
    if (name != "." && name != "..") {
      entries.push_back(name);
    }
  }

  closedir(handle);
  return entries;
}

View file

@ -0,0 +1,14 @@
#ifndef HELPERS_FILE_HELPERS_H_
#define HELPERS_FILE_HELPERS_H_
#include <string>
#include <vector>
#include <sys/stat.h>
// Small filesystem helpers shared by the specs.
// True if `path` exists (stat() succeeds).
bool file_exists(const std::string &path);
// Last-modified time of `path` as a unix timestamp, or 0 if it cannot be stat'ed.
int get_modified_time(const std::string &path);
// Entire contents of the file at `path`.
std::string read_file(const std::string &path);
// Replaces the contents of the file at `path` with `content`.
void write_file(const std::string &path, const std::string &content);
// Entry names in directory `path`, excluding "." and "..".
std::vector<std::string> list_directory(const std::string &path);
#endif  // HELPERS_FILE_HELPERS_H_

View file

@ -1,12 +1,12 @@
#include "spec_helper.h"
#include "helpers/load_language.h"
#include "helpers/file_helpers.h"
#include <unistd.h>
#include <dlfcn.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <map>
#include <string>
#include <sys/stat.h>
#include <fstream>
#include <stdlib.h>
#include "tree_sitter/compiler.h"
@ -54,25 +54,10 @@ static std::string run_command(const char *cmd, const char *args[]) {
}
}
static bool file_exists(const string &path) {
struct stat file_stat;
return stat(path.c_str(), &file_stat) == 0;
}
static int get_modified_time(const string &path) {
struct stat file_stat;
if (stat(path.c_str(), &file_stat) != 0) {
if (errno != ENOENT)
fprintf(stderr, "Error in stat() for path: %s\n", + path.c_str());
return 0;
}
return file_stat.st_mtime;
}
const TSLanguage *load_language(const string &source_filename,
const string &lib_filename,
const string &language_name,
string external_scanner_filename = "") {
static const TSLanguage *load_language(const string &source_filename,
const string &lib_filename,
const string &language_name,
string external_scanner_filename = "") {
string language_function_name = "tree_sitter_" + language_name;
string header_dir = getenv("PWD") + string("/include");
int source_mtime = get_modified_time(source_filename);
@ -132,9 +117,9 @@ const TSLanguage *load_language(const string &source_filename,
return reinterpret_cast<TSLanguage *(*)()>(language_function)();
}
const TSLanguage *load_compile_result(const string &name,
const TSCompileResult &compile_result,
string external_scanner_path) {
const TSLanguage *load_test_language(const string &name,
const TSCompileResult &compile_result,
string external_scanner_path) {
if (compile_result.error_type != TSCompileErrorTypeNone) {
Assert::Failure(string("Compilation failed ") + compile_result.error_message);
return nullptr;
@ -155,7 +140,7 @@ const TSLanguage *load_compile_result(const string &name,
return language;
}
const TSLanguage *get_test_language(const string &language_name) {
const TSLanguage *load_real_language(const string &language_name) {
if (loaded_languages[language_name])
return loaded_languages[language_name];
@ -182,20 +167,14 @@ const TSLanguage *get_test_language(const string &language_name) {
if (parser_mtime < grammar_mtime || parser_mtime < libcompiler_mtime) {
printf("\n" "Regenerating the %s parser...\n", language_name.c_str());
ifstream grammar_file(grammar_filename);
istreambuf_iterator<char> grammar_file_iterator(grammar_file), end_iterator;
string grammar_json(grammar_file_iterator, end_iterator);
grammar_file.close();
string grammar_json = read_file(grammar_filename);
TSCompileResult result = ts_compile_grammar(grammar_json.c_str());
if (result.error_type != TSCompileErrorTypeNone) {
fprintf(stderr, "Failed to compile %s grammar: %s\n", language_name.c_str(), result.error_message);
return nullptr;
}
ofstream parser_file(parser_filename);
parser_file << result.code;
parser_file.close();
write_file(parser_filename, result.code);
}
mkdir("out/tmp", 0777);

View file

@ -5,8 +5,10 @@
#include "tree_sitter/runtime.h"
#include <string>
const TSLanguage *load_compile_result(const std::string &, const TSCompileResult &,
std::string external_scanner_path = "");
const TSLanguage *get_test_language(const std::string &language_name);
const TSLanguage *load_real_language(const std::string &name);
const TSLanguage *load_test_language(const std::string &name,
const TSCompileResult &compile_result,
std::string external_scanner_path = "");
#endif // HELPERS_LOAD_LANGUAGE_H_

View file

@ -1,20 +1,18 @@
#include "helpers/read_test_entries.h"
#include <assert.h>
#include <string>
#include <fstream>
#include <streambuf>
#include <dirent.h>
#include <regex>
#include "helpers/file_helpers.h"
using std::regex;
using std::regex_search;
using std::regex_replace;
using std::smatch;
using std::regex_constants::extended;
using std::smatch;
using std::string;
using std::vector;
using std::ifstream;
using std::istreambuf_iterator;
string fixtures_dir = "spec/fixtures/";
static string trim_output(const string &input) {
string result(input);
@ -27,7 +25,7 @@ static string trim_output(const string &input) {
static vector<TestEntry> parse_test_entries(string content) {
regex header_pattern("===+\n" "([^=]+)\n" "===+\n", extended);
regex separator_pattern("---+\n", extended);
regex separator_pattern("---+\r?\n", extended);
vector<string> descriptions;
vector<string> bodies;
@ -55,51 +53,42 @@ static vector<TestEntry> parse_test_entries(string content) {
body.substr(0, matches.position() - 1),
trim_output(body.substr(matches.position() + matches[0].length()))
});
} else {
puts(("Invalid corpus entry with description: " + descriptions[i]).c_str());
abort();
}
}
return result;
}
static vector<string> list_directory(string dir_name) {
vector<string> result;
DIR *dir = opendir(dir_name.c_str());
if (!dir) {
printf("\nTest error - no such directory '%s'", dir_name.c_str());
return result;
}
struct dirent *dir_entry;
while ((dir_entry = readdir(dir))) {
string name(dir_entry->d_name);
if (name != "." && name != "..")
result.push_back(dir_name + "/" + name);
}
closedir(dir);
return result;
}
static string read_file(string filename) {
ifstream file(filename);
string result((istreambuf_iterator<char>(file)), istreambuf_iterator<char>());
return result;
}
vector<TestEntry> read_corpus_entries(string language_name) {
vector<TestEntry> read_real_language_corpus(string language_name) {
vector<TestEntry> result;
string fixtures_dir = "spec/fixtures/";
string test_directory = fixtures_dir + "grammars/" + language_name + "/grammar_test";
for (string &test_filename : list_directory(test_directory))
for (TestEntry &entry : parse_test_entries(read_file(test_filename)))
for (string &test_filename : list_directory(test_directory)) {
for (TestEntry &entry : parse_test_entries(read_file(test_directory + "/" + test_filename))) {
result.push_back(entry);
}
}
string error_test_filename = fixtures_dir + "/error_corpus/" + language_name + "_errors.txt";
for (TestEntry &entry : parse_test_entries(read_file(error_test_filename)))
for (TestEntry &entry : parse_test_entries(read_file(error_test_filename))) {
result.push_back(entry);
}
return result;
}
vector<TestEntry> read_test_language_corpus(string language_name) {
vector<TestEntry> result;
string test_directory = fixtures_dir + "test_grammars/" + language_name;
for (string &test_filename : list_directory(test_directory)) {
for (TestEntry &entry : parse_test_entries(read_file(test_directory + "/" + test_filename))) {
result.push_back(entry);
}
}
return result;
}

View file

@ -10,6 +10,7 @@ struct TestEntry {
std::string tree_string;
};
std::vector<TestEntry> read_corpus_entries(std::string directory);
std::vector<TestEntry> read_real_language_corpus(std::string name);
std::vector<TestEntry> read_test_language_corpus(std::string name);
#endif

View file

@ -1,6 +1,8 @@
#include "rule_helpers.h"
#include <memory>
#include "compiler/rules/symbol.h"
#include "compiler/variable.h"
#include "compiler/lexical_grammar.h"
namespace tree_sitter {
using std::make_shared;
@ -52,4 +54,9 @@ namespace tree_sitter {
return left.name == right.name && left.rule->operator==(*right.rule) &&
left.type == right.type;
}
bool operator==(const LexicalVariable &left, const LexicalVariable &right) {
return left.name == right.name && left.rule->operator==(*right.rule) &&
left.type == right.type && left.is_string == right.is_string;
}
}

View file

@ -15,7 +15,11 @@ namespace tree_sitter {
rule_ptr i_token(size_t index);
rule_ptr active_prec(int precedence, rule_ptr);
struct Variable;
struct LexicalVariable;
bool operator==(const Variable &left, const Variable &right);
bool operator==(const LexicalVariable &left, const LexicalVariable &right);
}
#endif // HELPERS_RULE_HELPERS_H_

View file

@ -3,6 +3,7 @@
#include "tree_sitter/compiler.h"
#include "compiler/parse_table.h"
#include "compiler/syntax_grammar.h"
#include "compiler/lexical_grammar.h"
#include "compiler/build_tables/parse_item.h"
#include "compiler/build_tables/lex_item.h"
@ -41,6 +42,11 @@ ostream &operator<<(ostream &stream, const SyntaxVariable &variable) {
return stream << string("{") << variable.name << string(", ") << variable.productions << string(", ") << to_string(variable.type) << string("}");
}
ostream &operator<<(ostream &stream, const LexicalVariable &variable) {
return stream << "{" << variable.name << ", " << variable.rule << ", " <<
to_string(variable.type) << ", " << to_string(variable.is_string) << "}";
}
std::ostream &operator<<(std::ostream &stream, const AdvanceAction &action) {
return stream << string("#<advance ") + to_string(action.state_index) + ">";
}

View file

@ -93,10 +93,11 @@ using std::string;
using std::to_string;
struct Variable;
struct SyntaxVariable;
struct LexicalVariable;
struct AdvanceAction;
struct AcceptTokenAction;
class ParseAction;
class ParseState;
struct ParseAction;
struct ParseState;
struct ExternalToken;
struct ProductionStep;
struct PrecedenceRange;
@ -107,6 +108,7 @@ ostream &operator<<(ostream &, const Rule &);
ostream &operator<<(ostream &, const rule_ptr &);
ostream &operator<<(ostream &, const Variable &);
ostream &operator<<(ostream &, const SyntaxVariable &);
ostream &operator<<(ostream &, const LexicalVariable &);
ostream &operator<<(ostream &, const AdvanceAction &);
ostream &operator<<(ostream &, const AcceptTokenAction &);
ostream &operator<<(ostream &, const ParseAction &);
@ -119,8 +121,8 @@ namespace build_tables {
class LexItem;
class LexItemSet;
class ParseItem;
class ParseItemSet;
struct ParseItem;
struct ParseItemSet;
class LookaheadSet;
ostream &operator<<(ostream &, const LexItem &);

View file

@ -1,847 +0,0 @@
#include "spec_helper.h"
#include "runtime/alloc.h"
#include "helpers/load_language.h"
#include "helpers/stderr_logger.h"
#include "helpers/dedent.h"
#include "compiler/util/string_helpers.h"
#include <map>
// Expand a template string: every `{{key}}` placeholder in `input` is
// replaced with the matching value from `parameters`.
static string fill_template(string input, map<string, string> parameters) {
  string expanded = input;
  for (const auto &entry : parameters) {
    const string placeholder = "{{" + entry.first + "}}";
    util::str_replace(&expanded, placeholder, entry.second);
  }
  return expanded;
}
START_TEST
describe("compile_grammar", []() {
TSDocument *document;
before_each([&]() {
document = ts_document_new();
});
after_each([&]() {
ts_document_free(document);
});
auto assert_root_node = [&](const string &expected_string) {
TSNode root_node = ts_document_root_node(document);
char *node_string = ts_node_string(root_node, document);
AssertThat(node_string, Equals(expected_string));
ts_free(node_string);
};
describe("conflicts", [&]() {
it("can resolve shift/reduce conflicts using associativities", [&]() {
string grammar_template = R"JSON({
"name": "associativity_example",
"rules": {
"expression": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "math_operation"},
{"type": "SYMBOL", "name": "identifier"}
]
},
"math_operation": {
"type": "{{math_operation_prec_type}}",
"value": 0,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "+"},
{"type": "SYMBOL", "name": "expression"}
]
}
},
"identifier": {
"type": "PATTERN",
"value": "[a-zA-Z]+"
}
}
})JSON";
// Ambiguity, which '+' applies first?
ts_document_set_input_string(document, "x+y+z");
TSCompileResult result = ts_compile_grammar(fill_template(grammar_template, {
{"math_operation_prec_type", "PREC"}
}).c_str());
AssertThat(result.error_message, Equals(dedent(R"MESSAGE(
Unresolved conflict for symbol sequence:
expression '+' expression '+'
Possible interpretations:
1: (math_operation expression '+' expression) '+'
2: expression '+' (math_operation expression '+' expression)
Possible resolutions:
1: Specify a left or right associativity in `math_operation`
2: Add a conflict for these rules: `math_operation`
)MESSAGE")));
result = ts_compile_grammar(fill_template(grammar_template, {
{"math_operation_prec_type", "PREC_LEFT"}
}).c_str());
ts_document_set_language(document, load_compile_result("associativity_example", result));
ts_document_parse(document);
assert_root_node("(expression (math_operation "
"(expression (math_operation (expression (identifier)) (expression (identifier)))) "
"(expression (identifier))))");
result = ts_compile_grammar(fill_template(grammar_template, {
{"math_operation_prec_type", "PREC_RIGHT"}
}).c_str());
ts_document_set_language(document, load_compile_result("associativity_example", result));
ts_document_parse(document);
assert_root_node("(expression (math_operation "
"(expression (identifier)) "
"(expression (math_operation (expression (identifier)) (expression (identifier))))))");
});
it("can resolve shift/reduce conflicts involving single-child rules using precedence", [&]() {
string grammar_template = R"JSON({
"name": "associativity_example",
"extras": [
{"type": "PATTERN", "value": "\\s"}
],
"rules": {
"expression": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "function_call"},
{"type": "SYMBOL", "name": "identifier"}
]
},
"function_call": {
"type": "PREC_RIGHT",
"value": {{function_call_precedence}},
"content": {
"type": "CHOICE",
"members": [
{
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "identifier"},
{"type": "SYMBOL", "name": "expression"}
]
},
{
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "identifier"},
{"type": "SYMBOL", "name": "block"}
]
},
{
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "identifier"},
{"type": "SYMBOL", "name": "expression"},
{"type": "SYMBOL", "name": "block"}
]
}
]
}
},
"block": {
"type": "SEQ",
"members": [
{"type": "STRING", "value": "{"},
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "}"}
]
},
"identifier": {
"type": "PATTERN",
"value": "[a-zA-Z]+"
}
}
})JSON";
// Ambiguity: is the trailing block associated with `bar` or `foo`?
ts_document_set_input_string(document, "foo bar { baz }");
TSCompileResult result = ts_compile_grammar(fill_template(grammar_template, {
{"function_call_precedence", "0"}
}).c_str());
AssertThat(result.error_message, Equals(dedent(R"MESSAGE(
Unresolved conflict for symbol sequence:
identifier '{'
Possible interpretations:
1: (expression identifier) '{'
2: (function_call identifier block)
Possible resolutions:
1: Specify a higher precedence in `function_call` than in the other rules.
2: Specify a higher precedence in `expression` than in the other rules.
3: Specify a left or right associativity in `expression`
4: Add a conflict for these rules: `expression` `function_call`
)MESSAGE")));
// Giving function calls lower precedence than expressions causes `bar`
// to be treated as an expression passed to `foo`, not as a function
// that's being called with a block.
result = ts_compile_grammar(fill_template(grammar_template, {
{"function_call_precedence", "-1"}
}).c_str());
AssertThat(result.error_message, IsNull());
ts_document_set_language(document, load_compile_result("associativity_example", result));
ts_document_parse(document);
assert_root_node("(expression (function_call "
"(identifier) "
"(expression (identifier)) "
"(block (expression (identifier)))))");
// Giving function calls higher precedence than expressions causes `bar`
// to be treated as a function that's being called with a block, not as
// an expression passed to `foo`.
result = ts_compile_grammar(fill_template(grammar_template, {
{"function_call_precedence", "1"}
}).c_str());
AssertThat(result.error_message, IsNull());
ts_document_set_language(document, load_compile_result("associativity_example", result));
ts_document_set_input_string(document, "foo bar { baz }");
ts_document_parse(document);
assert_root_node("(expression (function_call "
"(identifier) "
"(expression (function_call "
"(identifier) "
"(block (expression (identifier)))))))");
});
it("handles precedence applied to specific rule subsequences (regression)", [&]() {
TSCompileResult result = ts_compile_grammar(R"JSON({
"name": "precedence_on_subsequence",
"extras": [
{"type": "STRING", "value": " "}
],
"rules": {
"expression": {
"type": "PREC_LEFT",
"value": 0,
"content": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "function_call"},
{"type": "SYMBOL", "name": "identifier"},
{"type": "SYMBOL", "name": "scope_resolution"}
]
}
},
"function_call": {
"type": "CHOICE",
"members": [
{
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "identifier"},
{"type": "SYMBOL", "name": "expression"}
]
},
{
"type": "PREC",
"value": 1,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "identifier"},
{"type": "SYMBOL", "name": "block"}
]
}
},
{
"type": "PREC",
"value": -1,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "identifier"},
{"type": "SYMBOL", "name": "do_block"}
]
}
},
{
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "identifier"},
{
"type": "PREC",
"value": 1,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "SYMBOL", "name": "block"}
]
}
}
]
},
{
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "identifier"},
{
"type": "PREC",
"value": -1,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "SYMBOL", "name": "do_block"}
]
}
}
]
}
]
},
"scope_resolution": {
"type": "PREC_LEFT",
"value": 1,
"content": {
"type": "CHOICE",
"members": [
{
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "::"},
{"type": "SYMBOL", "name": "expression"}
]
},
{
"type": "SEQ",
"members": [
{"type": "STRING", "value": "::"},
{"type": "SYMBOL", "name": "expression"},
]
}
]
}
},
"block": {
"type": "STRING",
"value": "{}"
},
"do_block": {
"type": "STRING",
"value": "do end"
},
"identifier": {
"type": "PATTERN",
"value": "[a-zA-Z]+"
}
}
})JSON");
auto language = load_compile_result("precedence_on_subsequence", result);
ts_document_set_language(document, language);
ts_document_set_input_string(document, "a b {}");
ts_document_parse(document);
assert_root_node("(expression (function_call "
"(identifier) "
"(expression (function_call (identifier) (block)))))");
ts_document_set_input_string(document, "a b do end");
ts_document_parse(document);
assert_root_node("(expression (function_call "
"(identifier) "
"(expression (identifier)) "
"(do_block)))");
});
it("does not allow conflicting precedences", [&]() {
string grammar_template = R"JSON({
"name": "conflicting_precedence_example",
"rules": {
"expression": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "sum"},
{"type": "SYMBOL", "name": "product"},
{"type": "SYMBOL", "name": "other_thing"}
]
},
"sum": {
"type": "PREC_LEFT",
"value": 0,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "+"},
{"type": "SYMBOL", "name": "expression"}
]
}
},
"product": {
"type": "PREC_LEFT",
"value": 1,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "*"},
{"type": "SYMBOL", "name": "expression"}
]
}
},
"other_thing": {
"type": "PREC_LEFT",
"value": -1,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "*"},
{"type": "STRING", "value": "*"}
]
}
},
"identifier": {
"type": "PATTERN",
"value": "[a-zA-Z]+"
}
}
})JSON";
TSCompileResult result = ts_compile_grammar(fill_template(grammar_template, {
}).c_str());
AssertThat(result.error_message, Equals(dedent(R"MESSAGE(
Unresolved conflict for symbol sequence:
expression '+' expression '*'
Possible interpretations:
1: (sum expression '+' expression) '*'
2: expression '+' (product expression '*' expression)
3: expression '+' (other_thing expression '*' '*')
Possible resolutions:
1: Specify a higher precedence in `product` and `other_thing` than in the other rules.
2: Specify a higher precedence in `sum` than in the other rules.
3: Add a conflict for these rules: `sum` `product` `other_thing`
)MESSAGE")));
});
});
describe("when the grammar contains rules that match the empty string", [&]() {
it("reports an error", [&]() {
TSCompileResult result = ts_compile_grammar(R"JSON(
{
"name": "empty_rules",
"rules": {
"rule_1": {"type": "SYMBOL", "name": "rule_2"},
"rule_2": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "rule_1"},
{"type": "BLANK"}
]
}
}
}
)JSON");
AssertThat(result.error_message, Equals(dedent(R"MESSAGE(
The rule `rule_2` matches the empty string.
Tree-sitter currently does not support syntactic rules that match the empty string.
)MESSAGE")));
});
});
describe("external scanners", [&]() {
it("can tokenize using arbitrary user-defined scanner functions", [&]() {
string grammar = R"JSON({
"name": "external_scanner_example",
"externals": [
"_percent_string",
"_percent_string_start",
"_percent_string_end"
],
"extras": [
{"type": "PATTERN", "value": "\\s"}
],
"rules": {
"expression": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "string"},
{"type": "SYMBOL", "name": "sum"},
{"type": "SYMBOL", "name": "identifier"}
]
},
"sum": {
"type": "PREC_LEFT",
"value": 0,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "+"},
{"type": "SYMBOL", "name": "expression"}
]
}
},
"string": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "_percent_string"},
{
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "_percent_string_start"},
{"type": "SYMBOL", "name": "expression"},
{"type": "SYMBOL", "name": "_percent_string_end"}
]
},
]
},
"identifier": {
"type": "PATTERN",
"value": "\\a+"
}
}
})JSON";
TSCompileResult result = ts_compile_grammar(grammar.c_str());
AssertThat(result.error_message, IsNull());
ts_document_set_language(document, load_compile_result(
"external_scanner_example",
result,
"spec/fixtures/external_scanners/percent_strings.c"
));
ts_document_set_input_string(document, "x + %(sup (external) scanner?)");
ts_document_parse(document);
assert_root_node("(expression (sum (expression (identifier)) (expression (string))))");
ts_document_set_input_string(document, "%{sup {} #{x + y} {} scanner?}");
ts_document_parse(document);
assert_root_node("(expression (string (expression (sum (expression (identifier)) (expression (identifier))))))");
});
it("allows external scanners to refer to tokens that are defined internally", [&]() {
string grammar = R"JSON({
"name": "shared_external_tokens",
"externals": [
"string",
"line_break"
],
"extras": [
{"type": "PATTERN", "value": "\\s"}
],
"rules": {
"statement": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "_expression"},
{"type": "SYMBOL", "name": "_expression"},
{"type": "SYMBOL", "name": "line_break"}
]
},
"_expression": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "string"},
{"type": "SYMBOL", "name": "variable"},
{"type": "SYMBOL", "name": "number"}
]
},
"variable": {"type": "PATTERN", "value": "\\a+"},
"number": {"type": "PATTERN", "value": "\\d+"},
"line_break": {"type": "STRING", "value": "\n"}
}
})JSON";
TSCompileResult result = ts_compile_grammar(grammar.c_str());
AssertThat(result.error_message, IsNull());
ts_document_set_language(document, load_compile_result(
"shared_external_tokens",
result,
"spec/fixtures/external_scanners/shared_external_tokens.c"
));
ts_document_set_input_string(document, "a b\n");
ts_document_parse(document);
assert_root_node("(statement (variable) (variable) (line_break))");
ts_document_set_input_string(document, "a \nb\n");
ts_document_parse(document);
assert_root_node("(statement (variable) (variable) (line_break))");
ts_document_set_input_string(document, "'hello' 'world'\n");
ts_document_parse(document);
assert_root_node("(statement (string) (string) (line_break))");
ts_document_set_input_string(document, "'hello' \n'world'\n");
ts_document_parse(document);
assert_root_node("(statement (string) (string) (line_break))");
});
it("allows external tokens to be used as extras", [&]() {
string grammar = R"JSON({
"name": "extra_external_tokens",
"externals": [
"comment"
],
"extras": [
{"type": "PATTERN", "value": "\\s"},
{"type": "SYMBOL", "name": "comment"}
],
"rules": {
"assignment": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "variable"},
{"type": "STRING", "value": "="},
{"type": "SYMBOL", "name": "variable"}
]
},
"variable": {"type": "PATTERN", "value": "\\a+"}
}
})JSON";
TSCompileResult result = ts_compile_grammar(grammar.c_str());
AssertThat(result.error_message, IsNull());
ts_document_set_language(document, load_compile_result(
"extra_external_tokens",
result,
"spec/fixtures/external_scanners/extra_external_tokens.c"
));
ts_document_set_input_string(document, "x = # a comment\n y");
ts_document_parse(document);
assert_root_node("(assignment (variable) (comment) (variable))");
});
});
describe("when the grammar's start symbol is a token", [&]() {
it("parses the token", [&]() {
TSCompileResult result = ts_compile_grammar(R"JSON(
{
"name": "one_token_language",
"rules": {
"first_rule": {"type": "STRING", "value": "the-value"}
}
}
)JSON");
ts_document_set_language(document, load_compile_result("one_token_language", result));
ts_document_set_input_string(document, "the-value");
ts_document_parse(document);
assert_root_node("(first_rule)");
});
});
describe("when the grammar's start symbol is blank", [&]() {
it("parses the empty string", [&]() {
TSCompileResult result = ts_compile_grammar(R"JSON(
{
"name": "blank_language",
"rules": {
"first_rule": {"type": "BLANK"}
}
}
)JSON");
ts_document_set_language(document, load_compile_result("blank_language", result));
ts_document_set_input_string(document, "");
ts_document_parse(document);
assert_root_node("(first_rule)");
});
});
describe("when the grammar contains anonymous tokens with escaped characters", [&]() {
it("escapes the escaped characters properly in the generated parser", [&]() {
TSCompileResult result = ts_compile_grammar(R"JSON(
{
"name": "escaped_char_language",
"rules": {
"first_rule": {
"type": "CHOICE",
"members": [
{"type": "STRING", "value": "\n"},
{"type": "STRING", "value": "\r"},
{"type": "STRING", "value": "'hello'"},
{"type": "PATTERN", "value": "\\d+"}
]
}
}
}
)JSON");
ts_document_set_language(document, load_compile_result("escaped_char_language", result));
ts_document_set_input_string(document, "1234");
ts_document_parse(document);
assert_root_node("(first_rule)");
ts_document_set_input_string(document, "\n");
ts_document_parse(document);
assert_root_node("(first_rule)");
ts_document_set_input_string(document, "'hello'");
ts_document_parse(document);
assert_root_node("(first_rule)");
});
});
describe("the grammar in the README", [&]() {
it("parses the input in the README", [&]() {
TSCompileResult result = ts_compile_grammar(R"JSON(
{
"name": "arithmetic",
// Things that can appear anywhere in the language, like comments
// and whitespace, are expressed as 'extras'.
"extras": [
{"type": "PATTERN", "value": "\\s"},
{"type": "SYMBOL", "name": "comment"}
],
"rules": {
// The first rule listed in the grammar becomes the 'start rule'.
"expression": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "sum"},
{"type": "SYMBOL", "name": "product"},
{"type": "SYMBOL", "name": "number"},
{"type": "SYMBOL", "name": "variable"},
{
"type": "SEQ",
"members": [
{"type": "STRING", "value": "("},
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": ")"}
]
}
]
},
// Tokens like '+' and '*' are described directly within the
// grammar's rules, as opposed to in a seperate lexer description.
"sum": {
"type": "PREC_LEFT",
"value": 1,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "+"},
{"type": "SYMBOL", "name": "expression"}
]
}
},
// Ambiguities can be resolved at compile time by assigning precedence
// values to rule subtrees.
"product": {
"type": "PREC_LEFT",
"value": 2,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "*"},
{"type": "SYMBOL", "name": "expression"}
]
}
},
// Tokens can be specified using ECMAScript regexps.
"number": {"type": "PATTERN", "value": "\\d+"},
"comment": {"type": "PATTERN", "value": "#.*"},
"variable": {"type": "PATTERN", "value": "[a-zA-Z]\\w*"}
}
}
)JSON");
const TSLanguage *language = load_compile_result("arithmetic", result);
ts_document_set_language(document, language);
ts_document_set_input_string(document, "a + b * c");
ts_document_parse(document);
assert_root_node(
"(expression (sum "
"(expression (variable)) "
"(expression (product "
"(expression (variable)) "
"(expression (variable))))))");
});
});
});
END_TEST

View file

@ -1,185 +0,0 @@
#include "spec_helper.h"
#include "runtime/alloc.h"
#include "helpers/load_language.h"
#include "helpers/read_test_entries.h"
#include "helpers/spy_input.h"
#include "helpers/stderr_logger.h"
#include "helpers/point_helpers.h"
#include "helpers/encoding_helpers.h"
#include "helpers/record_alloc.h"
#include "helpers/random_helpers.h"
#include "helpers/scope_sequence.h"
#include <set>
// Assert that the document's parse tree, rendered as an S-expression
// string, matches the expected `tree_string`.
static void assert_correct_tree_shape(const TSDocument *document, string tree_string) {
TSNode root_node = ts_document_root_node(document);
const char *node_string = ts_node_string(root_node, document);
string result(node_string);
// ts_node_string hands back a heap string the caller must release.
ts_free((void *)node_string);
AssertThat(result, Equals(tree_string));
}
// Recursively verify structural invariants of `node` and its subtree:
// - each node's byte and point range runs forward (start <= end);
// - children appear in order and do not overlap one another;
// - the parent's range contains all of its children;
// - if any child has changes, the parent is also marked as changed.
static void assert_consistent_sizes(TSNode node) {
size_t child_count = ts_node_child_count(node);
size_t start_byte = ts_node_start_byte(node);
size_t end_byte = ts_node_end_byte(node);
TSPoint start_point = ts_node_start_point(node);
TSPoint end_point = ts_node_end_point(node);
bool some_child_has_changes = false;
AssertThat(start_byte, !IsGreaterThan(end_byte));
AssertThat(start_point, !IsGreaterThan(end_point));
size_t last_child_end_byte = start_byte;
TSPoint last_child_end_point = start_point;
for (size_t i = 0; i < child_count; i++) {
TSNode child = ts_node_child(node, i);
size_t child_start_byte = ts_node_start_byte(child);
TSPoint child_start_point = ts_node_start_point(child);
// Each child must start at or after the previous child's end.
AssertThat(child_start_byte, !IsLessThan(last_child_end_byte));
AssertThat(child_start_point, !IsLessThan(last_child_end_point));
assert_consistent_sizes(child);
if (ts_node_has_changes(child))
some_child_has_changes = true;
last_child_end_byte = ts_node_end_byte(child);
last_child_end_point = ts_node_end_point(child);
}
if (child_count > 0) {
// The parent must extend at least as far as its last child.
AssertThat(end_byte, !IsLessThan(last_child_end_byte));
AssertThat(end_point, !IsLessThan(last_child_end_point));
}
if (some_child_has_changes) {
AssertThat(ts_node_has_changes(node), IsTrue());
}
}
// Verify that the root node spans the full document content, then check
// the whole tree with assert_consistent_sizes.
static void assert_correct_tree_size(TSDocument *document, string content) {
TSNode root_node = ts_document_root_node(document);
size_t expected_size = content.size();
// In the JSON grammar, the start rule (`_value`) is hidden, so the node
// returned from `ts_document_root_node` (e.g. an `object` node), does not
// actually point to the root of the tree. In this weird case, trailing
// whitespace is not included in the root node's size.
//
// TODO: Fix this inconsistency. Maybe disallow the start rule being hidden?
if (ts_document_language(document) == get_test_language("json") &&
string(ts_node_type(root_node, document)) != "ERROR")
expected_size = content.find_last_not_of("\n ") + 1;
AssertThat(ts_node_end_byte(root_node), Equals(expected_size));
assert_consistent_sizes(root_node);
}
START_TEST
describe("The Corpus", []() {
vector<string> test_languages({
"javascript",
"json",
"c",
"cpp",
"python",
});
for (auto &language_name : test_languages) {
describe(("the " + language_name + " language").c_str(), [&]() {
TSDocument *document;
before_each([&]() {
record_alloc::start();
document = ts_document_new();
ts_document_set_language(document, get_test_language(language_name));
// ts_document_set_logger(document, stderr_logger_new(true));
// ts_document_print_debugging_graphs(document, true);
});
after_each([&]() {
ts_document_free(document);
AssertThat(record_alloc::outstanding_allocation_indices(), IsEmpty());
});
for (auto &entry : read_corpus_entries(language_name)) {
SpyInput *input;
auto it_handles_edit_sequence = [&](string name, std::function<void()> edit_sequence){
it(("parses " + entry.description + ": " + name).c_str(), [&]() {
input = new SpyInput(entry.input, 3);
ts_document_set_input(document, input->input());
edit_sequence();
assert_correct_tree_shape(document, entry.tree_string);
assert_correct_tree_size(document, input->content);
delete input;
});
};
it_handles_edit_sequence("initial parse", [&]() {
ts_document_parse(document);
});
std::set<std::pair<size_t, size_t>> deletions;
std::set<std::pair<size_t, string>> insertions;
for (size_t i = 0; i < 60; i++) {
size_t edit_position = random() % utf8_char_count(entry.input);
size_t deletion_size = random() % (utf8_char_count(entry.input) - edit_position);
string inserted_text = random_words(random() % 4 + 1);
if (insertions.insert({edit_position, inserted_text}).second) {
string description = "\"" + inserted_text + "\" at " + to_string(edit_position);
it_handles_edit_sequence("repairing an insertion of " + description, [&]() {
ts_document_edit(document, input->replace(edit_position, 0, inserted_text));
ts_document_parse(document);
assert_correct_tree_size(document, input->content);
ts_document_edit(document, input->undo());
assert_correct_tree_size(document, input->content);
TSRange *ranges;
uint32_t range_count;
ScopeSequence old_scope_sequence = build_scope_sequence(document, input->content);
ts_document_parse_and_get_changed_ranges(document, &ranges, &range_count);
ScopeSequence new_scope_sequence = build_scope_sequence(document, input->content);
verify_changed_ranges(old_scope_sequence, new_scope_sequence,
input->content, ranges, range_count);
ts_free(ranges);
});
}
if (deletions.insert({edit_position, deletion_size}).second) {
string desription = to_string(edit_position) + "-" + to_string(edit_position + deletion_size);
it_handles_edit_sequence("repairing a deletion of " + desription, [&]() {
ts_document_edit(document, input->replace(edit_position, deletion_size, ""));
ts_document_parse(document);
assert_correct_tree_size(document, input->content);
ts_document_edit(document, input->undo());
assert_correct_tree_size(document, input->content);
TSRange *ranges;
uint32_t range_count;
ScopeSequence old_scope_sequence = build_scope_sequence(document, input->content);
ts_document_parse_and_get_changed_ranges(document, &ranges, &range_count);
ScopeSequence new_scope_sequence = build_scope_sequence(document, input->content);
verify_changed_ranges(old_scope_sequence, new_scope_sequence,
input->content, ranges, range_count);
ts_free(ranges);
});
}
}
}
});
}
});
END_TEST

View file

@ -0,0 +1,181 @@
#include "spec_helper.h"
#include "runtime/alloc.h"
#include "helpers/load_language.h"
#include "helpers/read_test_entries.h"
#include "helpers/spy_input.h"
#include "helpers/stderr_logger.h"
#include "helpers/point_helpers.h"
#include "helpers/encoding_helpers.h"
#include "helpers/record_alloc.h"
#include "helpers/random_helpers.h"
#include "helpers/scope_sequence.h"
#include <set>
// Recursively verify structural invariants of `node` and its subtree:
// - each node's byte and point range runs forward (start <= end);
// - children appear in order and do not overlap one another;
// - the parent's range contains all of its children;
// - if any child has changes, the parent is also marked as changed.
static void assert_consistent_sizes(TSNode node) {
size_t child_count = ts_node_child_count(node);
size_t start_byte = ts_node_start_byte(node);
size_t end_byte = ts_node_end_byte(node);
TSPoint start_point = ts_node_start_point(node);
TSPoint end_point = ts_node_end_point(node);
bool some_child_has_changes = false;
AssertThat(start_byte, !IsGreaterThan(end_byte));
AssertThat(start_point, !IsGreaterThan(end_point));
size_t last_child_end_byte = start_byte;
TSPoint last_child_end_point = start_point;
for (size_t i = 0; i < child_count; i++) {
TSNode child = ts_node_child(node, i);
size_t child_start_byte = ts_node_start_byte(child);
TSPoint child_start_point = ts_node_start_point(child);
// Each child must start at or after the previous child's end.
AssertThat(child_start_byte, !IsLessThan(last_child_end_byte));
AssertThat(child_start_point, !IsLessThan(last_child_end_point));
assert_consistent_sizes(child);
if (ts_node_has_changes(child))
some_child_has_changes = true;
last_child_end_byte = ts_node_end_byte(child);
last_child_end_point = ts_node_end_point(child);
}
if (child_count > 0) {
// The parent must extend at least as far as its last child.
AssertThat(end_byte, !IsLessThan(last_child_end_byte));
AssertThat(end_point, !IsLessThan(last_child_end_point));
}
if (some_child_has_changes) {
AssertThat(ts_node_has_changes(node), IsTrue());
}
}
// Verify that the root node spans the full document content, then check
// the whole tree with assert_consistent_sizes.
static void assert_correct_tree_size(TSDocument *document, string content) {
TSNode root_node = ts_document_root_node(document);
size_t expected_size = content.size();
// In the JSON grammar, the start rule (`_value`) is hidden, so the node
// returned from `ts_document_root_node` (e.g. an `object` node), does not
// actually point to the root of the tree. In this weird case, trailing
// whitespace is not included in the root node's size.
//
// TODO: Fix this inconsistency. Maybe disallow the start rule being hidden?
if (ts_document_language(document) == load_real_language("json") &&
string(ts_node_type(root_node, document)) != "ERROR")
expected_size = content.find_last_not_of("\n ") + 1;
AssertThat(ts_node_end_byte(root_node), Equals(expected_size));
assert_consistent_sizes(root_node);
}
START_TEST

// Randomized incremental-parsing tests for the bundled real-world grammars.
// Each corpus entry is parsed from scratch, then repeatedly edited (random
// insertions and deletions), re-parsed, un-edited, and re-parsed again,
// asserting the tree always returns to the expected shape and that the
// reported changed ranges agree with the recomputed scope sequences.
vector<string> test_languages({
  "javascript",
  "json",
  "c",
  "cpp",
  "python",
});

for (auto &language_name : test_languages) {
  describe(("the " + language_name + " language").c_str(), [&]() {
    TSDocument *document;

    before_each([&]() {
      // Track allocations so after_each can assert that none leaked.
      record_alloc::start();
      document = ts_document_new();
      ts_document_set_language(document, load_real_language(language_name));
      // ts_document_set_logger(document, stderr_logger_new(true));
      // ts_document_print_debugging_graphs(document, true);
    });

    after_each([&]() {
      ts_document_free(document);
      AssertThat(record_alloc::outstanding_allocation_indices(), IsEmpty());
    });

    for (auto &entry : read_real_language_corpus(language_name)) {
      SpyInput *input;

      // Registers one `it` example: run the given edit sequence, then assert
      // the final tree matches the corpus entry's expected tree string and
      // that the tree's reported size is consistent with the text.
      auto it_handles_edit_sequence = [&](string name, std::function<void()> edit_sequence){
        it(("parses " + entry.description + ": " + name).c_str(), [&]() {
          input = new SpyInput(entry.input, 3);
          ts_document_set_input(document, input->input());
          edit_sequence();

          TSNode root_node = ts_document_root_node(document);
          const char *node_string = ts_node_string(root_node, document);
          string result(node_string);
          ts_free((void *)node_string);
          AssertThat(result, Equals(entry.tree_string));
          assert_correct_tree_size(document, input->content);
          delete input;
        });
      };

      it_handles_edit_sequence("initial parse", [&]() {
        ts_document_parse(document);
      });

      // De-duplicate the randomly generated edits so each registered example
      // is unique.
      std::set<std::pair<size_t, size_t>> deletions;
      std::set<std::pair<size_t, string>> insertions;

      for (size_t i = 0; i < 60; i++) {
        size_t edit_position = random() % utf8_char_count(entry.input);
        size_t deletion_size = random() % (utf8_char_count(entry.input) - edit_position);
        string inserted_text = random_words(random() % 4 + 1);

        if (insertions.insert({edit_position, inserted_text}).second) {
          string description = "\"" + inserted_text + "\" at " + to_string(edit_position);
          it_handles_edit_sequence("repairing an insertion of " + description, [&]() {
            // Apply the edit, parse, undo the edit, and verify the changed
            // ranges reported for the repair parse.
            ts_document_edit(document, input->replace(edit_position, 0, inserted_text));
            ts_document_parse(document);
            assert_correct_tree_size(document, input->content);

            ts_document_edit(document, input->undo());
            assert_correct_tree_size(document, input->content);

            TSRange *ranges;
            uint32_t range_count;
            ScopeSequence old_scope_sequence = build_scope_sequence(document, input->content);
            ts_document_parse_and_get_changed_ranges(document, &ranges, &range_count);
            ScopeSequence new_scope_sequence = build_scope_sequence(document, input->content);
            verify_changed_ranges(old_scope_sequence, new_scope_sequence,
                                  input->content, ranges, range_count);
            ts_free(ranges);
          });
        }

        if (deletions.insert({edit_position, deletion_size}).second) {
          // NOTE(review): `desription` is misspelled; harmless local variable.
          string desription = to_string(edit_position) + "-" + to_string(edit_position + deletion_size);
          it_handles_edit_sequence("repairing a deletion of " + desription, [&]() {
            // Same undo/redo round-trip as above, but for a deletion edit.
            ts_document_edit(document, input->replace(edit_position, deletion_size, ""));
            ts_document_parse(document);
            assert_correct_tree_size(document, input->content);

            ts_document_edit(document, input->undo());
            assert_correct_tree_size(document, input->content);

            TSRange *ranges;
            uint32_t range_count;
            ScopeSequence old_scope_sequence = build_scope_sequence(document, input->content);
            ts_document_parse_and_get_changed_ranges(document, &ranges, &range_count);
            ScopeSequence new_scope_sequence = build_scope_sequence(document, input->content);
            verify_changed_ranges(old_scope_sequence, new_scope_sequence,
                                  input->content, ranges, range_count);
            ts_free(ranges);
          });
        }
      }
    }
  });
}

END_TEST

View file

@ -0,0 +1,78 @@
#include "spec_helper.h"
#include "helpers/read_test_entries.h"
#include "helpers/load_language.h"
#include "helpers/stderr_logger.h"
#include "helpers/file_helpers.h"
#include "runtime/alloc.h"
START_TEST

// Compile and exercise each grammar found under spec/fixtures/test_grammars.
// A grammar directory may contain:
//   - grammar.json        (required) the grammar to compile
//   - scanner.c           (optional) an external scanner implementation
//   - expected_error.txt  (optional) if present, compilation must fail with
//                         exactly this message and no parsing is attempted
//   - corpus.txt          parse examples, loaded via read_test_language_corpus
string grammars_dir_path = "spec/fixtures/test_grammars";
vector<string> test_languages = list_directory(grammars_dir_path);

for (auto &language_name : test_languages) {
  if (language_name == "readme.md") continue;

  describe(("test language: " + language_name).c_str(), [&]() {
    string directory_path = grammars_dir_path + "/" + language_name;
    string grammar_path = directory_path + "/grammar.json";
    string external_scanner_path = directory_path + "/scanner.c";
    string expected_error_path = directory_path + "/expected_error.txt";
    // (removed unused local `corpus_path`; the corpus is read by name below)

    if (!file_exists(external_scanner_path)) {
      external_scanner_path = "";
    }

    string grammar_json = read_file(grammar_path);
    TSCompileResult compile_result = ts_compile_grammar(grammar_json.c_str());

    if (file_exists(expected_error_path)) {
      // This grammar is expected to be rejected by the compiler.
      it("fails with the correct error message", [&]() {
        string expected_error = read_file(expected_error_path);
        AssertThat((void *)compile_result.error_message, !IsNull());
        AssertThat(compile_result.error_message, Equals(expected_error));
      });
      return;
    } else {
      TSDocument *document = nullptr;
      const TSLanguage *language = nullptr;

      before_each([&]() {
        // Load (and, on first use, compile) the language lazily, once per
        // grammar, instead of once per example.
        if (!language) {
          language = load_test_language(
            language_name,
            compile_result,
            external_scanner_path
          );
        }

        document = ts_document_new();
        ts_document_set_language(document, language);
        // ts_document_set_logger(document, stderr_logger_new(true));
        // ts_document_print_debugging_graphs(document, true);
      });

      after_each([&]() {
        if (document) ts_document_free(document);
      });

      for (auto &entry : read_test_language_corpus(language_name)) {
        it(("parses " + entry.description).c_str(), [&]() {
          ts_document_set_input_string_with_length(document, entry.input.c_str(), entry.input.size());
          ts_document_parse(document);

          TSNode root_node = ts_document_root_node(document);
          const char *node_string = ts_node_string(root_node, document);
          string result(node_string);
          ts_free((void *)node_string);
          AssertThat(result, Equals(entry.tree_string));
        });
      }
    }
  });
}

END_TEST

View file

@ -43,7 +43,7 @@ describe("Document", [&]() {
before_each([&]() {
spy_input = new SpyInput("{\"key\": [null, 2]}", 3);
ts_document_set_language(document, get_test_language("json"));
ts_document_set_language(document, load_real_language("json"));
ts_document_set_input_string(document, "{\"key\": [1, 2]}");
ts_document_parse(document);
@ -152,7 +152,7 @@ describe("Document", [&]() {
});
it("uses the given language for future parses", [&]() {
ts_document_set_language(document, get_test_language("json"));
ts_document_set_language(document, load_real_language("json"));
ts_document_parse(document);
root = ts_document_root_node(document);
@ -162,10 +162,10 @@ describe("Document", [&]() {
});
it("clears out any previous tree", [&]() {
ts_document_set_language(document, get_test_language("json"));
ts_document_set_language(document, load_real_language("json"));
ts_document_parse(document);
ts_document_set_language(document, get_test_language("javascript"));
ts_document_set_language(document, load_real_language("javascript"));
AssertThat(ts_document_root_node(document).data, Equals<void *>(nullptr));
ts_document_parse(document);
@ -177,7 +177,7 @@ describe("Document", [&]() {
});
it("does not allow setting a language with a different version number", [&]() {
TSLanguage language = *get_test_language("json");
TSLanguage language = *load_real_language("json");
AssertThat(ts_language_version(&language), Equals<uint32_t>(TREE_SITTER_LANGUAGE_VERSION));
language.version++;
@ -193,7 +193,7 @@ describe("Document", [&]() {
before_each([&]() {
logger = new SpyLogger();
ts_document_set_language(document, get_test_language("json"));
ts_document_set_language(document, load_real_language("json"));
ts_document_set_input_string(document, "[1, 2]");
});
@ -235,7 +235,7 @@ describe("Document", [&]() {
SpyInput *input;
before_each([&]() {
ts_document_set_language(document, get_test_language("javascript"));
ts_document_set_language(document, load_real_language("javascript"));
input = new SpyInput("{a: null};", 3);
ts_document_set_input(document, input->input());
ts_document_parse(document);

View file

@ -40,7 +40,7 @@ describe("Node", []() {
record_alloc::start();
document = ts_document_new();
ts_document_set_language(document, get_test_language("json"));
ts_document_set_language(document, load_real_language("json"));
ts_document_set_input_string(document, input_string.c_str());
ts_document_parse(document);

View file

@ -83,7 +83,7 @@ describe("Parser", [&]() {
describe("handling errors", [&]() {
describe("when there is an invalid substring right before a valid token", [&]() {
it("computes the error node's size and position correctly", [&]() {
ts_document_set_language(document, get_test_language("json"));
ts_document_set_language(document, load_real_language("json"));
set_text(" [123, @@@@@, true]");
assert_root_node(
@ -108,7 +108,7 @@ describe("Parser", [&]() {
describe("when there is an unexpected string in the middle of a token", [&]() {
it("computes the error node's size and position correctly", [&]() {
ts_document_set_language(document, get_test_language("json"));
ts_document_set_language(document, load_real_language("json"));
set_text(" [123, faaaaalse, true]");
assert_root_node(
@ -134,7 +134,7 @@ describe("Parser", [&]() {
describe("when there is one unexpected token between two valid tokens", [&]() {
it("computes the error node's size and position correctly", [&]() {
ts_document_set_language(document, get_test_language("json"));
ts_document_set_language(document, load_real_language("json"));
set_text(" [123, true false, true]");
assert_root_node(
@ -153,7 +153,7 @@ describe("Parser", [&]() {
describe("when there is an unexpected string at the end of a token", [&]() {
it("computes the error's size and position correctly", [&]() {
ts_document_set_language(document, get_test_language("json"));
ts_document_set_language(document, load_real_language("json"));
set_text(" [123, \"hi\n, true]");
assert_root_node(
@ -163,7 +163,7 @@ describe("Parser", [&]() {
describe("when there is an unterminated error", [&]() {
it("maintains a consistent tree", [&]() {
ts_document_set_language(document, get_test_language("javascript"));
ts_document_set_language(document, load_real_language("javascript"));
set_text("a; /* b");
assert_root_node(
"(ERROR (program (expression_statement (identifier))) (UNEXPECTED EOF))");
@ -172,7 +172,7 @@ describe("Parser", [&]() {
describe("when there are extra tokens at the end of the viable prefix", [&]() {
it("does not include them in the error node", [&]() {
ts_document_set_language(document, get_test_language("javascript"));
ts_document_set_language(document, load_real_language("javascript"));
set_text(
"var x;\n"
"\n"
@ -192,7 +192,7 @@ describe("Parser", [&]() {
describe("handling extra tokens", [&]() {
describe("when the token appears as part of a grammar rule", [&]() {
it("incorporates it into the tree", [&]() {
ts_document_set_language(document, get_test_language("javascript"));
ts_document_set_language(document, load_real_language("javascript"));
set_text("fn()\n");
assert_root_node(
@ -202,7 +202,7 @@ describe("Parser", [&]() {
describe("when the token appears somewhere else", [&]() {
it("incorporates it into the tree", [&]() {
ts_document_set_language(document, get_test_language("javascript"));
ts_document_set_language(document, load_real_language("javascript"));
set_text(
"fn()\n"
" .otherFn();");
@ -218,7 +218,7 @@ describe("Parser", [&]() {
describe("when several extra tokens appear in a row", [&]() {
it("incorporates them into the tree", [&]() {
ts_document_set_language(document, get_test_language("javascript"));
ts_document_set_language(document, load_real_language("javascript"));
set_text(
"fn()\n\n"
"// This is a comment"
@ -239,7 +239,7 @@ describe("Parser", [&]() {
describe("editing", [&]() {
describe("creating new tokens near the end of the input", [&]() {
it("updates the parse tree and re-reads only the changed portion of the text", [&]() {
ts_document_set_language(document, get_test_language("javascript"));
ts_document_set_language(document, load_real_language("javascript"));
set_text("x * (100 + abc);");
assert_root_node(
@ -262,7 +262,7 @@ describe("Parser", [&]() {
it("updates the parse tree and re-reads only the changed portion of the input", [&]() {
chunk_size = 2;
ts_document_set_language(document, get_test_language("javascript"));
ts_document_set_language(document, load_real_language("javascript"));
set_text("123 + 456 * (10 + x);");
assert_root_node(
@ -285,7 +285,7 @@ describe("Parser", [&]() {
describe("introducing an error", [&]() {
it("gives the error the right size", [&]() {
ts_document_set_language(document, get_test_language("javascript"));
ts_document_set_language(document, load_real_language("javascript"));
set_text("var x = y;");
assert_root_node(
@ -308,7 +308,7 @@ describe("Parser", [&]() {
describe("into the middle of an existing token", [&]() {
it("updates the parse tree", [&]() {
ts_document_set_language(document, get_test_language("javascript"));
ts_document_set_language(document, load_real_language("javascript"));
set_text("abc * 123;");
assert_root_node(
@ -327,7 +327,7 @@ describe("Parser", [&]() {
describe("at the end of an existing token", [&]() {
it("updates the parse tree", [&]() {
ts_document_set_language(document, get_test_language("javascript"));
ts_document_set_language(document, load_real_language("javascript"));
set_text("abc * 123;");
assert_root_node(
@ -346,7 +346,7 @@ describe("Parser", [&]() {
describe("inserting text into a node containing a extra token", [&]() {
it("updates the parse tree", [&]() {
ts_document_set_language(document, get_test_language("javascript"));
ts_document_set_language(document, load_real_language("javascript"));
set_text("123 *\n"
"// a-comment\n"
"abc;");
@ -373,7 +373,7 @@ describe("Parser", [&]() {
describe("when a critical token is removed", [&]() {
it("updates the parse tree, creating an error", [&]() {
ts_document_set_language(document, get_test_language("javascript"));
ts_document_set_language(document, load_real_language("javascript"));
set_text("123 * 456; 789 * 123;");
assert_root_node(
@ -392,7 +392,7 @@ describe("Parser", [&]() {
describe("with external tokens", [&]() {
it("maintains the external scanner's state during incremental parsing", [&]() {
ts_document_set_language(document, get_test_language("python"));
ts_document_set_language(document, load_real_language("python"));
string text = dedent(R"PYTHON(
if a:
print b
@ -420,7 +420,7 @@ describe("Parser", [&]() {
});
it("does not try to re-use nodes that are within the edited region", [&]() {
ts_document_set_language(document, get_test_language("javascript"));
ts_document_set_language(document, load_real_language("javascript"));
set_text("{ x: (b.c) };");
assert_root_node(
@ -435,7 +435,7 @@ describe("Parser", [&]() {
});
it("updates the document's parse count", [&]() {
ts_document_set_language(document, get_test_language("javascript"));
ts_document_set_language(document, load_real_language("javascript"));
AssertThat(ts_document_parse_count(document), Equals<size_t>(0));
set_text("{ x: (b.c) };");
@ -449,7 +449,7 @@ describe("Parser", [&]() {
describe("lexing", [&]() {
describe("handling tokens containing wildcard patterns (e.g. comments)", [&]() {
it("terminates them at the end of the document", [&]() {
ts_document_set_language(document, get_test_language("javascript"));
ts_document_set_language(document, load_real_language("javascript"));
set_text("x; // this is a comment");
assert_root_node(
@ -464,7 +464,7 @@ describe("Parser", [&]() {
it("recognizes UTF8 characters as single characters", [&]() {
// 'ΩΩΩ — ΔΔ';
ts_document_set_language(document, get_test_language("javascript"));
ts_document_set_language(document, load_real_language("javascript"));
set_text("'\u03A9\u03A9\u03A9 \u2014 \u0394\u0394';");
assert_root_node(

View file

@ -1,195 +0,0 @@
#include "compiler/build_tables/build_lex_table.h"
#include <climits>
#include <map>
#include <set>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "compiler/build_tables/lex_conflict_manager.h"
#include "compiler/build_tables/remove_duplicate_states.h"
#include "compiler/build_tables/lex_item.h"
#include "compiler/parse_table.h"
#include "compiler/lexical_grammar.h"
#include "compiler/rules/built_in_symbols.h"
#include "compiler/rules/choice.h"
#include "compiler/rules/metadata.h"
#include "compiler/rules/repeat.h"
#include "compiler/rules/seq.h"
#include "compiler/rules/blank.h"
namespace tree_sitter {
namespace build_tables {
using std::map;
using std::set;
using std::string;
using std::vector;
using std::make_shared;
using std::unordered_map;
using rules::Blank;
using rules::Choice;
using rules::CharacterSet;
using rules::Repeat;
using rules::Symbol;
using rules::Metadata;
using rules::Seq;
// Builds the lexical state machine for a grammar. For every state in the
// parse table, a lex state is created that recognizes exactly the terminals
// valid in that parse state (each optionally preceded by separators).
// Identical lex item sets share one lex state, and duplicate lex states are
// merged after construction.
class LexTableBuilder {
  LexTable lex_table;
  ParseTable *parse_table;
  const LexicalGrammar lex_grammar;

  // One `Repeat` rule per separator, plus a trailing `Blank` so a token may
  // also begin with no separator at all.
  vector<rule_ptr> separator_rules;
  LexConflictManager conflict_manager;

  // Cache mapping each lex item set to the id of the state built for it.
  unordered_map<LexItemSet, LexStateId> lex_state_ids;

 public:
  LexTableBuilder(ParseTable *parse_table, const LexicalGrammar &lex_grammar)
      : parse_table(parse_table), lex_grammar(lex_grammar) {
    for (const rule_ptr &rule : lex_grammar.separators)
      separator_rules.push_back(Repeat::build(rule));
    separator_rules.push_back(Blank::build());
  }

  // Build one lex state per parse state, then post-process the table.
  LexTable build() {
    for (ParseState &parse_state : parse_table->states)
      add_lex_state_for_parse_state(&parse_state);
    mark_fragile_tokens();
    remove_duplicate_lex_states();
    return lex_table;
  }

 private:
  // Point the parse state at the lex state for its expected terminals.
  void add_lex_state_for_parse_state(ParseState *parse_state) {
    parse_state->lex_state_id =
      add_lex_state(item_set_for_terminals(parse_state->terminal_entries));
  }

  // Return the id of the lex state for `item_set`, creating it (and,
  // recursively via add_advance_actions, its successor states) on first use.
  LexStateId add_lex_state(const LexItemSet &item_set) {
    const auto &pair = lex_state_ids.find(item_set);
    if (pair == lex_state_ids.end()) {
      LexStateId state_id = lex_table.add_state();
      lex_state_ids[item_set] = state_id;
      add_accept_token_actions(item_set, state_id);
      add_advance_actions(item_set, state_id);
      return state_id;
    } else {
      return pair->second;
    }
  }

  // For each character-set transition out of `item_set`, add an advance
  // action — unless the conflict manager decides the state's existing
  // accept action takes priority over advancing.
  void add_advance_actions(const LexItemSet &item_set, LexStateId state_id) {
    for (const auto &pair : item_set.transitions()) {
      const CharacterSet &characters = pair.first;
      const LexItemSet::Transition &transition = pair.second;
      AdvanceAction action(-1, transition.precedence, transition.in_main_token);

      auto current_action = lex_table.state(state_id).accept_action;
      if (conflict_manager.resolve(transition.destination, action,
                                   current_action)) {
        action.state_index = add_lex_state(transition.destination);
        lex_table.state(state_id).advance_actions[characters] = action;
      }
    }
  }

  // For every item that is complete in this state, record an accept-token
  // action; conflicts between competing complete tokens are resolved by the
  // conflict manager (precedence, string-ness, symbol index).
  void add_accept_token_actions(const LexItemSet &item_set, LexStateId state_id) {
    for (const LexItem &item : item_set.entries) {
      LexItem::CompletionStatus completion_status = item.completion_status();
      if (completion_status.is_done) {
        AcceptTokenAction action(item.lhs, completion_status.precedence.max,
                                 completion_status.is_string);
        auto current_action = lex_table.state(state_id).accept_action;
        if (conflict_manager.resolve(action, current_action))
          lex_table.state(state_id).accept_action = action;
      }
    }
  }

  // Using the conflicts recorded by the conflict manager during the build,
  // mark parse-table entries whose tokens cannot be safely reused during
  // incremental parsing.
  void mark_fragile_tokens() {
    for (ParseState &state : parse_table->states) {
      for (auto &entry : state.terminal_entries) {
        Symbol symbol = entry.first;
        if (symbol.is_token()) {
          // Homonyms: tokens the conflict manager saw competing with this
          // one to accept in the same lex state.
          auto homonyms = conflict_manager.possible_homonyms.find(symbol.index);
          if (homonyms != conflict_manager.possible_homonyms.end())
            for (Symbol::Index homonym : homonyms->second)
              if (state.terminal_entries.count(Symbol(homonym, Symbol::Terminal))) {
                entry.second.reusable = false;
                break;
              }

          if (!entry.second.reusable)
            continue;

          // Extensions: tokens whose match may continue past this token's
          // accept point, per the conflict manager.
          auto extensions = conflict_manager.possible_extensions.find(symbol.index);
          if (extensions != conflict_manager.possible_extensions.end())
            for (Symbol::Index extension : extensions->second)
              if (state.terminal_entries.count(Symbol(extension, Symbol::Terminal))) {
                entry.second.depends_on_lookahead = true;
                break;
              }
        }
      }
    }
  }

  // Clear accept-action metadata that is irrelevant at runtime (string-ness
  // and precedence), merge lex states that thereby became identical, and fix
  // up the parse states' lex_state_id references.
  void remove_duplicate_lex_states() {
    for (LexState &state : lex_table.states) {
      state.accept_action.is_string = false;
      state.accept_action.precedence = 0;
    }

    auto replacements =
      remove_duplicate_states<LexTable>(&lex_table);

    for (ParseState &parse_state : parse_table->states) {
      auto replacement = replacements.find(parse_state.lex_state_id);
      if (replacement != replacements.end())
        parse_state.lex_state_id = replacement->second;
    }
  }

  // Build the starting lex item set for a parse state: one item per
  // (token-rule alternative x separator rule), with the separator sequenced
  // before the main token and tagged via Metadata.
  LexItemSet item_set_for_terminals(const map<Symbol, ParseTableEntry> &terminals) {
    LexItemSet result;
    for (const auto &pair : terminals) {
      Symbol symbol = pair.first;
      if (symbol.is_token()) {
        for (const rule_ptr &rule : rules_for_symbol(symbol)) {
          for (const rule_ptr &separator_rule : separator_rules) {
            result.entries.insert(LexItem(
              symbol,
              Metadata::separator(
                Seq::build({
                  separator_rule,
                  Metadata::main_token(rule) }))));
          }
        }
      }
    }
    return result;
  }

  // The top-level rule alternatives for a token symbol. The end-of-input
  // symbol is modeled as a single character set containing character 0.
  vector<rule_ptr> rules_for_symbol(const rules::Symbol &symbol) {
    if (symbol == rules::END_OF_INPUT())
      return { CharacterSet().include(0).copy() };

    rule_ptr rule = lex_grammar.variables[symbol.index].rule;
    auto choice = rule->as<Choice>();
    if (choice)
      return choice->elements;
    else
      return { rule };
  }
};
// Entry point: build the lexical analysis table for `grammar`, assigning a
// lex state to every state of the already-constructed parse table.
LexTable build_lex_table(ParseTable *table, const LexicalGrammar &grammar) {
  LexTableBuilder builder(table, grammar);
  return builder.build();
}
} // namespace build_tables
} // namespace tree_sitter

View file

@ -1,18 +0,0 @@
#ifndef COMPILER_BUILD_TABLES_BUILD_LEX_TABLE_H_
#define COMPILER_BUILD_TABLES_BUILD_LEX_TABLE_H_

#include "compiler/lex_table.h"

namespace tree_sitter {

struct LexicalGrammar;
class ParseTable;

namespace build_tables {

// Build the lexical analysis table for the given grammar, assigning a lex
// state id to every state of the (already-built) parse table.
LexTable build_lex_table(ParseTable *, const LexicalGrammar &);

}  // namespace build_tables
}  // namespace tree_sitter

#endif  // COMPILER_BUILD_TABLES_BUILD_LEX_TABLE_H_

View file

@ -6,14 +6,13 @@
#include <unordered_map>
#include <utility>
#include "compiler/parse_table.h"
#include "compiler/build_tables/remove_duplicate_states.h"
#include "compiler/build_tables/parse_item.h"
#include "compiler/build_tables/parse_item_set_builder.h"
#include "compiler/lexical_grammar.h"
#include "compiler/syntax_grammar.h"
#include "compiler/rules/symbol.h"
#include "compiler/rules/built_in_symbols.h"
#include "compiler/build_tables/recovery_tokens.h"
#include "compiler/build_tables/lex_table_builder.h"
namespace tree_sitter {
namespace build_tables {
@ -41,6 +40,7 @@ class ParseTableBuilder {
set<string> conflicts;
ParseItemSetBuilder item_set_builder;
set<const Production *> fragile_productions;
vector<set<Symbol::Index>> incompatible_token_indices_by_index;
bool allow_any_conflict;
public:
@ -56,9 +56,9 @@ class ParseTableBuilder {
Symbol(0, Symbol::Terminal) :
Symbol(0, Symbol::NonTerminal);
Production start_production({
ProductionStep(start_symbol, 0, rules::AssociativityNone),
});
Production start_production{
ProductionStep{start_symbol, 0, rules::AssociativityNone},
};
// Placeholder for error state
add_parse_state(ParseItemSet());
@ -71,10 +71,11 @@ class ParseTableBuilder {
}));
CompileError error = process_part_state_queue();
if (error.type != TSCompileErrorTypeNone)
if (error.type != TSCompileErrorTypeNone) {
return { parse_table, error };
}
parse_table.mergeable_symbols = recovery_tokens(lexical_grammar);
compute_unmergable_token_pairs();
build_error_parse_state();
@ -110,8 +111,18 @@ class ParseTableBuilder {
void build_error_parse_state() {
ParseState error_state;
for (const Symbol symbol : parse_table.mergeable_symbols) {
add_out_of_context_parse_state(&error_state, symbol);
for (Symbol::Index i = 0; i < lexical_grammar.variables.size(); i++) {
bool has_non_reciprocal_conflict = false;
for (Symbol::Index incompatible_index : incompatible_token_indices_by_index[i]) {
if (!incompatible_token_indices_by_index[incompatible_index].count(i)) {
has_non_reciprocal_conflict = true;
break;
}
}
if (!has_non_reciprocal_conflict) {
add_out_of_context_parse_state(&error_state, Symbol(i, Symbol::Terminal));
}
}
for (const Symbol &symbol : grammar.extra_tokens) {
@ -148,7 +159,8 @@ class ParseTableBuilder {
ParseStateId add_parse_state(const ParseItemSet &item_set) {
auto pair = parse_state_ids.find(item_set);
if (pair == parse_state_ids.end()) {
ParseStateId state_id = parse_table.add_state();
ParseStateId state_id = parse_table.states.size();
parse_table.states.push_back(ParseState());
parse_state_ids[item_set] = state_id;
parse_table.states[state_id].shift_actions_signature = item_set.unfinished_item_signature();
item_sets_to_process.push_back({ std::move(item_set), state_id });
@ -291,6 +303,34 @@ class ParseTableBuilder {
}
}
// Populate incompatible_token_indices_by_index: for each token, the set of
// other tokens that conflict with it lexically. merge_parse_state consults
// this so that merging two parse states never forces a conflicting pair of
// tokens into the same lexical context.
void compute_unmergable_token_pairs() {
  incompatible_token_indices_by_index.resize(lexical_grammar.variables.size());

  // First, assume that all tokens are mutually incompatible.
  for (Symbol::Index i = 0, n = lexical_grammar.variables.size(); i < n; i++) {
    auto &incompatible_indices = incompatible_token_indices_by_index[i];
    for (Symbol::Index j = 0; j < n; j++) {
      if (j != i) incompatible_indices.insert(j);
    }
  }

  // Then check each possibly-incompatible pair of tokens by generating
  // lexical states that contain them both, keeping only the pairs that
  // produce a real conflict.
  auto lex_table_builder = LexTableBuilder::create(lexical_grammar);
  for (Symbol::Index i = 0, n = lexical_grammar.variables.size(); i < n; i++) {
    auto &incompatible_indices = incompatible_token_indices_by_index[i];
    auto iter = incompatible_indices.begin();
    while (iter != incompatible_indices.end()) {
      if (lex_table_builder->detect_conflict(i, *iter)) {
        ++iter;  // conflict confirmed: keep the pair marked incompatible
      } else {
        iter = incompatible_indices.erase(iter);  // no conflict: mergeable
      }
    }
  }
}
void remove_duplicate_parse_states() {
map<size_t, set<ParseStateId>> state_indices_by_signature;
@ -302,7 +342,7 @@ class ParseTableBuilder {
set<ParseStateId> deleted_states;
while (true) {
std::map<ParseStateId, ParseStateId> state_replacements;
map<ParseStateId, ParseStateId> state_replacements;
for (auto &pair : state_indices_by_signature) {
auto &state_group = pair.second;
@ -310,7 +350,7 @@ class ParseTableBuilder {
for (ParseStateId i : state_group) {
for (ParseStateId j : state_group) {
if (j == i) break;
if (!state_replacements.count(j) && parse_table.merge_state(j, i)) {
if (!state_replacements.count(j) && merge_parse_state(j, i)) {
state_replacements.insert({ i, j });
deleted_states.insert(i);
break;
@ -364,6 +404,72 @@ class ParseTableBuilder {
}
}
// Report whether any terminal entry of `state` is equal to `entry`.
static bool has_entry(const ParseState &state, const ParseTableEntry &entry) {
  for (const auto &existing : state.terminal_entries) {
    if (existing.second == entry) {
      return true;
    }
  }
  return false;
}
// Try to merge parse state `j` into parse state `i`. Returns true and
// updates state `i` in place when the merge is safe; otherwise returns
// false and leaves both states untouched.
//
// The two states must have identical nonterminal (goto) entries. A terminal
// entry present in only one of the two states is tolerated only when:
//   - the lookahead is not an external token,
//   - adding it would not pair lexically-incompatible tokens in one state
//     (see incompatible_token_indices_by_index),
//   - its final action is a reduce, and
//   - an identical entry already exists (under some lookahead) in the other
//     state, so the merge only broadens which lookaheads trigger it.
bool merge_parse_state(size_t i, size_t j) {
  ParseState &state = parse_table.states[i];
  ParseState &other = parse_table.states[j];
  if (state.nonterminal_entries != other.nonterminal_entries)
    return false;

  // Pass 1: every lookahead `state` has that `other` lacks must satisfy the
  // mergeability rules; shared lookaheads must agree exactly.
  for (auto &entry : state.terminal_entries) {
    Symbol lookahead = entry.first;
    const vector<ParseAction> &actions = entry.second.actions;
    auto &incompatible_token_indices = incompatible_token_indices_by_index[lookahead.index];

    const auto &other_entry = other.terminal_entries.find(lookahead);
    if (other_entry == other.terminal_entries.end()) {
      if (lookahead.is_external()) return false;
      if (!lookahead.is_built_in()) {
        for (Symbol::Index incompatible_index : incompatible_token_indices) {
          Symbol incompatible_symbol(incompatible_index, Symbol::Terminal);
          if (other.terminal_entries.count(incompatible_symbol)) return false;
        }
      }
      if (actions.back().type != ParseActionTypeReduce)
        return false;
      if (!has_entry(other, entry.second))
        return false;
    } else if (entry.second != other_entry->second) {
      // Both states know this lookahead but disagree on the action.
      return false;
    }
  }

  // Pass 2: symmetric check for lookaheads only `other` has. Collect them so
  // they can be copied into `state` once the merge is known to be safe.
  set<Symbol> symbols_to_merge;
  for (auto &entry : other.terminal_entries) {
    Symbol lookahead = entry.first;
    const vector<ParseAction> &actions = entry.second.actions;
    auto &incompatible_token_indices = incompatible_token_indices_by_index[lookahead.index];

    if (!state.terminal_entries.count(lookahead)) {
      if (lookahead.is_external()) return false;
      if (!lookahead.is_built_in()) {
        for (Symbol::Index incompatible_index : incompatible_token_indices) {
          Symbol incompatible_symbol(incompatible_index, Symbol::Terminal);
          if (state.terminal_entries.count(incompatible_symbol)) return false;
        }
      }
      if (actions.back().type != ParseActionTypeReduce)
        return false;
      if (!has_entry(state, entry.second))
        return false;
      symbols_to_merge.insert(lookahead);
    }
  }

  // All checks passed: absorb `other`'s extra lookaheads into `state`.
  for (const Symbol &lookahead : symbols_to_merge)
    state.terminal_entries[lookahead] = other.terminal_entries.find(lookahead)->second;
  return true;
}
string handle_conflict(const ParseItemSet &item_set, ParseStateId state_id,
Symbol lookahead) {
ParseTableEntry &entry = parse_table.states[state_id].terminal_entries[lookahead];
@ -574,7 +680,7 @@ class ParseTableBuilder {
switch (symbol.type) {
case Symbol::Terminal: {
const Variable &variable = lexical_grammar.variables[symbol.index];
const LexicalVariable &variable = lexical_grammar.variables[symbol.index];
if (variable.type == VariableTypeNamed)
return variable.name;
else

View file

@ -1,6 +1,6 @@
#include "compiler/build_tables/build_tables.h"
#include <tuple>
#include "compiler/build_tables/build_lex_table.h"
#include "compiler/build_tables/lex_table_builder.h"
#include "compiler/build_tables/build_parse_table.h"
#include "compiler/syntax_grammar.h"
#include "compiler/lexical_grammar.h"
@ -15,11 +15,13 @@ using std::vector;
using std::make_tuple;
tuple<ParseTable, LexTable, CompileError> build_tables(
const SyntaxGrammar &grammar, const LexicalGrammar &lex_grammar) {
auto parse_table_result = build_parse_table(grammar, lex_grammar);
const SyntaxGrammar &grammar,
const LexicalGrammar &lexical_grammar
) {
auto parse_table_result = build_parse_table(grammar, lexical_grammar);
ParseTable parse_table = parse_table_result.first;
const CompileError error = parse_table_result.second;
LexTable lex_table = build_lex_table(&parse_table, lex_grammar);
LexTable lex_table = LexTableBuilder::create(lexical_grammar)->build(&parse_table);
return make_tuple(parse_table, lex_table, error);
}

View file

@ -10,11 +10,10 @@ namespace build_tables {
bool LexConflictManager::resolve(const LexItemSet &item_set,
const AdvanceAction &new_action,
const AcceptTokenAction &old_action) {
if (!old_action.is_present())
return true;
if (new_action.precedence_range.max >= old_action.precedence) {
for (const LexItem &item : item_set.entries)
for (const LexItem &item : item_set.entries) {
possible_extensions[old_action.symbol.index].insert(item.lhs.index);
}
return true;
} else {
return false;
@ -23,30 +22,26 @@ bool LexConflictManager::resolve(const LexItemSet &item_set,
bool LexConflictManager::resolve(const AcceptTokenAction &new_action,
const AcceptTokenAction &old_action) {
if (!old_action.is_present())
return true;
int old_precedence = old_action.precedence;
int new_precedence = new_action.precedence;
bool result;
if (new_precedence > old_precedence)
if (new_action.precedence > old_action.precedence) {
result = true;
else if (new_precedence < old_precedence)
} else if (new_action.precedence < old_action.precedence) {
result = false;
else if (new_action.is_string && !old_action.is_string)
} else if (new_action.is_string && !old_action.is_string) {
result = true;
else if (old_action.is_string && !new_action.is_string)
} else if (old_action.is_string && !new_action.is_string) {
result = false;
else if (new_action.symbol.index < old_action.symbol.index)
} else if (new_action.symbol.index < old_action.symbol.index) {
result = true;
else
} else {
result = false;
}
if (result)
if (result) {
possible_homonyms[old_action.symbol.index].insert(new_action.symbol.index);
else
} else {
possible_homonyms[new_action.symbol.index].insert(old_action.symbol.index);
}
return result;
}

View file

@ -32,19 +32,15 @@ LexItem::CompletionStatus LexItem::completion_status() const {
CompletionStatus apply_to(const rules::Choice *rule) {
for (const auto &element : rule->elements) {
CompletionStatus status = apply(element);
if (status.is_done)
return status;
if (status.is_done) return status;
}
return { false, PrecedenceRange(), false };
return { false, PrecedenceRange() };
}
CompletionStatus apply_to(const rules::Metadata *rule) {
CompletionStatus result = apply(rule->rule);
if (result.is_done) {
if (result.precedence.empty && rule->params.has_precedence)
result.precedence.add(rule->params.precedence);
if (rule->params.is_string)
result.is_string = true;
if (result.is_done && result.precedence.empty && rule->params.has_precedence) {
result.precedence.add(rule->params.precedence);
}
return result;
}
@ -54,15 +50,16 @@ LexItem::CompletionStatus LexItem::completion_status() const {
}
CompletionStatus apply_to(const rules::Blank *rule) {
return { true, PrecedenceRange(), false };
return { true, PrecedenceRange() };
}
CompletionStatus apply_to(const rules::Seq *rule) {
CompletionStatus left_status = apply(rule->left);
if (left_status.is_done)
if (left_status.is_done) {
return apply(rule->right);
else
return { false, PrecedenceRange(), false };
} else {
return { false, PrecedenceRange() };
}
}
};
@ -80,8 +77,9 @@ bool LexItemSet::operator==(const LexItemSet &other) const {
LexItemSet::TransitionMap LexItemSet::transitions() const {
TransitionMap result;
for (const LexItem &item : entries)
for (const LexItem &item : entries) {
lex_item_transitions(&result, item);
}
return result;
}

View file

@ -19,7 +19,6 @@ class LexItem {
struct CompletionStatus {
bool is_done;
PrecedenceRange precedence;
bool is_string;
};
bool operator==(const LexItem &other) const;

View file

@ -0,0 +1,324 @@
#include "compiler/build_tables/lex_table_builder.h"
#include <climits>
#include <map>
#include <set>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "compiler/build_tables/lex_conflict_manager.h"
#include "compiler/build_tables/lex_item.h"
#include "compiler/parse_table.h"
#include "compiler/lexical_grammar.h"
#include "compiler/rules/built_in_symbols.h"
#include "compiler/rules/choice.h"
#include "compiler/rules/metadata.h"
#include "compiler/rules/repeat.h"
#include "compiler/rules/seq.h"
#include "compiler/rules/blank.h"
#include "compiler/rules/visitor.h"
namespace tree_sitter {
namespace build_tables {
using std::map;
using std::pair;
using std::set;
using std::string;
using std::vector;
using std::unordered_map;
using std::unique_ptr;
using rules::Blank;
using rules::Choice;
using rules::CharacterSet;
using rules::Repeat;
using rules::Symbol;
using rules::Metadata;
using rules::Seq;
// Rule visitor that accumulates every character a rule can begin with.
// For a Seq only the left side is visited; Choice/Repeat/Metadata recurse
// into all of their children; CharacterSets are unioned into `result`.
class StartingCharacterAggregator : public rules::RuleFn<void> {
  void apply_to(const rules::CharacterSet *rule) {
    result.add_set(*rule);
  }

  void apply_to(const rules::Metadata *rule) {
    apply(rule->rule);
  }

  void apply_to(const rules::Repeat *rule) {
    apply(rule->content);
  }

  void apply_to(const rules::Choice *rule) {
    for (const rule_ptr &element : rule->elements) {
      apply(element);
    }
  }

  void apply_to(const rules::Seq *rule) {
    // Only the left-most sub-rule can contribute starting characters.
    apply(rule->left);
  }

 public:
  // Union of all character sets reachable at the start of the visited rules.
  CharacterSet result;
};
// Builds the lex table (the lexer's state machine) from a parse table: one
// lex state is created per distinct LexItemSet, where an item set describes
// all of the tokens that might be matched in some parse state. Conflicts
// between tokens are resolved through LexConflictManager, and tokens that
// lose such a conflict are recorded in `shadowed_token_indices`.
class LexTableBuilderImpl : public LexTableBuilder {
  LexTable lex_table;
  const LexicalGrammar grammar;

  // One `Repeat` rule per grammar separator, plus a trailing `Blank` so that
  // a token may also begin with no separator at all.
  vector<rule_ptr> separator_rules;

  // The set of characters that can start any separator rule.
  CharacterSet first_separator_characters;

  LexConflictManager conflict_manager;

  // Item set -> id of the lex state built for it, so identical item sets
  // share a single state.
  unordered_map<LexItemSet, LexStateId> lex_state_ids;

 public:
  // shadowed_token_indices[i] is set when token i could be hidden by another
  // token's accept action while states were built; read by detect_conflict().
  vector<bool> shadowed_token_indices;

  LexTableBuilderImpl(const LexicalGrammar &grammar) : grammar(grammar) {
    StartingCharacterAggregator starting_character_aggregator;
    for (const rule_ptr &rule : grammar.separators) {
      separator_rules.push_back(Repeat::build(rule));
      starting_character_aggregator.apply(rule);
    }
    separator_rules.push_back(Blank::build());
    first_separator_characters = starting_character_aggregator.result;
    shadowed_token_indices.resize(grammar.variables.size());
  }

  // Assigns a lex state to every parse state based on the terminals that the
  // parse state expects, then marks fragile tokens in the parse table and
  // merges duplicate lex states.
  LexTable build(ParseTable *parse_table) {
    for (ParseState &parse_state : parse_table->states) {
      parse_state.lex_state_id = add_lex_state(
        item_set_for_terminals(parse_state.terminal_entries)
      );
    }
    mark_fragile_tokens(parse_table);
    remove_duplicate_lex_states(parse_table);
    return lex_table;
  }

  // Builds a throw-away lex table containing only the two given tokens and
  // reports whether `right` ends up shadowed in the process.
  // NOTE(review): presumably used when deciding whether merging parse states
  // would introduce a new lexical conflict — confirm at the callers.
  bool detect_conflict(Symbol::Index left, Symbol::Index right) {
    clear();
    map<Symbol, ParseTableEntry> terminals;
    // operator[] default-constructs the entries; only the keys matter here.
    terminals[Symbol(left, Symbol::Terminal)];
    terminals[Symbol(right, Symbol::Terminal)];
    add_lex_state(item_set_for_terminals(terminals));
    return shadowed_token_indices[right];
  }

  // Returns the id of the lex state for this item set, creating the state
  // (together with its accept and advance actions, which may recursively
  // create further states) if it does not exist yet.
  LexStateId add_lex_state(const LexItemSet &item_set) {
    const auto &pair = lex_state_ids.find(item_set);
    if (pair == lex_state_ids.end()) {
      LexStateId state_id = lex_table.states.size();
      lex_table.states.push_back(LexState());
      lex_state_ids[item_set] = state_id;
      add_accept_token_actions(item_set, state_id);
      add_advance_actions(item_set, state_id);
      return state_id;
    } else {
      return pair->second;
    }
  }

  // Resets everything built so far; used by detect_conflict() before each
  // isolated two-token run.
  void clear() {
    lex_table.states.clear();
    lex_state_ids.clear();
    shadowed_token_indices.assign(grammar.variables.size(), false);
  }

 private:
  // For every character-set transition out of `item_set`, adds an advance
  // action to the state. If the state already accepts a token, the conflict
  // manager decides whether advancing beats accepting; tokens hidden by that
  // decision are recorded in shadowed_token_indices.
  void add_advance_actions(const LexItemSet &item_set, LexStateId state_id) {
    for (const auto &pair : item_set.transitions()) {
      const CharacterSet &characters = pair.first;
      const LexItemSet::Transition &transition = pair.second;
      AdvanceAction action(-1, transition.precedence, transition.in_main_token);
      auto current_action = lex_table.states[state_id].accept_action;
      if (current_action.is_present()) {
        bool prefer_advancing = conflict_manager.resolve(transition.destination, action, current_action);
        bool matches_accepted_token = false;
        for (const LexItem &item : transition.destination.entries) {
          if (item.lhs == current_action.symbol) {
            matches_accepted_token = true;
          } else if (!transition.in_main_token && !item.lhs.is_built_in() && !prefer_advancing) {
            // A separator-context transition toward some other token that is
            // skipped below (accepting wins): that other token is shadowed.
            shadowed_token_indices[item.lhs.index] = true;
          }
        }
        // If characters that can start a separator advance only toward other
        // tokens, the currently-accepted token is considered shadowed too.
        if (!matches_accepted_token && characters.intersects(first_separator_characters)) {
          shadowed_token_indices[current_action.symbol.index] = true;
        }
        if (!prefer_advancing) {
          continue;
        }
      }
      action.state_index = add_lex_state(transition.destination);
      lex_table.states[state_id].advance_actions[characters] = action;
    }
  }

  // Sets the state's accept action for each item whose rule has matched
  // completely, using the conflict manager to pick a winner when several
  // items are done in the same state.
  void add_accept_token_actions(const LexItemSet &item_set, LexStateId state_id) {
    for (const LexItem &item : item_set.entries) {
      LexItem::CompletionStatus completion_status = item.completion_status();
      if (completion_status.is_done) {
        AcceptTokenAction action(item.lhs, completion_status.precedence.max,
                                 item.lhs.is_built_in() ||
                                   grammar.variables[item.lhs.index].is_string);
        auto current_action = lex_table.states[state_id].accept_action;
        if (current_action.is_present()) {
          if (!conflict_manager.resolve(action, current_action)) {
            continue;
          }
        }
        lex_table.states[state_id].accept_action = action;
      }
    }
  }

  // Clears the `reusable` flag on each parse-table entry whose token has a
  // possible homonym also expected in the same state, and sets
  // `depends_on_lookahead` when a possible extension of the token is also
  // expected there. The homonym/extension sets were recorded by the conflict
  // manager while the lex states were built.
  void mark_fragile_tokens(ParseTable *parse_table) {
    for (ParseState &state : parse_table->states) {
      for (auto &entry : state.terminal_entries) {
        Symbol symbol = entry.first;
        if (symbol.is_token()) {
          auto homonyms = conflict_manager.possible_homonyms.find(symbol.index);
          if (homonyms != conflict_manager.possible_homonyms.end())
            for (Symbol::Index homonym : homonyms->second)
              if (state.terminal_entries.count(Symbol(homonym, Symbol::Terminal))) {
                entry.second.reusable = false;
                break;
              }
          if (!entry.second.reusable)
            continue;
          auto extensions = conflict_manager.possible_extensions.find(symbol.index);
          if (extensions != conflict_manager.possible_extensions.end())
            for (Symbol::Index extension : extensions->second)
              if (state.terminal_entries.count(Symbol(extension, Symbol::Terminal))) {
                entry.second.depends_on_lookahead = true;
                break;
              }
        }
      }
    }
  }

  // Merges lex states that compare equal, iterating to a fixed point, then
  // rewrites both the advance-action targets and the parse states'
  // lex_state_id fields to the surviving state ids.
  void remove_duplicate_lex_states(ParseTable *parse_table) {
    // Normalize accept-action fields first so that states differing only in
    // these fields can still compare equal and be merged.
    for (LexState &state : lex_table.states) {
      state.accept_action.is_string = false;
      state.accept_action.precedence = 0;
    }
    map<LexStateId, LexStateId> replacements;
    while (true) {
      // Map each duplicate state i to the earliest state j < i equal to it.
      map<LexStateId, LexStateId> duplicates;
      for (LexStateId i = 0, size = lex_table.states.size(); i < size; i++) {
        for (LexStateId j = 0; j < i; j++) {
          if (!duplicates.count(j) && lex_table.states[j] == lex_table.states[i]) {
            duplicates.insert({ i, j });
            break;
          }
        }
      }
      if (duplicates.empty()) break;
      // Compute each state's new index once the duplicates are erased.
      map<size_t, size_t> new_replacements;
      for (LexStateId i = 0, size = lex_table.states.size(); i < size; i++) {
        LexStateId new_state_index = i;
        auto duplicate = duplicates.find(i);
        if (duplicate != duplicates.end()) {
          new_state_index = duplicate->second;
        }
        // Shift down by the number of duplicates removed before this index.
        size_t prior_removed = 0;
        for (const auto &duplicate : duplicates) {
          if (duplicate.first >= new_state_index) break;
          prior_removed++;
        }
        new_state_index -= prior_removed;
        new_replacements.insert({ i, new_state_index });
        replacements.insert({ i, new_state_index });
        // Keep earlier replacement targets pointing at surviving indices.
        for (auto &replacement : replacements) {
          if (replacement.second == i) {
            replacement.second = new_state_index;
          }
        }
      }
      // Retarget advance actions to the renumbered states.
      for (auto &state : lex_table.states) {
        for (auto &entry : state.advance_actions) {
          auto new_replacement = new_replacements.find(entry.second.state_index);
          if (new_replacement != new_replacements.end()) {
            entry.second.state_index = new_replacement->second;
          }
        }
      }
      // Erase from the back so lower indices stay valid during erasure.
      for (auto i = duplicates.rbegin(); i != duplicates.rend(); ++i) {
        lex_table.states.erase(lex_table.states.begin() + i->first);
      }
    }
    // Point each parse state at its (possibly merged) lex state.
    for (ParseState &parse_state : parse_table->states) {
      auto replacement = replacements.find(parse_state.lex_state_id);
      if (replacement != replacements.end()) {
        parse_state.lex_state_id = replacement->second;
      }
    }
  }

  // Builds the item set for a group of expected tokens: every alternative of
  // every token rule is preceded by each separator rule. Metadata::separator
  // wraps the whole sequence and Metadata::main_token wraps the token rule
  // itself; transitions later expose an in_main_token flag (used by
  // add_advance_actions).
  LexItemSet item_set_for_terminals(const map<Symbol, ParseTableEntry> &terminals) {
    LexItemSet result;
    for (const auto &pair : terminals) {
      Symbol symbol = pair.first;
      if (symbol.is_token()) {
        for (const rule_ptr &rule : rules_for_symbol(symbol)) {
          for (const rule_ptr &separator_rule : separator_rules) {
            result.entries.insert(LexItem(
              symbol,
              Metadata::separator(
                Seq::build({
                  separator_rule,
                  Metadata::main_token(rule) }))));
          }
        }
      }
    }
    return result;
  }

  // Returns the alternatives for a token's rule: END_OF_INPUT matches only
  // the NUL character, a Choice is split into its elements, and any other
  // rule stands alone.
  vector<rule_ptr> rules_for_symbol(const rules::Symbol &symbol) {
    if (symbol == rules::END_OF_INPUT())
      return { CharacterSet().include(0).copy() };
    rule_ptr rule = grammar.variables[symbol.index].rule;
    auto choice = rule->as<Choice>();
    if (choice)
      return choice->elements;
    else
      return { rule };
  }
};
// Factory for the concrete builder; ownership passes to the caller.
unique_ptr<LexTableBuilder> LexTableBuilder::create(const LexicalGrammar &grammar) {
  unique_ptr<LexTableBuilder> builder(new LexTableBuilderImpl(grammar));
  return builder;
}
// Non-virtual dispatch to the implementation. The downcast is safe because
// the constructor is protected and instances are only obtained via create(),
// which always constructs a LexTableBuilderImpl.
LexTable LexTableBuilder::build(ParseTable *parse_table) {
  return static_cast<LexTableBuilderImpl *>(this)->build(parse_table);
}
// Forwards to the implementation (same safe downcast as build() above);
// see LexTableBuilderImpl::detect_conflict for the semantics.
bool LexTableBuilder::detect_conflict(Symbol::Index left, Symbol::Index right) {
  return static_cast<LexTableBuilderImpl *>(this)->detect_conflict(left, right);
}
} // namespace build_tables
} // namespace tree_sitter

View file

@ -0,0 +1,26 @@
#ifndef COMPILER_BUILD_TABLES_LEX_TABLE_BUILDER_H_
#define COMPILER_BUILD_TABLES_LEX_TABLE_BUILDER_H_
#include <memory>
#include "compiler/lex_table.h"
namespace tree_sitter {
struct ParseTable;
struct LexicalGrammar;
namespace build_tables {
// Builds a LexTable from a ParseTable. The constructor is protected:
// instances are obtained through the create() factory, and the concrete
// implementation lives in the corresponding .cc file.
class LexTableBuilder {
 public:
  static std::unique_ptr<LexTableBuilder> create(const LexicalGrammar &);

  // Builds the lex table and assigns a lex state id to each parse state.
  LexTable build(ParseTable *);

  // Reports whether the second token would be shadowed when lexing for both
  // tokens at once; see the implementation for details.
  bool detect_conflict(rules::Symbol::Index, rules::Symbol::Index);

 protected:
  LexTableBuilder() = default;
};
} // namespace build_tables
} // namespace tree_sitter
#endif // COMPILER_BUILD_TABLES_LEX_TABLE_BUILDER_H_

View file

@ -12,8 +12,7 @@
namespace tree_sitter {
namespace build_tables {
class ParseItem {
public:
struct ParseItem {
ParseItem();
ParseItem(const rules::Symbol &, const Production &, unsigned int);
@ -36,8 +35,7 @@ class ParseItem {
unsigned int step_index;
};
class ParseItemSet {
public:
struct ParseItemSet {
ParseItemSet();
explicit ParseItemSet(const std::map<ParseItem, LookaheadSet> &);

View file

@ -1,89 +0,0 @@
#include "compiler/build_tables/recovery_tokens.h"
#include "compiler/lexical_grammar.h"
#include "compiler/rules/choice.h"
#include "compiler/rules/character_set.h"
#include "compiler/rules/repeat.h"
#include "compiler/rules/visitor.h"
#include "compiler/rules/seq.h"
#include "compiler/rules/metadata.h"
namespace tree_sitter {
namespace build_tables {
using rules::Symbol;
using std::set;
// Rule visitor that collects the characters a rule can contain. The template
// flags select which side(s) of a Seq are visited, yielding the three
// specializations below: first characters (left only), last characters
// (right only), or all characters (both).
// NOTE(review): visiting only one side of each Seq may miss characters when
// that side can match empty — confirm whether grammars here guarantee
// non-empty Seq sides.
template <bool left, bool right>
class CharacterAggregator : public rules::RuleFn<void> {
  void apply_to(const rules::Seq *rule) {
    // Recurse only into the side(s) enabled by the template flags.
    if (left)
      apply(rule->left);
    if (right)
      apply(rule->right);
  }

  void apply_to(const rules::Choice *rule) {
    for (const rule_ptr &element : rule->elements)
      apply(element);
  }

  void apply_to(const rules::Repeat *rule) {
    apply(rule->content);
  }

  void apply_to(const rules::Metadata *rule) {
    apply(rule->rule);
  }

  void apply_to(const rules::CharacterSet *rule) {
    result.add_set(*rule);
  }

 public:
  // Union of every character set encountered during the traversal.
  rules::CharacterSet result;
};

// Characters a rule can begin with.
class FirstCharacters : public CharacterAggregator<true, false> {};
// Characters a rule can end with.
class LastCharacters : public CharacterAggregator<false, true> {};
// Every character a rule mentions.
class AllCharacters : public CharacterAggregator<true, true> {};
// Selects the tokens that are usable for error recovery: a token qualifies
// when it begins AND ends with an explicit (non-wildcard) character set that
// is disjoint from the separator characters, or when no character it
// mentions overlaps the separator characters at all.
//
// Returns the terminal Symbols (indexed by position in grammar.variables)
// that satisfy the criterion.
set<Symbol> recovery_tokens(const LexicalGrammar &grammar) {
  set<Symbol> result;

  // Every character that any separator rule can contain.
  AllCharacters all_separator_characters;
  for (const rule_ptr &separator : grammar.separators)
    all_separator_characters.apply(separator);

  for (size_t i = 0; i < grammar.variables.size(); i++) {
    const Variable &variable = grammar.variables[i];

    FirstCharacters first_characters;
    first_characters.apply(variable.rule);
    LastCharacters last_characters;
    last_characters.apply(variable.rule);
    AllCharacters all_characters;
    all_characters.apply(variable.rule);

    // The token must start with an explicit character set that cannot be
    // confused with a separator...
    bool has_distinct_start =
      !first_characters.result.includes_all &&
      !first_characters.result.intersects(all_separator_characters.result);

    // ...and likewise end with one...
    bool has_distinct_end =
      !last_characters.result.includes_all &&
      !last_characters.result.intersects(all_separator_characters.result);

    // ...unless it never overlaps the separator characters anywhere.
    bool has_no_separators =
      !all_characters.result.intersects(all_separator_characters.result);

    if ((has_distinct_start && has_distinct_end) || has_no_separators)
      result.insert(Symbol(i, Symbol::Terminal));
  }
  return result;
}
} // namespace build_tables
} // namespace tree_sitter

View file

@ -1,19 +0,0 @@
#ifndef COMPILER_BUILD_TABLES_DOES_MATCH_ANY_LINE_H_
#define COMPILER_BUILD_TABLES_DOES_MATCH_ANY_LINE_H_
#include "compiler/rule.h"
#include "compiler/rules/symbol.h"
#include <set>
namespace tree_sitter {
struct LexicalGrammar;
namespace build_tables {
std::set<rules::Symbol> recovery_tokens(const LexicalGrammar &);
} // namespace build_tables
} // namespace tree_sitter
#endif // COMPILER_BUILD_TABLES_DOES_MATCH_ANY_LINE_H_

View file

@ -1,65 +0,0 @@
#ifndef COMPILER_BUILD_TABLES_REMOVE_DUPLICATE_STATES_H_
#define COMPILER_BUILD_TABLES_REMOVE_DUPLICATE_STATES_H_
#include <map>
#include <vector>
namespace tree_sitter {
namespace build_tables {
// Repeatedly folds pairs of equivalent states in `table` (as decided by
// table->merge_state) until no duplicates remain, renumbering the surviving
// states and rewriting every outgoing state reference. Returns a map from
// each replaced state index to the index that replaced it.
//
// TableType must provide a `states` vector, `merge_state(j, i)` returning
// whether state i can be folded into the earlier state j, and states with an
// `each_referenced_state(fn)` member for rewriting outgoing state indices.
template <typename TableType>
std::map<size_t, size_t> remove_duplicate_states(TableType *table) {
  std::map<size_t, size_t> replacements;
  while (true) {
    // Map each duplicate state i to the earliest state j < i it merges into.
    std::map<size_t, size_t> duplicates;
    for (size_t i = 0, size = table->states.size(); i < size; i++)
      for (size_t j = 0; j < i; j++)
        if (!duplicates.count(j) && table->merge_state(j, i)) {
          duplicates.insert({ i, j });
          break;
        }
    if (duplicates.empty())
      break;

    // Compute each state's new index once the duplicates are erased.
    std::map<size_t, size_t> new_replacements;
    for (size_t i = 0, size = table->states.size(); i < size; i++) {
      size_t new_state_index = i;
      auto duplicate = duplicates.find(i);
      if (duplicate != duplicates.end())
        new_state_index = duplicate->second;
      // Shift down by the number of duplicates removed before this index.
      size_t prior_removed = 0;
      for (const auto &duplicate : duplicates) {
        if (duplicate.first >= new_state_index)
          break;
        prior_removed++;
      }
      new_state_index -= prior_removed;
      new_replacements.insert({ i, new_state_index });
      replacements.insert({ i, new_state_index });
      // Keep earlier replacement targets pointing at surviving indices.
      for (auto &replacement : replacements)
        if (replacement.second == i)
          replacement.second = new_state_index;
    }

    // Rewrite every outgoing state reference to the new numbering.
    for (auto &state : table->states)
      state.each_referenced_state([&new_replacements](int64_t *state_index) {
        auto new_replacement = new_replacements.find(*state_index);
        if (new_replacement != new_replacements.end())
          *state_index = new_replacement->second;
      });

    // Erase from the back so lower indices stay valid during erasure.
    for (auto i = duplicates.rbegin(); i != duplicates.rend(); ++i)
      table->states.erase(table->states.begin() + i->first);
  }
  return replacements;
}
} // namespace build_tables
} // namespace tree_sitter
#endif // COMPILER_BUILD_TABLES_REMOVE_DUPLICATE_STATES_H_

View file

@ -26,8 +26,6 @@ using std::vector;
using util::escape_char;
using rules::Symbol;
static Variable EOF_ENTRY("end", VariableTypeNamed, rule_ptr());
static const map<char, string> REPLACEMENTS({
{ '~', "TILDE" },
{ '`', "BQUOTE" },
@ -561,7 +559,7 @@ class CCodeGenerator {
return { variable.name, variable.type };
}
case Symbol::Terminal: {
const Variable &variable = lexical_grammar.variables[symbol.index];
const LexicalVariable &variable = lexical_grammar.variables[symbol.index];
return { variable.name, variable.type };
}
case Symbol::External:

View file

@ -7,8 +7,8 @@ namespace tree_sitter {
struct LexicalGrammar;
struct SyntaxGrammar;
class LexTable;
class ParseTable;
struct LexTable;
struct ParseTable;
namespace generate_code {

View file

@ -44,35 +44,10 @@ bool AcceptTokenAction::operator==(const AcceptTokenAction &other) const {
LexState::LexState() : is_token_start(false) {}
set<CharacterSet> LexState::expected_inputs() const {
set<CharacterSet> result;
for (auto &pair : advance_actions)
result.insert(pair.first);
return result;
}
bool LexState::operator==(const LexState &other) const {
return advance_actions == other.advance_actions &&
accept_action == other.accept_action &&
is_token_start == other.is_token_start;
}
void LexState::each_referenced_state(function<void(LexStateId *)> fn) {
for (auto &entry : advance_actions)
fn(&entry.second.state_index);
}
LexStateId LexTable::add_state() {
states.push_back(LexState());
return states.size() - 1;
}
LexState &LexTable::state(LexStateId id) {
return states[id];
}
bool LexTable::merge_state(size_t i, size_t j) {
return states[i] == states[j];
}
} // namespace tree_sitter

View file

@ -13,17 +13,9 @@ namespace tree_sitter {
typedef int64_t LexStateId;
typedef enum {
LexActionTypeError,
LexActionTypeAccept,
LexActionTypeAcceptFragile,
LexActionTypeAdvance
} LexActionType;
struct AdvanceAction {
AdvanceAction();
AdvanceAction(size_t, PrecedenceRange, bool);
bool operator==(const AdvanceAction &other) const;
LexStateId state_index;
@ -34,7 +26,6 @@ struct AdvanceAction {
struct AcceptTokenAction {
AcceptTokenAction();
AcceptTokenAction(rules::Symbol, int, bool);
bool is_present() const;
bool operator==(const AcceptTokenAction &action) const;
@ -43,31 +34,17 @@ struct AcceptTokenAction {
bool is_string;
};
} // namespace tree_sitter
namespace std {} // namespace std
namespace tree_sitter {
class LexState {
public:
struct LexState {
LexState();
std::set<rules::CharacterSet> expected_inputs() const;
bool operator==(const LexState &) const;
void each_referenced_state(std::function<void(LexStateId *)>);
std::map<rules::CharacterSet, AdvanceAction> advance_actions;
AcceptTokenAction accept_action;
bool is_token_start;
};
class LexTable {
public:
LexStateId add_state();
LexState &state(LexStateId state_id);
struct LexTable {
std::vector<LexState> states;
bool merge_state(size_t i, size_t j);
};
} // namespace tree_sitter

View file

@ -9,8 +9,15 @@
namespace tree_sitter {
struct LexicalVariable {
std::string name;
VariableType type;
rule_ptr rule;
bool is_string;
};
struct LexicalGrammar {
std::vector<Variable> variables;
std::vector<LexicalVariable> variables;
std::vector<rule_ptr> separators;
};

View file

@ -148,13 +148,6 @@ bool ParseState::has_shift_action() const {
return (!nonterminal_entries.empty());
}
set<Symbol> ParseState::expected_inputs() const {
set<Symbol> result;
for (auto &entry : terminal_entries)
result.insert(entry.first);
return result;
}
void ParseState::each_referenced_state(function<void(ParseStateId *)> fn) {
for (auto &entry : terminal_entries)
for (ParseAction &action : entry.second.actions)
@ -169,18 +162,6 @@ bool ParseState::operator==(const ParseState &other) const {
nonterminal_entries == other.nonterminal_entries;
}
set<Symbol> ParseTable::all_symbols() const {
set<Symbol> result;
for (auto &pair : symbols)
result.insert(pair.first);
return result;
}
ParseStateId ParseTable::add_state() {
states.push_back(ParseState());
return states.size() - 1;
}
ParseAction &ParseTable::add_terminal_action(ParseStateId state_id,
Symbol lookahead,
ParseAction action) {
@ -201,58 +182,4 @@ void ParseTable::set_nonterminal_action(ParseStateId state_id,
states[state_id].nonterminal_entries[lookahead] = next_state_id;
}
static bool has_entry(const ParseState &state, const ParseTableEntry &entry) {
for (const auto &pair : state.terminal_entries)
if (pair.second == entry)
return true;
return false;
}
bool ParseTable::merge_state(size_t i, size_t j) {
ParseState &state = states[i];
ParseState &other = states[j];
if (state.nonterminal_entries != other.nonterminal_entries)
return false;
for (auto &entry : state.terminal_entries) {
Symbol lookahead = entry.first;
const vector<ParseAction> &actions = entry.second.actions;
const auto &other_entry = other.terminal_entries.find(lookahead);
if (other_entry == other.terminal_entries.end()) {
if (mergeable_symbols.count(lookahead) == 0 && !lookahead.is_built_in())
return false;
if (actions.back().type != ParseActionTypeReduce)
return false;
if (!has_entry(other, entry.second))
return false;
} else if (entry.second != other_entry->second) {
return false;
}
}
set<Symbol> symbols_to_merge;
for (auto &entry : other.terminal_entries) {
Symbol lookahead = entry.first;
const vector<ParseAction> &actions = entry.second.actions;
if (!state.terminal_entries.count(lookahead)) {
if (mergeable_symbols.count(lookahead) == 0 && !lookahead.is_built_in())
return false;
if (actions.back().type != ParseActionTypeReduce)
return false;
if (!has_entry(state, entry.second))
return false;
symbols_to_merge.insert(lookahead);
}
}
for (const Symbol &lookahead : symbols_to_merge)
state.terminal_entries[lookahead] = other.terminal_entries.find(lookahead)->second;
return true;
}
} // namespace tree_sitter

View file

@ -23,13 +23,11 @@ enum ParseActionType {
ParseActionTypeRecover,
};
class ParseAction {
struct ParseAction {
ParseAction();
ParseAction(ParseActionType type, ParseStateId state_index,
rules::Symbol symbol, size_t consumed_symbol_count,
const Production *);
public:
ParseAction();
static ParseAction Accept();
static ParseAction Error();
static ParseAction Shift(ParseStateId state_index);
@ -39,7 +37,6 @@ class ParseAction {
static ParseAction ShiftExtra();
bool operator==(const ParseAction &) const;
bool operator<(const ParseAction &) const;
rules::Associativity associativity() const;
int precedence() const;
@ -47,30 +44,26 @@ class ParseAction {
bool extra;
bool fragile;
ParseStateId state_index;
rules::Symbol symbol;
size_t consumed_symbol_count;
const Production *production;
};
struct ParseTableEntry {
std::vector<ParseAction> actions;
bool reusable;
bool depends_on_lookahead;
ParseTableEntry();
ParseTableEntry(const std::vector<ParseAction> &, bool, bool);
bool operator==(const ParseTableEntry &other) const;
inline bool operator!=(const ParseTableEntry &other) const {
return !operator==(other);
}
std::vector<ParseAction> actions;
bool reusable;
bool depends_on_lookahead;
};
class ParseState {
public:
struct ParseState {
ParseState();
std::set<rules::Symbol> expected_inputs() const;
bool operator==(const ParseState &) const;
bool merge(const ParseState &);
void each_referenced_state(std::function<void(ParseStateId *)>);
@ -87,18 +80,12 @@ struct ParseTableSymbolMetadata {
bool structural;
};
class ParseTable {
public:
std::set<rules::Symbol> all_symbols() const;
ParseStateId add_state();
struct ParseTable {
ParseAction &add_terminal_action(ParseStateId state_id, rules::Symbol, ParseAction);
void set_nonterminal_action(ParseStateId, rules::Symbol::Index, ParseStateId);
bool merge_state(size_t i, size_t j);
std::vector<ParseState> states;
std::map<rules::Symbol, ParseTableSymbolMetadata> symbols;
std::set<rules::Symbol> mergeable_symbols;
};
} // namespace tree_sitter

View file

@ -41,10 +41,17 @@ class ExpandRepeats : public rules::IdentityRuleFn {
string helper_rule_name = rule_name + "_repeat" + to_string(++repeat_count);
Symbol repeat_symbol(offset + index, Symbol::NonTerminal);
existing_repeats.push_back({ rule->copy(), repeat_symbol });
aux_rules.push_back(
Variable(helper_rule_name, VariableTypeAuxiliary,
Choice::build({ Seq::build({ repeat_symbol.copy(), inner_rule }),
inner_rule })));
aux_rules.push_back(Variable{
helper_rule_name,
VariableTypeAuxiliary,
Choice::build({
Seq::build({
repeat_symbol.copy(),
inner_rule,
}),
inner_rule,
})
});
return repeat_symbol.copy();
}

View file

@ -67,11 +67,11 @@ pair<LexicalGrammar, CompileError> expand_tokens(const LexicalGrammar &grammar)
LexicalGrammar result;
ExpandTokens expander;
for (const Variable &variable : grammar.variables) {
for (const LexicalVariable &variable : grammar.variables) {
auto rule = expander.apply(variable.rule);
if (expander.error.type)
return { result, expander.error };
result.variables.push_back(Variable(variable.name, variable.type, rule));
result.variables.push_back({variable.name, variable.type, rule, variable.is_string});
}
for (auto &sep : grammar.separators) {

View file

@ -56,7 +56,7 @@ class SymbolReplacer : public rules::IdentityRuleFn {
class TokenExtractor : public rules::IdentityRuleFn {
using rules::IdentityRuleFn::apply_to;
rule_ptr apply_to_token(const Rule *input, VariableType entry_type) {
rule_ptr apply_to_token(const Rule *input, VariableType entry_type, bool is_string) {
for (size_t i = 0; i < tokens.size(); i++)
if (tokens[i].rule->operator==(*input)) {
token_usage_counts[i]++;
@ -65,29 +65,30 @@ class TokenExtractor : public rules::IdentityRuleFn {
rule_ptr rule = input->copy();
size_t index = tokens.size();
tokens.push_back(Variable(token_description(rule), entry_type, rule));
tokens.push_back({token_description(rule), entry_type, rule, is_string});
token_usage_counts.push_back(1);
return make_shared<Symbol>(index, Symbol::Terminal);
}
rule_ptr apply_to(const rules::String *rule) {
return apply_to_token(rule, VariableTypeAnonymous);
return apply_to_token(rule, VariableTypeAnonymous, true);
}
rule_ptr apply_to(const rules::Pattern *rule) {
return apply_to_token(rule, VariableTypeAuxiliary);
return apply_to_token(rule, VariableTypeAuxiliary, false);
}
rule_ptr apply_to(const rules::Metadata *rule) {
if (rule->params.is_token)
return apply_to_token(rule->rule.get(), VariableTypeAuxiliary);
else
if (rule->params.is_token) {
return apply_to_token(rule->rule.get(), VariableTypeAuxiliary, false);
} else {
return rules::IdentityRuleFn::apply_to(rule);
}
}
public:
vector<size_t> token_usage_counts;
vector<Variable> tokens;
vector<LexicalVariable> tokens;
};
static CompileError extra_token_error(const string &message) {
@ -106,8 +107,11 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
*/
vector<Variable> processed_variables;
for (const Variable &variable : grammar.variables)
processed_variables.push_back(
Variable(variable.name, variable.type, extractor.apply(variable.rule)));
processed_variables.push_back(Variable{
variable.name,
variable.type,
extractor.apply(variable.rule)
});
lexical_grammar.variables = extractor.tokens;
/*
@ -139,8 +143,9 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
for (const ConflictSet &conflict_set : grammar.expected_conflicts) {
ConflictSet new_conflict_set;
for (const Symbol &symbol : conflict_set)
for (const Symbol &symbol : conflict_set) {
new_conflict_set.insert(symbol_replacer.replace_symbol(symbol));
}
syntax_grammar.expected_conflicts.insert(new_conflict_set);
}
@ -154,7 +159,7 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
for (const rule_ptr &rule : grammar.extra_tokens) {
int i = 0;
bool used_elsewhere_in_grammar = false;
for (const Variable &variable : lexical_grammar.variables) {
for (const LexicalVariable &variable : lexical_grammar.variables) {
if (variable.rule->operator==(*rule)) {
syntax_grammar.extra_tokens.insert(Symbol(i, Symbol::Terminal));
used_elsewhere_in_grammar = true;
@ -171,9 +176,10 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
}
auto symbol = rule->as<Symbol>();
if (!symbol)
if (!symbol) {
return make_tuple(syntax_grammar, lexical_grammar,
extra_token_error(rule->to_string()));
}
Symbol new_symbol = symbol_replacer.replace_symbol(*symbol);
if (new_symbol.is_non_terminal()) {

View file

@ -25,8 +25,11 @@ class FlattenRule : public rules::RuleFn<void> {
Production production;
void apply_to(const rules::Symbol *sym) {
production.push_back(ProductionStep(*sym, precedence_stack.back(),
associativity_stack.back()));
production.push_back(ProductionStep{
*sym,
precedence_stack.back(),
associativity_stack.back()
});
}
void apply_to(const rules::Metadata *metadata) {
@ -85,7 +88,7 @@ SyntaxVariable flatten_rule(const Variable &variable) {
}
}
return SyntaxVariable(variable.name, variable.type, productions);
return SyntaxVariable{variable.name, variable.type, productions};
}
pair<SyntaxGrammar, CompileError> flatten_grammar(const InitialSyntaxGrammar &grammar) {

View file

@ -8,7 +8,7 @@ namespace prepare_grammar {
LexicalGrammar normalize_rules(const LexicalGrammar &input_grammar) {
LexicalGrammar result(input_grammar);
for (Variable &variable : result.variables) {
for (LexicalVariable &variable : result.variables) {
variable.rule = rules::Choice::build(extract_choices(variable.rule));
}

View file

@ -7,20 +7,6 @@
namespace tree_sitter {
using std::string;
using std::to_string;
using std::pair;
using std::vector;
using std::set;
SyntaxVariable::SyntaxVariable(const string &name, VariableType type,
const vector<Production> &productions)
: name(name), productions(productions), type(type) {}
ProductionStep::ProductionStep(const rules::Symbol &symbol, int precedence,
rules::Associativity associativity)
: symbol(symbol), precedence(precedence), associativity(associativity) {}
bool ExternalToken::operator==(const ExternalToken &other) const {
return name == other.name && type == other.type &&
corresponding_internal_token == other.corresponding_internal_token;

View file

@ -11,15 +11,14 @@
namespace tree_sitter {
struct ExternalToken {
bool operator==(const ExternalToken &) const;
std::string name;
VariableType type;
rules::Symbol corresponding_internal_token;
bool operator==(const ExternalToken &) const;
};
struct ProductionStep {
ProductionStep(const rules::Symbol &, int, rules::Associativity);
bool operator==(const ProductionStep &) const;
rules::Symbol symbol;
@ -30,12 +29,9 @@ struct ProductionStep {
typedef std::vector<ProductionStep> Production;
struct SyntaxVariable {
SyntaxVariable(const std::string &, VariableType,
const std::vector<Production> &);
std::string name;
std::vector<Production> productions;
VariableType type;
std::vector<Production> productions;
};
typedef std::set<rules::Symbol> ConflictSet;

View file

@ -1,11 +0,0 @@
#include "compiler/variable.h"
#include <string>
namespace tree_sitter {
using std::string;
Variable::Variable(const string &name, VariableType type, const rule_ptr &rule)
: name(name), rule(rule), type(type) {}
} // namespace tree_sitter

View file

@ -15,11 +15,9 @@ enum VariableType {
};
struct Variable {
Variable(const std::string &, VariableType, const rule_ptr &);
std::string name;
rule_ptr rule;
VariableType type;
rule_ptr rule;
};
} // namespace tree_sitter