Rename spec -> test
'Test' is a much more straightforward name.
parent 7d8daf573e
commit 6dc0ff359d
109 changed files with 44 additions and 44 deletions
89 test/compiler/build_tables/lex_conflict_manager_test.cc Normal file
@@ -0,0 +1,89 @@
#include "test_helper.h"
#include "helpers/rule_helpers.h"
#include "helpers/stream_methods.h"
#include "compiler/rules/built_in_symbols.h"
#include "compiler/parse_table.h"
#include "compiler/build_tables/lex_conflict_manager.h"
#include "compiler/build_tables/lex_item.h"

using namespace rules;
using namespace build_tables;

START_TEST

describe("LexConflictManager::resolve(new_action, old_action)", []() {
  LexConflictManager conflict_manager;
  bool update;
  Symbol sym1(0, Symbol::Terminal);
  Symbol sym2(1, Symbol::Terminal);
  Symbol sym3(2, Symbol::Terminal);
  Symbol sym4(3, Symbol::Terminal);
  LexItemSet item_set({ LexItem(sym4, blank()) });

  before_each([&]() {
    conflict_manager = LexConflictManager();
  });

  it("favors advance actions over empty accept token actions", [&]() {
    update = conflict_manager.resolve(item_set, AdvanceAction(2, {0, 0}, true), AcceptTokenAction());
    AssertThat(update, IsTrue());
  });

  describe("accept-token/accept-token conflicts", [&]() {
    describe("when the tokens' precedence values differ", [&]() {
      it("favors the token with higher precedence", [&]() {
        update = conflict_manager.resolve(AcceptTokenAction(sym2, 1, false), AcceptTokenAction(sym1, 2, false));
        AssertThat(update, IsFalse());

        update = conflict_manager.resolve(AcceptTokenAction(sym1, 2, false), AcceptTokenAction(sym2, 1, false));
        AssertThat(update, IsTrue());
      });

      it("adds the preferred token as a possible homonym for the discarded one", [&]() {
        conflict_manager.resolve(AcceptTokenAction(sym2, 1, false), AcceptTokenAction(sym1, 2, false));
        AssertThat(conflict_manager.possible_homonyms[sym2.index], Contains(sym1.index));
      });
    });

    describe("when one token is string-based and the other is regexp-based", [&]() {
      it("favors the string-based token", [&]() {
        update = conflict_manager.resolve(AcceptTokenAction(sym1, 0, false), AcceptTokenAction(sym2, 0, true));
        AssertThat(update, IsFalse());

        update = conflict_manager.resolve(AcceptTokenAction(sym2, 0, true), AcceptTokenAction(sym1, 0, false));
        AssertThat(update, IsTrue());
      });
    });

    describe("when the tokens have equal precedence", [&]() {
      it("favors the token listed earlier in the grammar", [&]() {
        update = conflict_manager.resolve(AcceptTokenAction(sym2, 0, false), AcceptTokenAction(sym1, 0, false));
        AssertThat(update, IsFalse());

        update = conflict_manager.resolve(AcceptTokenAction(sym1, 0, false), AcceptTokenAction(sym2, 0, false));
        AssertThat(update, IsTrue());
      });
    });
  });

  describe("advance/accept-token conflicts", [&]() {
    describe("when the token to accept has higher precedence", [&]() {
      it("prefers the accept-token action", [&]() {
        AssertThat(conflict_manager.possible_extensions, IsEmpty());
        update = conflict_manager.resolve(item_set, AdvanceAction(1, { 1, 2 }, true), AcceptTokenAction(sym3, 3, true));
        AssertThat(update, IsFalse());
        AssertThat(conflict_manager.possible_extensions, IsEmpty());
      });
    });

    describe("when the token to accept does not have a higher precedence", [&]() {
      it("favors the advance action and adds the in-progress tokens as possible extensions of the discarded token", [&]() {
        update = conflict_manager.resolve(item_set, AdvanceAction(1, { 1, 2 }, true), AcceptTokenAction(sym3, 2, true));
        AssertThat(update, IsTrue());
        AssertThat(conflict_manager.possible_extensions[sym3.index], Contains(sym4.index));
      });
    });
  });
});

END_TEST

514 test/compiler/build_tables/lex_item_test.cc Normal file
@@ -0,0 +1,514 @@
#include "test_helper.h"
#include "compiler/build_tables/lex_item.h"
#include "compiler/rules/metadata.h"
#include "compiler/rules.h"
#include "helpers/rule_helpers.h"
#include "helpers/stream_methods.h"

using namespace rules;
using namespace build_tables;
typedef LexItemSet::Transition Transition;

START_TEST

describe("LexItem", []() {
  describe("completion_status()", [&]() {
    it("indicates whether the item is done and its precedence", [&]() {
      LexItem item1(Symbol(0, Symbol::Terminal), character({ 'a', 'b', 'c' }));
      AssertThat(item1.completion_status().is_done, IsFalse());
      AssertThat(item1.completion_status().precedence, Equals(PrecedenceRange()));

      MetadataParams params;
      params.precedence = 3;
      params.has_precedence = true;
      params.is_string = 1;
      LexItem item2(Symbol(0, Symbol::Terminal), choice({
        metadata(blank(), params),
        character({ 'a', 'b', 'c' })
      }));

      AssertThat(item2.completion_status().is_done, IsTrue());
      AssertThat(item2.completion_status().precedence, Equals(PrecedenceRange(3)));

      LexItem item3(Symbol(0, Symbol::Terminal), repeat(character({ ' ', '\t' })));
      AssertThat(item3.completion_status().is_done, IsTrue());
      AssertThat(item3.completion_status().precedence, Equals(PrecedenceRange()));
    });
  });
});

describe("LexItemSet::transitions()", [&]() {
  it("handles single characters", [&]() {
    LexItemSet item_set({
      LexItem(Symbol(1, Symbol::NonTerminal), character({ 'x' })),
    });

    AssertThat(
      item_set.transitions(),
      Equals(LexItemSet::TransitionMap({
        {
          CharacterSet().include('x'),
          Transition{
            LexItemSet({
              LexItem(Symbol(1, Symbol::NonTerminal), blank()),
            }),
            PrecedenceRange(),
            false
          }
        }
      })));
  });

  it("marks transitions that are within the main token (as opposed to separators)", [&]() {
    MetadataParams params;
    params.is_main_token = true;

    LexItemSet item_set({
      LexItem(Symbol(1, Symbol::NonTerminal), metadata(character({ 'x' }), params)),
    });

    AssertThat(
      item_set.transitions(),
      Equals(LexItemSet::TransitionMap({
        {
          CharacterSet().include('x'),
          Transition{
            LexItemSet({
              LexItem(Symbol(1, Symbol::NonTerminal), metadata(blank(), params)),
            }),
            PrecedenceRange(),
            true
          }
        }
      })));
  });

  it("handles sequences", [&]() {
    LexItemSet item_set({
      LexItem(Symbol(1, Symbol::NonTerminal), seq({
        character({ 'w' }),
        character({ 'x' }),
        character({ 'y' }),
        character({ 'z' }),
      })),
    });

    AssertThat(
      item_set.transitions(),
      Equals(LexItemSet::TransitionMap({
        {
          CharacterSet().include('w'),
          Transition{
            LexItemSet({
              LexItem(Symbol(1, Symbol::NonTerminal), seq({
                character({ 'x' }),
                character({ 'y' }),
                character({ 'z' }),
              })),
            }),
            PrecedenceRange(),
            false
          }
        }
      })));
  });

  it("handles sequences with nested precedence", [&]() {
    LexItemSet item_set({
      LexItem(Symbol(1, Symbol::NonTerminal), seq({
        prec(3, seq({
          character({ 'v' }),
          prec(4, seq({
            character({ 'w' }),
            character({ 'x' }) })),
          character({ 'y' }) })),
        character({ 'z' }),
      })),
    });

    auto transitions = item_set.transitions();

    AssertThat(
      transitions,
      Equals(LexItemSet::TransitionMap({
        {
          CharacterSet().include('v'),
          Transition{
            // The outer precedence is now 'active', because we are within its
            // contained rule.
            LexItemSet({
              LexItem(Symbol(1, Symbol::NonTerminal), seq({
                active_prec(3, seq({
                  prec(4, seq({
                    character({ 'w' }),
                    character({ 'x' }) })),
                  character({ 'y' }) })),
                character({ 'z' }),
              })),
            }),

            // No precedence is applied upon entering a rule.
            PrecedenceRange(),
            false
          }
        }
      })));

    LexItemSet item_set2 = transitions[CharacterSet().include('v')].destination;
    transitions = item_set2.transitions();

    AssertThat(
      transitions,
      Equals(LexItemSet::TransitionMap({
        {
          CharacterSet().include('w'),
          Transition{
            // The inner precedence is now 'active'
            LexItemSet({
              LexItem(Symbol(1, Symbol::NonTerminal), seq({
                active_prec(3, seq({
                  active_prec(4, character({ 'x' })),
                  character({ 'y' }) })),
                character({ 'z' }),
              })),
            }),

            // The outer precedence is applied.
            PrecedenceRange(3),
            false
          }
        }
      })));

    LexItemSet item_set3 = transitions[CharacterSet().include('w')].destination;
    transitions = item_set3.transitions();

    AssertThat(
      transitions,
      Equals(LexItemSet::TransitionMap({
        {
          CharacterSet().include('x'),
          Transition{
            LexItemSet({
              LexItem(Symbol(1, Symbol::NonTerminal), seq({
                active_prec(3, character({ 'y' })),
                character({ 'z' }),
              })),
            }),

            // The inner precedence is applied.
            PrecedenceRange(4),
            false
          }
        }
      })));

    LexItemSet item_set4 = transitions[CharacterSet().include('x')].destination;
    transitions = item_set4.transitions();

    AssertThat(
      transitions,
      Equals(LexItemSet::TransitionMap({
        {
          CharacterSet().include('y'),
          Transition{
            LexItemSet({
              LexItem(Symbol(1, Symbol::NonTerminal), character({ 'z' })),
            }),
            PrecedenceRange(3),
            false
          }
        }
      })));
  });

  it("handles sequences where the left hand side can be blank", [&]() {
    LexItemSet item_set({
      LexItem(Symbol(1, Symbol::NonTerminal), seq({
        choice({
          character({ 'x' }),
          blank(),
        }),
        character({ 'y' }),
        character({ 'z' }),
      })),
    });

    AssertThat(
      item_set.transitions(),
      Equals(LexItemSet::TransitionMap({
        {
          CharacterSet().include('x'),
          Transition{
            LexItemSet({
              LexItem(Symbol(1, Symbol::NonTerminal), seq({
                character({ 'y' }),
                character({ 'z' }),
              })),
            }),
            PrecedenceRange(),
            false
          }
        },
        {
          CharacterSet().include('y'),
          Transition{
            LexItemSet({
              LexItem(Symbol(1, Symbol::NonTerminal), character({ 'z' })),
            }),
            PrecedenceRange(),
            false
          }
        }
      })));
  });

  it("handles blanks", [&]() {
    LexItemSet item_set({
      LexItem(Symbol(1, Symbol::NonTerminal), blank()),
    });

    AssertThat(item_set.transitions(), IsEmpty());
  });

  it("handles repeats", [&]() {
    LexItemSet item_set({
      LexItem(Symbol(1, Symbol::NonTerminal), repeat1(seq({
        character({ 'a' }),
        character({ 'b' }),
      }))),
      LexItem(Symbol(2, Symbol::NonTerminal), repeat1(character({ 'c' }))),
    });

    AssertThat(
      item_set.transitions(),
      Equals(LexItemSet::TransitionMap({
        {
          CharacterSet().include('a'),
          Transition{
            LexItemSet({
              LexItem(Symbol(1, Symbol::NonTerminal), seq({
                character({ 'b' }),
                repeat1(seq({
                  character({ 'a' }),
                  character({ 'b' }),
                }))
              })),
              LexItem(Symbol(1, Symbol::NonTerminal), character({ 'b' })),
            }),
            PrecedenceRange(),
            false
          }
        },
        {
          CharacterSet().include('c'),
          Transition{
            LexItemSet({
              LexItem(Symbol(2, Symbol::NonTerminal), repeat1(character({ 'c' }))),
              LexItem(Symbol(2, Symbol::NonTerminal), blank()),
            }),
            PrecedenceRange(),
            false
          }
        }
      })));
  });

  it("handles repeats with precedence", [&]() {
    LexItemSet item_set({
      LexItem(Symbol(1, Symbol::NonTerminal), active_prec(-1, repeat1(character({ 'a' }))))
    });

    AssertThat(
      item_set.transitions(),
      Equals(LexItemSet::TransitionMap({
        {
          CharacterSet().include('a'),
          Transition{
            LexItemSet({
              LexItem(Symbol(1, Symbol::NonTerminal), active_prec(-1, repeat1(character({ 'a' })))),
              LexItem(Symbol(1, Symbol::NonTerminal), active_prec(-1, blank())),
            }),
            PrecedenceRange(-1),
            false
          }
        }
      })));
  });

  it("handles choices between overlapping character sets", [&]() {
    LexItemSet item_set({
      LexItem(Symbol(1, Symbol::NonTerminal), choice({
        active_prec(2, seq({
          character({ 'a', 'b', 'c', 'd' }),
          character({ 'x' }),
        })),
        active_prec(3, seq({
          character({ 'c', 'd', 'e', 'f' }),
          character({ 'y' }),
        })),
      }))
    });

    AssertThat(
      item_set.transitions(),
      Equals(LexItemSet::TransitionMap({
        {
          CharacterSet().include('a', 'b'),
          Transition{
            LexItemSet({
              LexItem(Symbol(1, Symbol::NonTerminal), active_prec(2, character({ 'x' }))),
            }),
            PrecedenceRange(2),
            false
          }
        },
        {
          CharacterSet().include('c', 'd'),
          Transition{
            LexItemSet({
              LexItem(Symbol(1, Symbol::NonTerminal), active_prec(2, character({ 'x' }))),
              LexItem(Symbol(1, Symbol::NonTerminal), active_prec(3, character({ 'y' }))),
            }),
            PrecedenceRange(2, 3),
            false
          }
        },
        {
          CharacterSet().include('e', 'f'),
          Transition{
            LexItemSet({
              LexItem(Symbol(1, Symbol::NonTerminal), active_prec(3, character({ 'y' }))),
            }),
            PrecedenceRange(3),
            false
          }
        },
      })));
  });

  it("handles choices between a subset and a superset of characters", [&]() {
    LexItemSet item_set({
      LexItem(Symbol(1, Symbol::NonTerminal), choice({
        seq({
          character({ 'b', 'c', 'd' }),
          character({ 'x' }),
        }),
        seq({
          character({ 'a', 'b', 'c', 'd', 'e', 'f' }),
          character({ 'y' }),
        }),
      })),
    });

    AssertThat(
      item_set.transitions(),
      Equals(LexItemSet::TransitionMap({
        {
          CharacterSet().include('a').include('e', 'f'),
          Transition{
            LexItemSet({
              LexItem(Symbol(1, Symbol::NonTerminal), character({ 'y' })),
            }),
            PrecedenceRange(),
            false
          }
        },
        {
          CharacterSet().include('b', 'd'),
          Transition{
            LexItemSet({
              LexItem(Symbol(1, Symbol::NonTerminal), character({ 'x' })),
              LexItem(Symbol(1, Symbol::NonTerminal), character({ 'y' })),
            }),
            PrecedenceRange(),
            false
          }
        },
      })));
  });

  it("handles choices between whitelisted and blacklisted character sets", [&]() {
    LexItemSet item_set({
      LexItem(Symbol(1, Symbol::NonTerminal), seq({
        choice({
          character({ '/' }, false),
          seq({
            character({ '\\' }),
            character({ '/' }),
          }),
        }),
        character({ '/' }),
      }))
    });

    AssertThat(
      item_set.transitions(),
      Equals(LexItemSet::TransitionMap({
        {
          CharacterSet().include_all().exclude('/').exclude('\\'),
          Transition{
            LexItemSet({
              LexItem(Symbol(1, Symbol::NonTerminal), character({ '/' })),
            }),
            PrecedenceRange(),
            false
          }
        },
        {
          CharacterSet().include('\\'),
          Transition{
            LexItemSet({
              LexItem(Symbol(1, Symbol::NonTerminal), character({ '/' })),
              LexItem(Symbol(1, Symbol::NonTerminal), seq({ character({ '/' }), character({ '/' }) })),
            }),
            PrecedenceRange(),
            false
          }
        },
      })));
  });

  it("handles different items with overlapping character sets", [&]() {
    LexItemSet set1({
      LexItem(Symbol(1, Symbol::NonTerminal), character({ 'a', 'b', 'c', 'd', 'e', 'f' })),
      LexItem(Symbol(2, Symbol::NonTerminal), character({ 'e', 'f', 'g', 'h', 'i' }))
    });

    AssertThat(set1.transitions(), Equals(LexItemSet::TransitionMap({
      {
        CharacterSet().include('a', 'd'),
        Transition{
          LexItemSet({
            LexItem(Symbol(1, Symbol::NonTerminal), blank()),
          }),
          PrecedenceRange(),
          false
        }
      },
      {
        CharacterSet().include('e', 'f'),
        Transition{
          LexItemSet({
            LexItem(Symbol(1, Symbol::NonTerminal), blank()),
            LexItem(Symbol(2, Symbol::NonTerminal), blank()),
          }),
          PrecedenceRange(),
          false
        }
      },
      {
        CharacterSet().include('g', 'i'),
        Transition{
          LexItemSet({
            LexItem(Symbol(2, Symbol::NonTerminal), blank()),
          }),
          PrecedenceRange(),
          false
        }
      },
    })));
  });
});

END_TEST

134 test/compiler/build_tables/parse_item_set_builder_test.cc Normal file
@@ -0,0 +1,134 @@
#include "test_helper.h"
#include "compiler/syntax_grammar.h"
#include "compiler/lexical_grammar.h"
#include "compiler/build_tables/parse_item_set_builder.h"
#include "compiler/build_tables/lookahead_set.h"
#include "compiler/rules/built_in_symbols.h"
#include "helpers/rule_helpers.h"

using namespace build_tables;
using namespace rules;

START_TEST

describe("ParseItemSetBuilder", []() {
  vector<LexicalVariable> lexical_variables;
  for (size_t i = 0; i < 20; i++) {
    lexical_variables.push_back({
      "token_" + to_string(i),
      VariableTypeNamed,
      blank(),
      false
    });
  }

  LexicalGrammar lexical_grammar{lexical_variables, {}};

  it("adds items at the beginnings of referenced rules", [&]() {
    SyntaxGrammar grammar{{
      SyntaxVariable{"rule0", VariableTypeNamed, {
        Production({
          {Symbol(1, Symbol::NonTerminal), 0, AssociativityNone},
          {Symbol(11, Symbol::Terminal), 0, AssociativityNone},
        }),
      }},
      SyntaxVariable{"rule1", VariableTypeNamed, {
        Production({
          {Symbol(12, Symbol::Terminal), 0, AssociativityNone},
          {Symbol(13, Symbol::Terminal), 0, AssociativityNone},
        }),
        Production({
          {Symbol(2, Symbol::NonTerminal), 0, AssociativityNone},
        })
      }},
      SyntaxVariable{"rule2", VariableTypeNamed, {
        Production({
          {Symbol(14, Symbol::Terminal), 0, AssociativityNone},
          {Symbol(15, Symbol::Terminal), 0, AssociativityNone},
        })
      }},
    }, {}, {}, {}};

    auto production = [&](int variable_index, int production_index) -> const Production & {
      return grammar.variables[variable_index].productions[production_index];
    };

    ParseItemSet item_set({
      {
        ParseItem(Symbol(0, Symbol::NonTerminal), production(0, 0), 0),
        LookaheadSet({ Symbol(10, Symbol::Terminal) }),
      }
    });

    ParseItemSetBuilder item_set_builder(grammar, lexical_grammar);
    item_set_builder.apply_transitive_closure(&item_set);

    AssertThat(item_set, Equals(ParseItemSet({
      {
        ParseItem(Symbol(0, Symbol::NonTerminal), production(0, 0), 0),
        LookaheadSet({ Symbol(10, Symbol::Terminal) })
      },
      {
        ParseItem(Symbol(1, Symbol::NonTerminal), production(1, 0), 0),
        LookaheadSet({ Symbol(11, Symbol::Terminal) })
      },
      {
        ParseItem(Symbol(1, Symbol::NonTerminal), production(1, 1), 0),
        LookaheadSet({ Symbol(11, Symbol::Terminal) })
      },
      {
        ParseItem(Symbol(2, Symbol::NonTerminal), production(2, 0), 0),
        LookaheadSet({ Symbol(11, Symbol::Terminal) })
      },
    })));
  });

  it("handles rules with empty productions", [&]() {
    SyntaxGrammar grammar{{
      SyntaxVariable{"rule0", VariableTypeNamed, {
        Production({
          {Symbol(1, Symbol::NonTerminal), 0, AssociativityNone},
          {Symbol(11, Symbol::Terminal), 0, AssociativityNone},
        }),
      }},
      SyntaxVariable{"rule1", VariableTypeNamed, {
        Production({
          {Symbol(12, Symbol::Terminal), 0, AssociativityNone},
          {Symbol(13, Symbol::Terminal), 0, AssociativityNone},
        }),
        Production({})
      }},
    }, {}, {}, {}};

    auto production = [&](int variable_index, int production_index) -> const Production & {
      return grammar.variables[variable_index].productions[production_index];
    };

    ParseItemSet item_set({
      {
        ParseItem(Symbol(0, Symbol::NonTerminal), production(0, 0), 0),
        LookaheadSet({ Symbol(10, Symbol::Terminal) }),
      }
    });

    ParseItemSetBuilder item_set_builder(grammar, lexical_grammar);
    item_set_builder.apply_transitive_closure(&item_set);

    AssertThat(item_set, Equals(ParseItemSet({
      {
        ParseItem(Symbol(0, Symbol::NonTerminal), production(0, 0), 0),
        LookaheadSet({ Symbol(10, Symbol::Terminal) })
      },
      {
        ParseItem(Symbol(1, Symbol::NonTerminal), production(1, 0), 0),
        LookaheadSet({ Symbol(11, Symbol::Terminal) })
      },
      {
        ParseItem(Symbol(1, Symbol::NonTerminal), production(1, 1), 0),
        LookaheadSet({ Symbol(11, Symbol::Terminal) })
      },
    })));
  });
});

END_TEST

60 test/compiler/build_tables/rule_can_be_blank_test.cc Normal file
@@ -0,0 +1,60 @@
#include "test_helper.h"
#include "compiler/build_tables/rule_can_be_blank.h"
#include "compiler/rules/metadata.h"
#include "compiler/rules.h"
#include "helpers/rule_helpers.h"

using namespace rules;
using build_tables::rule_can_be_blank;

START_TEST

describe("rule_can_be_blank", [&]() {
  rule_ptr rule;

  it("returns false for basic rules", [&]() {
    AssertThat(rule_can_be_blank(i_sym(3)), IsFalse());
    AssertThat(rule_can_be_blank(str("x")), IsFalse());
    AssertThat(rule_can_be_blank(pattern("x")), IsFalse());
  });

  it("returns true for blanks", [&]() {
    AssertThat(rule_can_be_blank(blank()), IsTrue());
  });

  it("returns true for repeats", [&]() {
    AssertThat(rule_can_be_blank(repeat(str("x"))), IsTrue());
  });

  it("returns true for choices iff one or more sides can be blank", [&]() {
    rule = choice({ sym("x"), blank() });
    AssertThat(rule_can_be_blank(rule), IsTrue());

    rule = choice({ blank(), sym("x") });
    AssertThat(rule_can_be_blank(rule), IsTrue());

    rule = choice({ sym("x"), sym("y") });
    AssertThat(rule_can_be_blank(rule), IsFalse());
  });

  it("returns true for sequences iff both sides can be blank", [&]() {
    rule = seq({ blank(), str("x") });
    AssertThat(rule_can_be_blank(rule), IsFalse());

    rule = seq({ str("x"), blank() });
    AssertThat(rule_can_be_blank(rule), IsFalse());

    rule = seq({ blank(), choice({ sym("x"), blank() }) });
    AssertThat(rule_can_be_blank(rule), IsTrue());
  });

  it("ignores metadata rules", [&]() {
    rule = make_shared<rules::Metadata>(blank(), MetadataParams());
    AssertThat(rule_can_be_blank(rule), IsTrue());

    rule = make_shared<rules::Metadata>(sym("one"), MetadataParams());
    AssertThat(rule_can_be_blank(rule), IsFalse());
  });
});

END_TEST

171 test/compiler/prepare_grammar/expand_repeats_test.cc Normal file
@@ -0,0 +1,171 @@
#include "test_helper.h"
#include "compiler/prepare_grammar/initial_syntax_grammar.h"
#include "compiler/prepare_grammar/expand_repeats.h"
#include "helpers/rule_helpers.h"
#include "helpers/stream_methods.h"

START_TEST

using namespace rules;
using prepare_grammar::InitialSyntaxGrammar;
using prepare_grammar::expand_repeats;

describe("expand_repeats", []() {
  it("replaces repeat rules with pairs of recursive rules", [&]() {
    InitialSyntaxGrammar grammar{
      {
        Variable{"rule0", VariableTypeNamed, repeat1(i_token(0))},
      },
      {}, {}, {}
    };

    auto result = expand_repeats(grammar);

    AssertThat(result.variables, Equals(vector<Variable>{
      Variable{"rule0", VariableTypeNamed, i_sym(1)},
      Variable{"rule0_repeat1", VariableTypeAuxiliary, choice({
        seq({ i_sym(1), i_token(0) }),
        i_token(0),
      })},
    }));
  });

  it("replaces repeats inside of sequences", [&]() {
    InitialSyntaxGrammar grammar{
      {
        Variable{"rule0", VariableTypeNamed, seq({
          i_token(10),
          repeat1(i_token(11)),
        })},
      },
      {}, {}, {}
    };

    auto result = expand_repeats(grammar);

    AssertThat(result.variables, Equals(vector<Variable>{
      Variable{"rule0", VariableTypeNamed, seq({
        i_token(10),
        i_sym(1),
      })},
      Variable{"rule0_repeat1", VariableTypeAuxiliary, choice({
        seq({ i_sym(1), i_token(11) }),
        i_token(11)
      })},
    }));
  });

  it("replaces repeats inside of choices", [&]() {
    InitialSyntaxGrammar grammar{
      {
        Variable{"rule0", VariableTypeNamed, choice({
          i_token(10),
          repeat1(i_token(11))
        })},
      },
      {}, {}, {}
    };

    auto result = expand_repeats(grammar);

    AssertThat(result.variables, Equals(vector<Variable>{
      Variable{"rule0", VariableTypeNamed, choice({
        i_token(10),
        i_sym(1),
      })},
      Variable{"rule0_repeat1", VariableTypeAuxiliary, choice({
        seq({ i_sym(1), i_token(11) }),
        i_token(11),
      })},
    }));
  });

  it("does not create redundant auxiliary rules", [&]() {
    InitialSyntaxGrammar grammar{
      {
        Variable{"rule0", VariableTypeNamed, choice({
          seq({ i_token(1), repeat1(i_token(4)) }),
          seq({ i_token(2), repeat1(i_token(4)) }),
        })},
        Variable{"rule1", VariableTypeNamed, seq({
          i_token(3),
          repeat1(i_token(4))
        })},
      },
      {}, {}, {}
    };

    auto result = expand_repeats(grammar);

    AssertThat(result.variables, Equals(vector<Variable>{
      Variable{"rule0", VariableTypeNamed, choice({
        seq({ i_token(1), i_sym(2) }),
        seq({ i_token(2), i_sym(2) }),
      })},
      Variable{"rule1", VariableTypeNamed, seq({
        i_token(3),
        i_sym(2),
      })},
      Variable{"rule0_repeat1", VariableTypeAuxiliary, choice({
        seq({ i_sym(2), i_token(4) }),
        i_token(4),
      })},
    }));
  });

  it("can replace multiple repeats in the same rule", [&]() {
    InitialSyntaxGrammar grammar{
      {
        Variable{"rule0", VariableTypeNamed, seq({
          repeat1(i_token(10)),
          repeat1(i_token(11)),
        })},
      },
      {}, {}, {}
    };

    auto result = expand_repeats(grammar);

    AssertThat(result.variables, Equals(vector<Variable>{
      Variable{"rule0", VariableTypeNamed, seq({
        i_sym(1),
        i_sym(2),
      })},
      Variable{"rule0_repeat1", VariableTypeAuxiliary, choice({
        seq({ i_sym(1), i_token(10) }),
        i_token(10),
      })},
      Variable{"rule0_repeat2", VariableTypeAuxiliary, choice({
        seq({ i_sym(2), i_token(11) }),
        i_token(11),
      })},
    }));
  });

  it("can replace repeats in multiple rules", [&]() {
    InitialSyntaxGrammar grammar{
      {
        Variable{"rule0", VariableTypeNamed, repeat1(i_token(10))},
        Variable{"rule1", VariableTypeNamed, repeat1(i_token(11))},
      },
      {}, {}, {}
    };

    auto result = expand_repeats(grammar);

    AssertThat(result.variables, Equals(vector<Variable>{
      Variable{"rule0", VariableTypeNamed, i_sym(2)},
      Variable{"rule1", VariableTypeNamed, i_sym(3)},
      Variable{"rule0_repeat1", VariableTypeAuxiliary, choice({
        seq({ i_sym(2), i_token(10) }),
        i_token(10),
      })},
      Variable{"rule1_repeat1", VariableTypeAuxiliary, choice({
        seq({ i_sym(3), i_token(11) }),
        i_token(11),
      })},
    }));
  });
});

END_TEST

169 test/compiler/prepare_grammar/expand_tokens_test.cc Normal file
@@ -0,0 +1,169 @@
#include "test_helper.h"
#include "compiler/lexical_grammar.h"
#include "compiler/prepare_grammar/expand_tokens.h"
#include "helpers/rule_helpers.h"

START_TEST

using namespace rules;
using prepare_grammar::expand_tokens;

describe("expand_tokens", []() {
  MetadataParams string_token_params;
  string_token_params.is_string = true;
  string_token_params.is_token = true;

  describe("string rules", [&]() {
    it("replaces strings with sequences of character sets", [&]() {
      LexicalGrammar grammar{
        {
          LexicalVariable{
            "rule_A",
            VariableTypeNamed,
            seq({
              i_sym(10),
              str("xyz"),
              i_sym(11),
            }),
            false
          }
        },
        {}
      };

      auto result = expand_tokens(grammar);

      AssertThat(result.second, Equals(CompileError::none()));
      AssertThat(result.first.variables, Equals(vector<LexicalVariable>{
        LexicalVariable{
          "rule_A",
          VariableTypeNamed,
          seq({
            i_sym(10),
            metadata(seq({
              character({ 'x' }),
              character({ 'y' }),
              character({ 'z' }),
            }), string_token_params),
            i_sym(11),
          }),
          false
        }
      }));
    });

    it("handles strings containing non-ASCII UTF8 characters", [&]() {
      LexicalGrammar grammar{
        {
          LexicalVariable{
            "rule_A",
            VariableTypeNamed,
            str("\u03B1 \u03B2"),
            false
          },
        },
        {}
      };

      auto result = expand_tokens(grammar);

      AssertThat(result.first.variables, Equals(vector<LexicalVariable>{
        LexicalVariable{
          "rule_A",
          VariableTypeNamed,
          metadata(seq({
            character({ 945 }),
            character({ ' ' }),
            character({ 946 }),
          }), string_token_params),
          false
        }
      }));
    });
  });

  describe("regexp rules", [&]() {
    it("replaces regexps with the equivalent rule tree", [&]() {
      LexicalGrammar grammar{
        {
          LexicalVariable{
            "rule_A",
            VariableTypeNamed,
            seq({
              i_sym(10),
              pattern("x*"),
              i_sym(11),
            }),
            false
          }
        },
        {}
      };

      auto result = expand_tokens(grammar);

      AssertThat(result.second, Equals(CompileError::none()));
      AssertThat(result.first.variables, Equals(vector<LexicalVariable>{
        LexicalVariable{
          "rule_A",
          VariableTypeNamed,
          seq({
            i_sym(10),
            repeat(character({ 'x' })),
            i_sym(11),
          }),
          false
        }
      }));
    });

    it("handles regexps containing non-ASCII UTF8 characters", [&]() {
      LexicalGrammar grammar{
        {
          LexicalVariable{
            "rule_A",
            VariableTypeNamed,
            pattern("[^\u03B1-\u03B4]*"),
            false
          }
        },
        {}
      };

      auto result = expand_tokens(grammar);

      AssertThat(result.first.variables, Equals(vector<LexicalVariable>{
        LexicalVariable{
          "rule_A",
          VariableTypeNamed,
          repeat(character({ 945, 946, 947, 948 }, false)),
          false
        }
      }));
    });

    it("returns an error when the grammar contains an invalid regex", [&]() {
      LexicalGrammar grammar{
        {
          LexicalVariable{
            "rule_A",
            VariableTypeNamed,
            seq({
              pattern("("),
              str("xyz"),
              pattern("["),
            }),
            false
          },
        },
        {}
      };

      auto result = expand_tokens(grammar);

      AssertThat(result.second, Equals(CompileError(TSCompileErrorTypeInvalidRegex, "unmatched open paren")));
    });
  });
});

END_TEST

106 test/compiler/prepare_grammar/extract_choices_test.cc Normal file
@@ -0,0 +1,106 @@
#include "test_helper.h"
#include "compiler/prepare_grammar/extract_choices.h"
#include "helpers/rule_helpers.h"

START_TEST

using namespace rules;
using prepare_grammar::extract_choices;

class rule_vector : public vector<rule_ptr> {
 public:
  bool operator==(const vector<rule_ptr> &other) const {
    if (this->size() != other.size()) return false;
    for (size_t i = 0; i < this->size(); i++) {
      auto rule = this->operator[](i);
      auto other_rule = other[i];
      if (!rule->operator==(*other_rule))
        return false;
    }
    return true;
  }

  rule_vector(const initializer_list<rule_ptr> &list) :
    vector<rule_ptr>(list) {}
};

describe("extract_choices", []() {
  it("expands rules containing choices into multiple rules", [&]() {
    auto rule = seq({
      sym("a"),
      choice({ sym("b"), sym("c"), sym("d") }),
      sym("e")
    });

    AssertThat(extract_choices(rule), Equals(rule_vector({
      seq({ sym("a"), sym("b"), sym("e") }),
      seq({ sym("a"), sym("c"), sym("e") }),
      seq({ sym("a"), sym("d"), sym("e") }),
    })));
  });

  it("handles metadata rules", [&]() {
    auto rule = prec(5, choice({ sym("b"), sym("c"), sym("d") }));

    AssertThat(extract_choices(rule), Equals(rule_vector({
      prec(5, sym("b")),
      prec(5, sym("c")),
      prec(5, sym("d")),
    })));
  });

  it("handles nested choices", [&]() {
    auto rule = choice({
      seq({ choice({ sym("a"), sym("b") }), sym("c") }),
      sym("d")
    });

    AssertThat(extract_choices(rule), Equals(rule_vector({
      seq({ sym("a"), sym("c") }),
      seq({ sym("b"), sym("c") }),
      sym("d"),
    })));
  });

  it("handles blank rules", [&]() {
    AssertThat(extract_choices(blank()), Equals(rule_vector({
      blank(),
    })));
  });

  it("does not move choices outside of repeats", [&]() {
    auto rule = seq({
      choice({ sym("a"), sym("b") }),
      repeat1(seq({
        sym("c"),
        choice({
          sym("d"),
          sym("e"),
        }),
        sym("f"),
      })),
      sym("g"),
    });

    AssertThat(extract_choices(rule), Equals(rule_vector({
      seq({
        sym("a"),
        repeat1(choice({
          seq({ sym("c"), sym("d"), sym("f") }),
          seq({ sym("c"), sym("e"), sym("f") }),
        })),
        sym("g"),
      }),
      seq({
        sym("b"),
        repeat1(choice({
          seq({ sym("c"), sym("d"), sym("f") }),
          seq({ sym("c"), sym("e"), sym("f") }),
        })),
        sym("g"),
      }),
    })));
  });
});

END_TEST

276 test/compiler/prepare_grammar/extract_tokens_test.cc Normal file
@@ -0,0 +1,276 @@
#include "test_helper.h"
#include "compiler/lexical_grammar.h"
#include "compiler/prepare_grammar/interned_grammar.h"
#include "compiler/prepare_grammar/initial_syntax_grammar.h"
#include "compiler/prepare_grammar/extract_tokens.h"
#include "helpers/rule_helpers.h"
#include "helpers/equals_pointer.h"
#include "helpers/stream_methods.h"

START_TEST

using namespace rules;
using prepare_grammar::extract_tokens;
using prepare_grammar::InternedGrammar;
using prepare_grammar::InitialSyntaxGrammar;

describe("extract_tokens", []() {
  it("moves strings, patterns, and sub-rules marked as tokens into the lexical grammar", [&]() {
    auto result = extract_tokens(InternedGrammar{
      {
        Variable{"rule_A", VariableTypeNamed, repeat1(seq({
          str("ab"),
          pattern("cd*"),
          choice({
            i_sym(1),
            i_sym(2),
            token(repeat1(choice({ str("ef"), str("gh") }))),
          }),
        }))},
        Variable{"rule_B", VariableTypeNamed, pattern("ij+")},
        Variable{"rule_C", VariableTypeNamed, choice({ str("kl"), blank() })},
        Variable{"rule_D", VariableTypeNamed, repeat1(i_sym(3))},
      },
      {},
      {},
      {}
    });

    InitialSyntaxGrammar &syntax_grammar = get<0>(result);
    LexicalGrammar &lexical_grammar = get<1>(result);
    CompileError error = get<2>(result);

    AssertThat(error, Equals(CompileError::none()));

    AssertThat(syntax_grammar.variables, Equals(vector<Variable>{
      Variable{"rule_A", VariableTypeNamed, repeat1(seq({

        // This string is now the first token in the lexical grammar.
        i_token(0),

        // This pattern is now the second rule in the lexical grammar.
        i_token(1),

        choice({
          // Rule 1, which this symbol pointed to, has been moved to the
          // lexical grammar.
          i_token(3),

          // This symbol's index has been decremented, because a previous rule
          // was moved to the lexical grammar.
          i_sym(1),

          // This token rule is now the third rule in the lexical grammar.
          i_token(2),
        }),
      }))},

      Variable{"rule_C", VariableTypeNamed, choice({ i_token(4), blank() })},
      Variable{"rule_D", VariableTypeNamed, repeat1(i_sym(2))},
    }));

    AssertThat(lexical_grammar.variables, Equals(vector<LexicalVariable>({
      // Strings become anonymous rules.
      LexicalVariable{"ab", VariableTypeAnonymous, str("ab"), true},

      // Patterns become hidden rules.
      LexicalVariable{"/cd*/", VariableTypeAuxiliary, pattern("cd*"), false},

      // Rules marked as tokens become hidden rules.
      LexicalVariable{"/(ef|gh)*/", VariableTypeAuxiliary, repeat1(choice({
        str("ef"),
        str("gh")
      })), false},

      // This named rule was moved wholesale to the lexical grammar.
      LexicalVariable{"rule_B", VariableTypeNamed, pattern("ij+"), false},

      // Strings become anonymous rules.
      LexicalVariable{"kl", VariableTypeAnonymous, str("kl"), true},
    })));
  });

  it("does not create duplicate tokens in the lexical grammar", [&]() {
    auto result = extract_tokens(InternedGrammar{
      {
        Variable{"rule_A", VariableTypeNamed, seq({
          str("ab"),
          i_sym(0),
          str("ab"),
        })},
      },
      {},
      {},
      {}
    });

    InitialSyntaxGrammar &syntax_grammar = get<0>(result);
    LexicalGrammar &lexical_grammar = get<1>(result);

    AssertThat(syntax_grammar.variables, Equals(vector<Variable> {
      Variable {"rule_A", VariableTypeNamed, seq({ i_token(0), i_sym(0), i_token(0) })},
    }));

    AssertThat(lexical_grammar.variables, Equals(vector<LexicalVariable> {
      LexicalVariable {"ab", VariableTypeAnonymous, str("ab"), true},
    }));
  });

  it("does not move entire rules into the lexical grammar if their content is used elsewhere in the grammar", [&]() {
    auto result = extract_tokens(InternedGrammar{{
      Variable{"rule_A", VariableTypeNamed, seq({ i_sym(1), str("ab") })},
      Variable{"rule_B", VariableTypeNamed, str("cd")},
      Variable{"rule_C", VariableTypeNamed, seq({ str("ef"), str("cd") })},
    }, {}, {}, {}});

    InitialSyntaxGrammar &syntax_grammar = get<0>(result);
    LexicalGrammar &lexical_grammar = get<1>(result);

    AssertThat(syntax_grammar.variables, Equals(vector<Variable>({
      Variable{"rule_A", VariableTypeNamed, seq({ i_sym(1), i_token(0) })},
      Variable{"rule_B", VariableTypeNamed, i_token(1)},
      Variable{"rule_C", VariableTypeNamed, seq({ i_token(2), i_token(1) })},
    })));

    AssertThat(lexical_grammar.variables, Equals(vector<LexicalVariable> {
      LexicalVariable {"ab", VariableTypeAnonymous, str("ab"), true},
      LexicalVariable {"cd", VariableTypeAnonymous, str("cd"), true},
      LexicalVariable {"ef", VariableTypeAnonymous, str("ef"), true},
    }));
  });

  it("renumbers the grammar's expected conflict symbols based on any moved rules", [&]() {
    auto result = extract_tokens(InternedGrammar{
      {
        Variable{"rule_A", VariableTypeNamed, str("ok")},
        Variable{"rule_B", VariableTypeNamed, repeat(i_sym(0))},
        Variable{"rule_C", VariableTypeNamed, repeat(seq({ i_sym(0), i_sym(0) }))},
      },
      {
        str(" ")
      },
      {
        { Symbol(1, Symbol::NonTerminal), Symbol(2, Symbol::NonTerminal) }
      },
      {}
    });

    InitialSyntaxGrammar &syntax_grammar = get<0>(result);

    AssertThat(syntax_grammar.variables.size(), Equals<size_t>(2));
    AssertThat(syntax_grammar.expected_conflicts, Equals(set<set<Symbol>>({
      { Symbol(0, Symbol::NonTerminal), Symbol(1, Symbol::NonTerminal) },
    })));
  });

  describe("handling extra tokens", [&]() {
    it("adds inline extra tokens to the lexical grammar's separators", [&]() {
      auto result = extract_tokens(InternedGrammar{
        {
          Variable{"rule_A", VariableTypeNamed, str("x")},
        },
        {
          str("y"),
          pattern("\\s+"),
        },
        {},
        {}
      });

      AssertThat(get<2>(result), Equals(CompileError::none()));

      AssertThat(get<1>(result).separators.size(), Equals<size_t>(2));
      AssertThat(get<1>(result).separators[0], EqualsPointer(str("y")));
      AssertThat(get<1>(result).separators[1], EqualsPointer(pattern("\\s+")));

      AssertThat(get<0>(result).extra_tokens, IsEmpty());
    });

    it("handles inline extra tokens that match tokens in the grammar", [&]() {
      auto result = extract_tokens(InternedGrammar{
        {
          Variable{"rule_A", VariableTypeNamed, str("x")},
          Variable{"rule_B", VariableTypeNamed, str("y")},
        },
        {
          str("y"),
        },
        {},
        {}
      });

      AssertThat(get<2>(result), Equals(CompileError::none()));
      AssertThat(get<1>(result).separators.size(), Equals<size_t>(0));
      AssertThat(get<0>(result).extra_tokens, Equals(set<Symbol>({ Symbol(1, Symbol::Terminal) })));
    });

    it("updates extra symbols according to the new symbol numbers", [&]() {
      auto result = extract_tokens(InternedGrammar{
        {
          Variable{"rule_A", VariableTypeNamed, seq({ str("w"), str("x"), i_sym(1) })},
          Variable{"rule_B", VariableTypeNamed, str("y")},
          Variable{"rule_C", VariableTypeNamed, str("z")},
        },
        {
          i_sym(2),
        },
        {},
        {}
      });

      AssertThat(get<2>(result), Equals(CompileError::none()));

      AssertThat(get<0>(result).extra_tokens, Equals(set<Symbol>({
        { Symbol(3, Symbol::Terminal) },
      })));

      AssertThat(get<1>(result).separators, IsEmpty());
    });

    it("returns an error if any extra tokens are non-token symbols", [&]() {
      auto result = extract_tokens(InternedGrammar{{
        Variable{"rule_A", VariableTypeNamed, seq({ str("x"), i_sym(1) })},
        Variable{"rule_B", VariableTypeNamed, seq({ str("y"), str("z") })},
      }, { i_sym(1) }, {}, {}});

      AssertThat(get<2>(result), !Equals(CompileError::none()));
      AssertThat(get<2>(result), Equals(
        CompileError(TSCompileErrorTypeInvalidExtraToken,
          "Not a token: rule_B")));
    });

    it("returns an error if any extra tokens are non-token rules", [&]() {
      auto result = extract_tokens(InternedGrammar{{
        Variable{"rule_A", VariableTypeNamed, str("x")},
        Variable{"rule_B", VariableTypeNamed, str("y")},
      }, { choice({ i_sym(1), blank() }) }, {}, {}});

      AssertThat(get<2>(result), !Equals(CompileError::none()));
      AssertThat(get<2>(result), Equals(CompileError(
        TSCompileErrorTypeInvalidExtraToken,
        "Not a token: (choice (non-terminal 1) (blank))"
      )));
    });
  });

  it("returns an error if an external token has the same name as a non-terminal rule", [&]() {
    auto result = extract_tokens(InternedGrammar{
      {
        Variable{"rule_A", VariableTypeNamed, seq({ str("x"), i_sym(1) })},
        Variable{"rule_B", VariableTypeNamed, seq({ str("y"), str("z") })},
      },
      {},
      {},
      {
        ExternalToken {"rule_A", VariableTypeNamed, Symbol(0, Symbol::NonTerminal)}
      }
    });

    AssertThat(get<2>(result), Equals(CompileError(
      TSCompileErrorTypeInvalidExternalToken,
      "Name 'rule_A' cannot be used for both an external token and a non-terminal rule"
    )));
  });
});

END_TEST

89 test/compiler/prepare_grammar/flatten_grammar_test.cc Normal file
@@ -0,0 +1,89 @@
#include "test_helper.h"
#include "compiler/prepare_grammar/flatten_grammar.h"
#include "compiler/prepare_grammar/initial_syntax_grammar.h"
#include "compiler/syntax_grammar.h"
#include "helpers/rule_helpers.h"
#include "helpers/stream_methods.h"

START_TEST

using namespace rules;
using prepare_grammar::flatten_rule;

describe("flatten_grammar", []() {
  it("associates each symbol with the precedence and associativity binding it to its successor", [&]() {
    SyntaxVariable result = flatten_rule(Variable{
      "test",
      VariableTypeNamed,
      seq({
        i_sym(1),
        prec_left(101, seq({
          i_sym(2),
          choice({
            prec_right(102, seq({
              i_sym(3),
              i_sym(4)
            })),
            i_sym(5),
          }),
          i_sym(6),
        })),
        i_sym(7),
      })
    });

    AssertThat(result.name, Equals("test"));
    AssertThat(result.type, Equals(VariableTypeNamed));
    AssertThat(result.productions, Equals(vector<Production>({
      Production({
        {Symbol(1, Symbol::NonTerminal), 0, AssociativityNone},
        {Symbol(2, Symbol::NonTerminal), 101, AssociativityLeft},
        {Symbol(3, Symbol::NonTerminal), 102, AssociativityRight},
        {Symbol(4, Symbol::NonTerminal), 101, AssociativityLeft},
        {Symbol(6, Symbol::NonTerminal), 0, AssociativityNone},
        {Symbol(7, Symbol::NonTerminal), 0, AssociativityNone},
      }),
      Production({
        {Symbol(1, Symbol::NonTerminal), 0, AssociativityNone},
        {Symbol(2, Symbol::NonTerminal), 101, AssociativityLeft},
        {Symbol(5, Symbol::NonTerminal), 101, AssociativityLeft},
        {Symbol(6, Symbol::NonTerminal), 0, AssociativityNone},
        {Symbol(7, Symbol::NonTerminal), 0, AssociativityNone},
      })
    })));
  });

  it("uses the last assigned precedence", [&]() {
    SyntaxVariable result = flatten_rule(Variable{
      "test1",
      VariableTypeNamed,
      prec_left(101, seq({
        i_sym(1),
        i_sym(2),
      }))
    });

    AssertThat(result.productions, Equals(vector<Production>({
      Production({
        {Symbol(1, Symbol::NonTerminal), 101, AssociativityLeft},
        {Symbol(2, Symbol::NonTerminal), 101, AssociativityLeft},
      })
    })));

    result = flatten_rule(Variable{
      "test2",
      VariableTypeNamed,
      prec_left(101, seq({
        i_sym(1),
      }))
    });

    AssertThat(result.productions, Equals(vector<Production>({
      Production({
        {Symbol(1, Symbol::NonTerminal), 101, AssociativityLeft},
      })
    })));
  });
});

END_TEST

103 test/compiler/prepare_grammar/intern_symbols_test.cc Normal file
@@ -0,0 +1,103 @@
#include "test_helper.h"
#include "compiler/prepare_grammar/intern_symbols.h"
#include "compiler/grammar.h"
#include "compiler/rules/named_symbol.h"
#include "compiler/rules/symbol.h"
#include "compiler/rules/built_in_symbols.h"
#include "helpers/equals_pointer.h"
#include "helpers/rule_helpers.h"
#include "helpers/stream_methods.h"

START_TEST

using namespace rules;
using prepare_grammar::intern_symbols;

describe("intern_symbols", []() {
  it("replaces named symbols with numerically-indexed symbols", [&]() {
    Grammar grammar{
      {
        {"x", choice({ sym("y"), sym("_z") })},
        {"y", sym("_z")},
        {"_z", str("stuff")}
      }, {}, {}, {}
    };

    auto result = intern_symbols(grammar);

    AssertThat(result.second, Equals(CompileError::none()));
    AssertThat(result.first.variables, Equals(vector<Variable>{
      Variable{"x", VariableTypeNamed, choice({ i_sym(1), i_sym(2) })},
      Variable{"y", VariableTypeNamed, i_sym(2)},
      Variable{"_z", VariableTypeHidden, str("stuff")},
    }));
  });

  describe("when there are symbols that reference undefined rules", [&]() {
    it("returns an error", []() {
      Grammar grammar{
        {
          {"x", sym("y")},
        },
        {}, {}, {}
      };

      auto result = intern_symbols(grammar);

      AssertThat(result.second.message, Equals("Undefined rule 'y'"));
    });
  });

  it("translates the grammar's optional 'extra_tokens' to numerical symbols", [&]() {
    Grammar grammar{
      {
        {"x", choice({ sym("y"), sym("z") })},
        {"y", sym("z")},
        {"z", str("stuff")}
      },
      {
        sym("z")
      },
      {}, {}
    };

    auto result = intern_symbols(grammar);

    AssertThat(result.second, Equals(CompileError::none()));
    AssertThat(result.first.extra_tokens.size(), Equals<size_t>(1));
    AssertThat(*result.first.extra_tokens.begin(), EqualsPointer(i_sym(2)));
  });

  it("records any rule names that match external token names", [&]() {
    Grammar grammar{
      {
        {"x", choice({ sym("y"), sym("z") })},
        {"y", sym("z")},
        {"z", str("stuff")},
      },
      {},
      {},
      {
        "w",
        "z"
      }
    };

    auto result = intern_symbols(grammar);

    AssertThat(result.first.external_tokens, Equals(vector<ExternalToken>{
      ExternalToken{
        "w",
        VariableTypeNamed,
        rules::NONE()
      },
      ExternalToken{
        "z",
        VariableTypeNamed,
        Symbol(2, Symbol::NonTerminal)
      },
    }));
  });
});

END_TEST

245 test/compiler/prepare_grammar/parse_regex_test.cc Normal file
@@ -0,0 +1,245 @@
#include "test_helper.h"
#include "compiler/prepare_grammar/parse_regex.h"
#include "helpers/equals_pointer.h"
#include "helpers/rule_helpers.h"

START_TEST

using namespace rules;
using prepare_grammar::parse_regex;

describe("parse_regex", []() {
  struct ValidInputRow {
    string description;
    string pattern;
    rule_ptr rule;
  };

  vector<ValidInputRow> valid_inputs = {
    {
      "character sets",
      "[aAeE]",
      character({ 'a', 'A', 'e', 'E' })
    },

    {
      "'.' characters as wildcards",
      ".",
      character({ '\n' }, false)
    },

    {
      "character classes",
      "\\w-\\d-\\s-\\W-\\D-\\S",
      seq({
        character({
          'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
          'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
          'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
          'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
          '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '_' }),
        character({ '-' }),
        character({ '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' }),
        character({ '-' }),
        character({ ' ', '\t', '\r', '\n' }),
        character({ '-' }),
        character({
          'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
          'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
          'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
          'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
          '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '_' }, false),
        character({ '-' }),
        character({ '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' }, false),
        character({ '-' }),
        character({ ' ', '\t', '\r', '\n' }, false),
      })
    },

    {
      "choices",
      "ab|cd|ef",
      choice({
        seq({
          character({ 'a' }),
          character({ 'b' }) }),
        seq({
          character({ 'c' }),
          character({ 'd' }) }),
        seq({
          character({ 'e' }),
          character({ 'f' }) }) })
    },

    {
      "simple sequences",
      "abc",
      seq({
        character({ 'a' }),
        character({ 'b' }),
        character({ 'c' }) })
    },

    {
      "character ranges",
      "[12a-dA-D3]",
      character({
        '1', '2', '3',
        'a', 'b', 'c', 'd',
        'A', 'B', 'C', 'D' })
    },

    {
      "negated characters",
      "[^a\\d]",
      character({ 'a', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' }, false)
    },

    {
      "backslashes",
      "\\\\",
      character({ '\\' })
    },

    {
      "character groups in sequences",
      "x([^x]|\\\\x)*x",
      seq({
        character({ 'x' }),
        repeat(choice({
          character({ 'x' }, false),
          seq({ character({ '\\' }), character({ 'x' }) }) })),
        character({ 'x' }) })
    },

    {
      "choices in sequences",
      "(a|b)cd",
      seq({
        choice({
          character({ 'a' }),
          character({ 'b' }) }),
        character({ 'c' }),
        character({ 'd' }) })
    },

    {
      "escaped parentheses",
      "a\\(b",
      seq({
        character({ 'a' }),
        character({ '(' }),
        character({ 'b' }) })
    },

    {
      "escaped periods",
      "a\\.",
      seq({
        character({ 'a' }),
        character({ '.' }) })
    },

    {
      "escaped characters",
      "\\t\\n\\r",
      seq({
        character({ '\t' }),
        character({ '\n' }),
        character({ '\r' }) })
    },

    {
      "plus repeats",
      "(ab)+(cd)+",
      seq({
        repeat1(seq({ character({ 'a' }), character({ 'b' }) })),
        repeat1(seq({ character({ 'c' }), character({ 'd' }) })) })
    },

    {
"asterix repeats",
|
||||
"(ab)*(cd)*",
|
||||
seq({
|
||||
repeat(seq({ character({ 'a' }), character({ 'b' }) })),
|
||||
repeat(seq({ character({ 'c' }), character({ 'd' }) })) })
|
||||
},
|
||||
|
||||
{
|
||||
"optional rules",
|
||||
"a(bc)?",
|
||||
seq({
|
||||
character({ 'a' }),
|
||||
choice({
|
||||
seq({ character({ 'b' }), character({ 'c' }) }),
|
||||
blank() }) })
|
||||
},
|
||||
|
||||
{
|
||||
"choices containing negated character classes",
|
||||
"/([^/]|(\\\\/))*/",
|
||||
seq({
|
||||
character({ '/' }),
|
||||
repeat(choice({
|
||||
character({ '/' }, false),
|
||||
seq({ character({ '\\' }), character({ '/' }) }) })),
|
||||
character({ '/' }), }),
|
||||
},
|
||||
};
|
||||
|
||||
struct InvalidInputRow {
|
||||
string description;
|
||||
string pattern;
|
||||
const char *message;
|
||||
};
|
||||
|
||||
vector<InvalidInputRow> invalid_inputs = {
|
||||
{
|
||||
"mismatched open parens",
|
||||
"(a",
|
||||
"unmatched open paren",
|
||||
},
|
||||
{
|
||||
"mismatched nested open parens",
|
||||
"((a) (b)",
|
||||
"unmatched open paren",
|
||||
},
|
||||
{
|
||||
"mismatched close parens",
|
||||
"a)",
|
||||
"unmatched close paren",
|
||||
},
|
||||
{
|
||||
"mismatched nested close parens",
|
||||
"((a) b))",
|
||||
"unmatched close paren",
|
||||
},
|
||||
{
|
||||
"mismatched brackets for character classes",
|
||||
"[a",
|
||||
"unmatched open square bracket",
|
||||
},
|
||||
{
|
||||
"mismatched brackets for character classes",
|
||||
"a]",
|
||||
"unmatched close square bracket",
|
||||
},
|
||||
};
|
||||
|
||||
for (auto &row : valid_inputs) {
|
||||
it(("parses " + row.description).c_str(), [&]() {
|
||||
auto result = parse_regex(row.pattern);
|
||||
AssertThat(result.first, EqualsPointer(row.rule));
|
||||
});
|
||||
}
|
||||
|
||||
for (auto &row : invalid_inputs) {
|
||||
it(("handles invalid regexes with " + row.description).c_str(), [&]() {
|
||||
auto result = parse_regex(row.pattern);
|
||||
AssertThat(result.second.type, Equals(TSCompileErrorTypeInvalidRegex));
|
||||
AssertThat(result.second.message, Contains(row.message));
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
END_TEST
337
test/compiler/rules/character_set_test.cc
Normal file
@@ -0,0 +1,337 @@
#include "test_helper.h"
|
||||
#include "compiler/rules/character_set.h"
|
||||
|
||||
using namespace rules;
|
||||
|
||||
START_TEST
|
||||
|
||||
describe("CharacterSet", []() {
|
||||
describe("equality", [&]() {
|
||||
it("returns true for identical character sets", [&]() {
|
||||
CharacterSet set1 = CharacterSet()
|
||||
.include('a', 'd')
|
||||
.include('f', 'm');
|
||||
|
||||
CharacterSet set2 = CharacterSet()
|
||||
.include('a', 'd')
|
||||
.include('f', 'm');
|
||||
|
||||
AssertThat(set1, Equals(set2));
|
||||
});
|
||||
|
||||
it("returns false for character sets that include different ranges", [&]() {
|
||||
CharacterSet set1 = CharacterSet()
|
||||
.include('a', 'd')
|
||||
.include('f', 'm');
|
||||
|
||||
CharacterSet set2 = CharacterSet()
|
||||
.include('a', 'c')
|
||||
.include('f', 'm');
|
||||
|
||||
AssertThat(set1, !Equals(set2));
|
||||
AssertThat(set2, !Equals(set1));
|
||||
});
|
||||
|
||||
it("returns false for character sets that exclude different ranges", [&]() {
|
||||
CharacterSet set1 = CharacterSet()
|
||||
.include_all()
|
||||
.exclude('a', 'd')
|
||||
.exclude('f', 'm');
|
||||
|
||||
CharacterSet set2 = CharacterSet()
|
||||
.include_all()
|
||||
.exclude('a', 'c')
|
||||
.exclude('f', 'm');
|
||||
|
||||
AssertThat(set1, !Equals(set2));
|
||||
AssertThat(set2, !Equals(set1));
|
||||
});
|
||||
|
||||
it("returns false for character sets with different sign", [&]() {
|
||||
CharacterSet set1 = CharacterSet().include_all();
|
||||
CharacterSet set2 = CharacterSet();
|
||||
|
||||
AssertThat(set1, !Equals(set2));
|
||||
AssertThat(set2, !Equals(set1));
|
||||
});
|
||||
});
|
||||
|
||||
describe("hashing", [&]() {
|
||||
it("returns the same number for identical character sets", [&]() {
|
||||
CharacterSet set1 = CharacterSet()
|
||||
.include('a', 'd')
|
||||
.include('f', 'm');
|
||||
|
||||
CharacterSet set2 = CharacterSet()
|
||||
.include('a', 'd')
|
||||
.include('f', 'm');
|
||||
|
||||
AssertThat(set1.hash_code(), Equals(set2.hash_code()));
|
||||
});
|
||||
|
||||
it("returns different numbers for character sets that include different ranges", [&]() {
|
||||
CharacterSet set1 = CharacterSet()
|
||||
.include('a', 'd')
|
||||
.include('f', 'm');
|
||||
|
||||
CharacterSet set2 = CharacterSet()
|
||||
.include('a', 'c')
|
||||
.include('f', 'm');
|
||||
|
||||
AssertThat(set1.hash_code(), !Equals(set2.hash_code()));
|
||||
AssertThat(set2.hash_code(), !Equals(set1.hash_code()));
|
||||
});
|
||||
|
||||
it("returns different numbers for character sets that exclude different ranges", [&]() {
|
||||
CharacterSet set1 = CharacterSet()
|
||||
.include_all()
|
||||
.exclude('a', 'd')
|
||||
.exclude('f', 'm');
|
||||
|
||||
CharacterSet set2 = CharacterSet()
|
||||
.include_all()
|
||||
.exclude('a', 'c')
|
||||
.exclude('f', 'm');
|
||||
|
||||
AssertThat(set1.hash_code(), !Equals(set2.hash_code()));
|
||||
AssertThat(set2.hash_code(), !Equals(set1.hash_code()));
|
||||
});
|
||||
|
||||
it("returns different numbers for character sets with different sign", [&]() {
|
||||
CharacterSet set1 = CharacterSet().include_all();
|
||||
CharacterSet set2 = CharacterSet();
|
||||
|
||||
AssertThat(set1.hash_code(), !Equals(set2.hash_code()));
|
||||
AssertThat(set2.hash_code(), !Equals(set1.hash_code()));
|
||||
});
|
||||
});
|
||||
|
||||
describe("::is_empty", [&]() {
|
||||
it("returns true for empty character sets", [&]() {
|
||||
AssertThat(CharacterSet().is_empty(), Equals(true));
|
||||
});
|
||||
|
||||
it("returns false for full character sets", [&]() {
|
||||
AssertThat(CharacterSet().include_all().is_empty(), Equals(false));
|
||||
});
|
||||
|
||||
it("returns false for character sets that include some characters", [&]() {
|
||||
AssertThat(CharacterSet().include('x').is_empty(), Equals(false));
|
||||
});
|
||||
});
|
||||
|
||||
describe("::include", [&]() {
|
||||
describe("when the set has a whitelist of characters", [&]() {
|
||||
it("adds included characters", [&]() {
|
||||
CharacterSet set1 = CharacterSet().include('a', 'd');
|
||||
AssertThat(set1, Equals(CharacterSet()
|
||||
.include('a')
|
||||
.include('b')
|
||||
.include('c')
|
||||
.include('d')));
|
||||
});
|
||||
});
|
||||
|
||||
describe("when the set has a blacklist of characters", [&]() {
|
||||
it("removes excluded characters", [&]() {
|
||||
CharacterSet set1 = CharacterSet()
|
||||
.include_all()
|
||||
.exclude('a', 'g')
|
||||
.include('c', 'e');
|
||||
|
||||
AssertThat(set1, Equals(CharacterSet()
|
||||
.include_all()
|
||||
.exclude('a')
|
||||
.exclude('b')
|
||||
.exclude('f')
|
||||
.exclude('g')));
|
||||
});
|
||||
|
||||
it("does nothing if the character are already not excluded", [&]() {
|
||||
CharacterSet set1 = CharacterSet()
|
||||
.include_all()
|
||||
.include('a', 'c');
|
||||
|
||||
AssertThat(set1, Equals(CharacterSet().include_all()));
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe("::exclude", [&]() {
|
||||
describe("when the set has a whitelist of characters", [&]() {
|
||||
it("removes included characters", [&]() {
|
||||
CharacterSet set1 = CharacterSet()
|
||||
.include('a', 'g')
|
||||
.exclude('c', 'e');
|
||||
|
||||
AssertThat(set1, Equals(CharacterSet()
|
||||
.include('a')
|
||||
.include('b')
|
||||
.include('f')
|
||||
.include('g')));
|
||||
});
|
||||
|
||||
it("does nothing if the character's are already not included", [&]() {
|
||||
CharacterSet set1 = CharacterSet().exclude('a', 'c');
|
||||
AssertThat(set1, Equals(CharacterSet()));
|
||||
});
|
||||
});
|
||||
|
||||
describe("when the set has a blacklist of characters", [&]() {
|
||||
it("removes excluded characters", [&]() {
|
||||
CharacterSet set1 = CharacterSet()
|
||||
.include_all()
|
||||
.exclude('a', 'd');
|
||||
|
||||
AssertThat(set1, Equals(CharacterSet()
|
||||
.include_all()
|
||||
.exclude('a')
|
||||
.exclude('b')
|
||||
.exclude('c')
|
||||
.exclude('d')));
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe("::remove_set", []() {
|
||||
CharacterSet intersection;
|
||||
|
||||
describe("for a set with whitelisted characters", [&]() {
|
||||
describe("when the subtractend has whitelisted characters", [&]() {
|
||||
it("removes the included characters that the other set also includes", [&]() {
|
||||
CharacterSet set1 = CharacterSet().include('a', 'z');
|
||||
set1.remove_set(CharacterSet().include('d', 's'));
|
||||
AssertThat(set1, Equals(CharacterSet()
|
||||
.include('a', 'c')
|
||||
.include('t', 'z')));
|
||||
});
|
||||
|
||||
it("returns the characters that were removed", [&]() {
|
||||
CharacterSet set1 = CharacterSet().include('a', 'z');
|
||||
intersection = set1.remove_set(CharacterSet().include('d', 's'));
|
||||
AssertThat(intersection, Equals(CharacterSet()
|
||||
.include('d', 's')));
|
||||
});
|
||||
|
||||
it("returns the empty set when the sets are disjoint", [&]() {
|
||||
CharacterSet set1 = CharacterSet().include('a', 'z');
|
||||
intersection = set1.remove_set(CharacterSet().include('A', 'Z'));
|
||||
AssertThat(set1, Equals(CharacterSet().include('a', 'z')));
|
||||
AssertThat(intersection, Equals(CharacterSet()));
|
||||
});
|
||||
});
|
||||
|
||||
describe("when the subtractend has blacklisted characters", [&]() {
|
||||
it("removes the included characters that are not excluded by the other set", [&]() {
|
||||
CharacterSet set1 = CharacterSet().include('a', 'f');
|
||||
|
||||
intersection = set1.remove_set(CharacterSet()
|
||||
.include_all()
|
||||
.exclude('d', 'z'));
|
||||
|
||||
AssertThat(set1, Equals(CharacterSet()
|
||||
.include('d', 'f')));
|
||||
AssertThat(intersection, Equals(CharacterSet()
|
||||
.include('a', 'c')));
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe("for a set with blacklisted characters", [&]() {
|
||||
describe("when the subtractend has whitelisted characters", [&]() {
|
||||
it("adds the subtractend's inclusions to the receiver's exclusions", [&]() {
|
||||
CharacterSet set1 = CharacterSet()
|
||||
.include_all()
|
||||
.exclude('a', 'f');
|
||||
|
||||
intersection = set1.remove_set(CharacterSet()
|
||||
.include('x', 'z'));
|
||||
|
||||
AssertThat(set1, Equals(CharacterSet()
|
||||
.include_all()
|
||||
.exclude('a', 'f')
|
||||
.exclude('x', 'z')));
|
||||
|
||||
AssertThat(intersection, Equals(CharacterSet().include('x', 'z')));
|
||||
});
|
||||
});
|
||||
|
||||
describe("when the subtractend has blacklisted characters", [&]() {
|
||||
it("includes only the characters excluded by the subtractend but not by the receiver", [&]() {
|
||||
CharacterSet set1 = CharacterSet()
|
||||
.include_all()
|
||||
.exclude('a', 'm');
|
||||
|
||||
set1.remove_set(CharacterSet()
|
||||
.include_all()
|
||||
.exclude('d', 'z'));
|
||||
|
||||
AssertThat(set1, Equals(CharacterSet()
|
||||
.include('n', 'z')));
|
||||
});
|
||||
|
||||
it("returns the characters excluded by neither set", [&]() {
|
||||
CharacterSet set1 = CharacterSet()
|
||||
.include_all()
|
||||
.exclude('a', 'm');
|
||||
|
||||
intersection = set1.remove_set(CharacterSet()
|
||||
.include_all()
|
||||
.exclude('d', 'z'));
|
||||
|
||||
AssertThat(intersection, Equals(CharacterSet()
|
||||
.include_all()
|
||||
.exclude('a', 'z')));
|
||||
});
|
||||
|
||||
it("works when the sets are disjoint", [&]() {
|
||||
CharacterSet set1 = CharacterSet()
|
||||
.include_all()
|
||||
.exclude('a', 'm');
|
||||
|
||||
intersection = set1.remove_set(CharacterSet()
|
||||
.include_all()
|
||||
.exclude('d', 'z'));
|
||||
|
||||
AssertThat(set1, Equals(CharacterSet()
|
||||
.include('n', 'z')));
|
||||
|
||||
AssertThat(intersection, Equals(CharacterSet()
|
||||
.include_all()
|
||||
.exclude('a', 'z')));
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe("::included_ranges", [&]() {
|
||||
it("consolidates sequences of 3 or more consecutive characters into ranges", [&]() {
|
||||
CharacterSet set1 = CharacterSet()
|
||||
.include('a', 'c')
|
||||
.include('g')
|
||||
.include('z');
|
||||
|
||||
AssertThat(set1.included_ranges(), Equals(vector<CharacterRange>({
|
||||
CharacterRange('a', 'c'),
|
||||
CharacterRange('g'),
|
||||
CharacterRange('z'),
|
||||
})));
|
||||
});
|
||||
|
||||
it("doesn't consolidate sequences of 2 consecutive characters", [&]() {
|
||||
CharacterSet set1 = CharacterSet()
|
||||
.include('a', 'b')
|
||||
.include('g')
|
||||
.include('z');
|
||||
|
||||
AssertThat(set1.included_ranges(), Equals(vector<CharacterRange>({
|
||||
CharacterRange('a'),
|
||||
CharacterRange('b'),
|
||||
CharacterRange('g'),
|
||||
CharacterRange('z'),
|
||||
})));
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
END_TEST
53
test/compiler/rules/choice_test.cc
Normal file
@@ -0,0 +1,53 @@
#include "test_helper.h"
|
||||
#include "compiler/rules/choice.h"
|
||||
#include "helpers/rule_helpers.h"
|
||||
#include "helpers/equals_pointer.h"
|
||||
|
||||
using namespace rules;
|
||||
|
||||
START_TEST
|
||||
|
||||
describe("Choice", []() {
|
||||
describe("constructing choices", [&]() {
|
||||
it("eliminates duplicate members", [&]() {
|
||||
auto rule = Choice::build({
|
||||
seq({ sym("one"), sym("two") }),
|
||||
sym("three"),
|
||||
seq({ sym("one"), sym("two") })
|
||||
});
|
||||
|
||||
AssertThat(rule, EqualsPointer(choice({
|
||||
seq({ sym("one"), sym("two") }),
|
||||
sym("three"),
|
||||
})));
|
||||
});
|
||||
|
||||
it("eliminates duplicates within nested choices", [&]() {
|
||||
auto rule = Choice::build({
|
||||
seq({ sym("one"), sym("two") }),
|
||||
Choice::build({
|
||||
sym("three"),
|
||||
seq({ sym("one"), sym("two") })
|
||||
})
|
||||
});
|
||||
|
||||
AssertThat(rule, EqualsPointer(choice({
|
||||
seq({ sym("one"), sym("two") }),
|
||||
sym("three"),
|
||||
})));
|
||||
});
|
||||
|
||||
it("doesn't construct a choice if there's only one unique member", [&]() {
|
||||
auto rule = Choice::build({
|
||||
sym("one"),
|
||||
Choice::build({
|
||||
sym("one"),
|
||||
})
|
||||
});
|
||||
|
||||
AssertThat(rule, EqualsPointer(sym("one")));
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
END_TEST
22
test/compiler/rules/repeat_test.cc
Normal file
@@ -0,0 +1,22 @@
#include "test_helper.h"
|
||||
#include "compiler/rules/repeat.h"
|
||||
#include "compiler/rules/symbol.h"
|
||||
|
||||
using namespace rules;
|
||||
|
||||
START_TEST
|
||||
|
||||
describe("Repeat", []() {
|
||||
describe("constructing repeats", [&]() {
|
||||
it("doesn't create redundant repeats", [&]() {
|
||||
auto sym = make_shared<Symbol>(1, Symbol::NonTerminal);
|
||||
auto repeat = Repeat::build(sym);
|
||||
auto outer_repeat = Repeat::build(repeat);
|
||||
|
||||
AssertThat(repeat, !Equals(sym));
|
||||
AssertThat(outer_repeat, Equals(repeat));
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
END_TEST
26
test/compiler/util/string_helpers_test.cc
Normal file
@@ -0,0 +1,26 @@
#include "test_helper.h"
|
||||
#include "compiler/util/string_helpers.h"
|
||||
|
||||
using util::escape_char;
|
||||
|
||||
START_TEST
|
||||
|
||||
describe("escape_char", []() {
|
||||
it("returns ascii characters as strings", [&]() {
|
||||
AssertThat(escape_char('x'), Equals("'x'"));
|
||||
});
|
||||
|
||||
it("escapes special characters with backslashes", [&]() {
|
||||
AssertThat(escape_char('\\'), Equals("'\\\\'"));
|
||||
AssertThat(escape_char('\n'), Equals("'\\n'"));
|
||||
AssertThat(escape_char('\t'), Equals("'\\t'"));
|
||||
AssertThat(escape_char('\r'), Equals("'\\r'"));
|
||||
AssertThat(escape_char('\''), Equals("'\\''"));
|
||||
});
|
||||
|
||||
it("prints non-ascii characters as numbers", [&]() {
|
||||
AssertThat(escape_char(256), Equals("256"));
|
||||
});
|
||||
});
|
||||
|
||||
END_TEST
130
test/fixtures/error_corpus/c_errors.txt
vendored
Normal file
@@ -0,0 +1,130 @@
========================================
Errors inside ifdefs
========================================

#ifdef something
int x // no semicolon
#endif

int a;

#ifdef __cplusplus
extern "C" {
#endif

int b;

#ifdef __cplusplus
}
#endif

int c;

---

(translation_unit
  (preproc_ifdef (identifier)
    (ERROR (identifier) (identifier))
    (comment))

  (declaration (identifier) (identifier))

  (preproc_ifdef (identifier)
    (ERROR (storage_class_specifier) (string_literal)))

  (declaration (identifier) (identifier))

  (preproc_ifdef (identifier)
    (ERROR))

  (declaration (identifier) (identifier)))

========================================
Errors inside blocks
========================================

int main() {
  int x;
  int %$#@
}

---

(translation_unit
  (function_definition
    (identifier)
    (function_declarator (identifier))
    (compound_statement
      (declaration (identifier) (identifier))
      (ERROR (identifier) (UNEXPECTED '$')))))

========================================
Errors inside expressions
========================================

int main() {
  int x = (123 123);
}

---

(translation_unit
  (function_definition
    (identifier)
    (function_declarator (identifier))
    (compound_statement
      (declaration (identifier) (init_declarator
        (identifier)
        (ERROR (number_literal))
        (number_literal))))))

========================================
Errors in declarations
========================================

float x WTF;
int y = 5;

---

(translation_unit
  (declaration (identifier) (ERROR (identifier)) (identifier))
  (declaration (identifier) (init_declarator (identifier) (number_literal))))

==========================================
Errors at the beginnings of blocks
==========================================

int a() {
  struct x = 1;
  struct y = 2;
}

int b() {
  w x y z = 3;
  w x y z = 4;
}

---

(translation_unit
  (function_definition
    (identifier) (function_declarator (identifier))
    (compound_statement
      (ERROR (struct_specifier (identifier)))
      (expression_statement (number_literal))
      (ERROR (struct_specifier (identifier)))
      (expression_statement (number_literal))))

  (function_definition
    (identifier) (function_declarator (identifier))
    (compound_statement
      (declaration
        (identifier)
        (init_declarator
          (ERROR (identifier) (identifier))
          (identifier) (number_literal)))
      (declaration
        (ERROR (identifier) (identifier))
        (identifier)
        (init_declarator (identifier) (number_literal))))))
157
test/fixtures/error_corpus/javascript_errors.txt
vendored
Normal file
@@ -0,0 +1,157 @@
===================================================
one invalid token right after the viable prefix
===================================================

if (a b) {
  c d;
}
e f;

---

(program
  (if_statement
    (ERROR (identifier))
    (identifier)
    (statement_block
      (ERROR (identifier))
      (expression_statement (identifier))))
  (ERROR (identifier))
  (expression_statement (identifier)))

=======================================================
multiple invalid tokens right after the viable prefix
=======================================================

if (a b c) {
  d e f g;
}
h i j k;

---

(program
  (if_statement
    (ERROR (identifier) (identifier))
    (identifier)
    (statement_block
      (ERROR (identifier) (identifier) (identifier))
      (expression_statement (identifier))))
  (expression_statement
    (ERROR (identifier) (identifier) (identifier))
    (identifier)))

===================================================
one invalid subtree right after the viable prefix
===================================================

if ({a: 'b'} {c: 'd'}) {
  x = function(a) { b; } function(c) { d; }
}

---

(program
  (if_statement
    (object (pair (identifier) (string)))
    (ERROR (object (pair (identifier) (string))))
    (statement_block
      (expression_statement (assignment
        (identifier)
        (ERROR (function
          (formal_parameters (identifier))
          (statement_block (expression_statement (identifier)))))
        (function
          (formal_parameters (identifier))
          (statement_block (expression_statement (identifier)))))))))

===================================================
one invalid token at the end of the file
===================================================

// skip the equals sign
a.b =
---

(program
  (comment)
  (trailing_expression_statement
    (member_access (identifier) (identifier)))
  (ERROR))

=================================================================
An invalid token at the end of a construct with extra line breaks
=================================================================

a(
  b,
  c,,
);

---

(program
  (expression_statement
    (function_call (identifier) (arguments
      (identifier)
      (identifier)
      (ERROR)))))

===================================================
Multi-line chained expressions in var declarations
===================================================

const one = two
  .three(four)
  .five()

---

(program
  (var_declaration (var_assignment
    (identifier)
    (function_call
      (member_access
        (function_call
          (member_access (identifier) (identifier))
          (arguments (identifier)))
        (identifier))
      (arguments)))))

===================================================
Errors after a sequence of function declarations
===================================================

/*
 * The JS grammar has an ambiguity such that these functions
 * can be parsed either as function declarations or as
 * function expressions. This ambiguity causes a lot of
 * splitting and merging in the parse stack. When iterating
 * the parse stack during an error repair, there would then
 * be a very large number (> 2^16) of paths through the parse
 * stack.
 */
function a() {}
function b() {}
function c() {}
function e() {}
function f() {}
function g() {}
function h() {}
function i() {}

var x = !!!

---

(program
  (comment)
  (expression_statement (function (identifier) (formal_parameters) (statement_block)))
  (expression_statement (function (identifier) (formal_parameters) (statement_block)))
  (expression_statement (function (identifier) (formal_parameters) (statement_block)))
  (expression_statement (function (identifier) (formal_parameters) (statement_block)))
  (expression_statement (function (identifier) (formal_parameters) (statement_block)))
  (expression_statement (function (identifier) (formal_parameters) (statement_block)))
  (expression_statement (function (identifier) (formal_parameters) (statement_block)))
  (expression_statement (function (identifier) (formal_parameters) (statement_block)))
  (trailing_var_declaration (identifier)) (ERROR))
56
test/fixtures/error_corpus/json_errors.txt
vendored
Normal file
@@ -0,0 +1,56 @@
==========================================
top-level errors
==========================================

[}

---

(ERROR)

==========================================
unexpected tokens
==========================================

barf

---

(ERROR (UNEXPECTED 'b'))

==========================================
errors inside arrays
==========================================

[1, , 2]

---
(array
  (number)
  (ERROR)
  (number))

==========================================
errors inside objects
==========================================

{ "key1": 1, oops }

---

(object (pair (string) (number)) (ERROR (UNEXPECTED 'o')))

==========================================
errors inside nested objects
==========================================

{ "key1": { "key2": 1, 2 }, [, "key3": 3 }

---

(object
  (pair (string) (object
    (pair (string) (number))
    (ERROR (number))))
  (ERROR)
  (pair (string) (number)))
29
test/fixtures/error_corpus/python_errors.txt
vendored
Normal file
@@ -0,0 +1,29 @@
==========================================
errors in if statements
==========================================

if a is:
  print b
  print c

---

(module
  (if_statement (identifier) (ERROR)
    (print_statement (identifier))
    (print_statement (identifier))))

==========================================
errors in function definitions
==========================================

def a()::
  b
  c

---

(module
  (function_definition (identifier) (parameters) (ERROR)
    (expression_statement (identifier))
    (expression_statement (identifier))))
0
test/fixtures/grammars/.gitkeep
vendored
Normal file
32
test/fixtures/test_grammars/anonymous_tokens_with_escaped_chars/corpus.txt
vendored
Normal file
@@ -0,0 +1,32 @@
================================================
anonymous tokens defined with character classes
================================================
1234
---

(first_rule)

=================================================
anonymous tokens defined with LF escape sequence
=================================================


---

(first_rule)

=================================================
anonymous tokens defined with CR escape sequence
=================================================

---

(first_rule)

================================================
anonymous tokens with quotes
================================================
'hello'
---

(first_rule)
14
test/fixtures/test_grammars/anonymous_tokens_with_escaped_chars/grammar.json
vendored
Normal file
@@ -0,0 +1,14 @@
{
  "name": "anonymous_tokens_with_escaped_chars",
  "rules": {
    "first_rule": {
      "type": "CHOICE",
      "members": [
        {"type": "STRING", "value": "\n"},
        {"type": "STRING", "value": "\r"},
        {"type": "STRING", "value": "'hello'"},
        {"type": "PATTERN", "value": "\\d+"}
      ]
    }
  }
}
1
test/fixtures/test_grammars/anonymous_tokens_with_escaped_chars/readme.md
vendored
Normal file
@@ -0,0 +1 @@
Every token in a grammar is given a name in the generated parser. Anonymous tokens (tokens specified directly in the body of some larger rule) are named according to their content. So when a token contains characters that aren't valid in a C string literal, we need to escape those characters. This grammar tests that this escaping works. The test is basically that the generated parser compiles successfully.
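To make the idea concrete, here is a minimal sketch of such an escaping step (the helper `write_escaped` and the exact escape set are assumptions for illustration, not tree-sitter's actual generator code):

```c
#include <stdio.h>

// Hypothetical sketch: emit `token` as the body of a C string literal,
// escaping the characters that would otherwise break the literal.
static void write_escaped(FILE *out, const char *token) {
  for (const char *c = token; *c; c++) {
    switch (*c) {
      case '\n': fputs("\\n", out); break;   // the "\n" token in this grammar
      case '\r': fputs("\\r", out); break;   // the "\r" token in this grammar
      case '\\': fputs("\\\\", out); break;
      case '"':  fputs("\\\"", out); break;
      default:   fputc(*c, out); break;      // "'hello'" passes through as-is
    }
  }
}

int main(void) {
  fputc('"', stdout);
  write_escaped(stdout, "\n");   // prints "\n" as two characters
  fputs("\"\n", stdout);
  return 0;
}
```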
8
test/fixtures/test_grammars/associativity_left/corpus.txt
vendored
Normal file
@@ -0,0 +1,8 @@
===================
chained operations
===================
x+y+z
---
(expression (math_operation
  (expression (math_operation (expression (identifier)) (expression (identifier))))
  (expression (identifier))))
31
test/fixtures/test_grammars/associativity_left/grammar.json
vendored
Normal file
@@ -0,0 +1,31 @@
{
  "name": "associativity_left",

  "rules": {
    "expression": {
      "type": "CHOICE",
      "members": [
        {"type": "SYMBOL", "name": "math_operation"},
        {"type": "SYMBOL", "name": "identifier"}
      ]
    },

    "math_operation": {
      "type": "PREC_LEFT",
      "value": 0,
      "content": {
        "type": "SEQ",
        "members": [
          {"type": "SYMBOL", "name": "expression"},
          {"type": "STRING", "value": "+"},
          {"type": "SYMBOL", "name": "expression"}
        ]
      }
    },

    "identifier": {
      "type": "PATTERN",
      "value": "[a-zA-Z]+"
    }
  }
}
13
test/fixtures/test_grammars/associativity_missing/expected_error.txt
vendored
Normal file
@@ -0,0 +1,13 @@
Unresolved conflict for symbol sequence:

expression '+' expression • '+' …

Possible interpretations:

1: (math_operation expression '+' expression) • '+' …
2: expression '+' (math_operation expression • '+' expression)

Possible resolutions:

1: Specify a left or right associativity in `math_operation`
2: Add a conflict for these rules: `math_operation`
27
test/fixtures/test_grammars/associativity_missing/grammar.json
vendored
Normal file
@@ -0,0 +1,27 @@
{
  "name": "associativity_missing",

  "rules": {
    "expression": {
      "type": "CHOICE",
      "members": [
        {"type": "SYMBOL", "name": "math_operation"},
        {"type": "SYMBOL", "name": "identifier"}
      ]
    },

    "math_operation": {
      "type": "SEQ",
      "members": [
        {"type": "SYMBOL", "name": "expression"},
        {"type": "STRING", "value": "+"},
        {"type": "SYMBOL", "name": "expression"}
      ]
    },

    "identifier": {
      "type": "PATTERN",
      "value": "[a-zA-Z]+"
    }
  }
}
8
test/fixtures/test_grammars/associativity_right/corpus.txt
vendored
Normal file
@@ -0,0 +1,8 @@
===================
chained operations
===================
x+y+z
---
(expression (math_operation
  (expression (identifier))
  (expression (math_operation (expression (identifier)) (expression (identifier))))))
31
test/fixtures/test_grammars/associativity_right/grammar.json
vendored
Normal file
@@ -0,0 +1,31 @@
{
  "name": "associativity_right",

  "rules": {
    "expression": {
      "type": "CHOICE",
      "members": [
        {"type": "SYMBOL", "name": "math_operation"},
        {"type": "SYMBOL", "name": "identifier"}
      ]
    },

    "math_operation": {
      "type": "PREC_RIGHT",
      "value": 0,
      "content": {
        "type": "SEQ",
        "members": [
          {"type": "SYMBOL", "name": "expression"},
          {"type": "STRING", "value": "+"},
          {"type": "SYMBOL", "name": "expression"}
        ]
      }
    },

    "identifier": {
      "type": "PATTERN",
      "value": "[a-zA-Z]+"
    }
  }
}
15
test/fixtures/test_grammars/conflicting_precedence/expected_error.txt
vendored
Normal file
@@ -0,0 +1,15 @@
Unresolved conflict for symbol sequence:

expression '+' expression • '*' …

Possible interpretations:

1: (sum expression '+' expression) • '*' …
2: expression '+' (product expression • '*' expression)
3: expression '+' (other_thing expression • '*' '*')

Possible resolutions:

1: Specify a higher precedence in `product` and `other_thing` than in the other rules.
2: Specify a higher precedence in `sum` than in the other rules.
3: Add a conflict for these rules: `sum` `product` `other_thing`
58
test/fixtures/test_grammars/conflicting_precedence/grammar.json
vendored
Normal file
@@ -0,0 +1,58 @@
{
  "name": "conflicting_precedence",

  "rules": {
    "expression": {
      "type": "CHOICE",
      "members": [
        {"type": "SYMBOL", "name": "sum"},
        {"type": "SYMBOL", "name": "product"},
        {"type": "SYMBOL", "name": "other_thing"}
      ]
    },

    "sum": {
      "type": "PREC_LEFT",
      "value": 0,
      "content": {
        "type": "SEQ",
        "members": [
          {"type": "SYMBOL", "name": "expression"},
          {"type": "STRING", "value": "+"},
          {"type": "SYMBOL", "name": "expression"}
        ]
      }
    },

    "product": {
      "type": "PREC_LEFT",
      "value": 1,
      "content": {
        "type": "SEQ",
        "members": [
          {"type": "SYMBOL", "name": "expression"},
          {"type": "STRING", "value": "*"},
          {"type": "SYMBOL", "name": "expression"}
        ]
      }
    },

    "other_thing": {
      "type": "PREC_LEFT",
      "value": -1,
      "content": {
        "type": "SEQ",
        "members": [
          {"type": "SYMBOL", "name": "expression"},
          {"type": "STRING", "value": "*"},
          {"type": "STRING", "value": "*"}
        ]
      }
    },

    "identifier": {
      "type": "PATTERN",
      "value": "[a-zA-Z]+"
    }
  }
}
2
test/fixtures/test_grammars/epsilon_rules/expected_error.txt
vendored
Normal file
@@ -0,0 +1,2 @@
The rule `rule_2` matches the empty string.
Tree-sitter currently does not support syntactic rules that match the empty string.
15
test/fixtures/test_grammars/epsilon_rules/grammar.json
vendored
Normal file
@@ -0,0 +1,15 @@
{
  "name": "epsilon_rules",

  "rules": {
    "rule_1": {"type": "SYMBOL", "name": "rule_2"},

    "rule_2": {
      "type": "CHOICE",
      "members": [
        {"type": "SYMBOL", "name": "rule_1"},
        {"type": "BLANK"}
      ]
    }
  }
}
41
test/fixtures/test_grammars/external_and_internal_tokens/corpus.txt
vendored
Normal file
@@ -0,0 +1,41 @@
=========================================
single-line statements - internal tokens
=========================================

a b

---

(statement (variable) (variable) (line_break))

=========================================
multi-line statements - internal tokens
=========================================

a
b

---

(statement (variable) (variable) (line_break))

=========================================
single-line statements - external tokens
=========================================

'hello' 'world'

---

(statement (string) (string) (line_break))

=========================================
multi-line statements - external tokens
=========================================

'hello'
'world'

---

(statement (string) (string) (line_break))
36
test/fixtures/test_grammars/external_and_internal_tokens/grammar.json
vendored
Normal file
@@ -0,0 +1,36 @@
{
  "name": "external_and_internal_tokens",

  "externals": [
    "string",
    "line_break"
  ],

  "extras": [
    {"type": "PATTERN", "value": "\\s"}
  ],

  "rules": {
    "statement": {
      "type": "SEQ",
      "members": [
        {"type": "SYMBOL", "name": "_expression"},
        {"type": "SYMBOL", "name": "_expression"},
        {"type": "SYMBOL", "name": "line_break"}
      ]
    },

    "_expression": {
      "type": "CHOICE",
      "members": [
        {"type": "SYMBOL", "name": "string"},
        {"type": "SYMBOL", "name": "variable"},
        {"type": "SYMBOL", "name": "number"}
      ]
    },

    "variable": {"type": "PATTERN", "value": "\\a+"},
    "number": {"type": "PATTERN", "value": "\\d+"},
    "line_break": {"type": "STRING", "value": "\n"}
  }
}
1
test/fixtures/test_grammars/external_and_internal_tokens/readme.md
vendored
Normal file
@@ -0,0 +1 @@
This grammar has an external scanner whose `scan` method needs to be able to check for the validity of an *internal* token. This is done by including the name of that internal token (`line_break`) in the grammar's `externals` field.
55
test/fixtures/test_grammars/external_and_internal_tokens/scanner.c
vendored
Normal file
@@ -0,0 +1,55 @@
#include <tree_sitter/parser.h>

enum {
  STRING,
  LINE_BREAK
};

void *tree_sitter_external_and_internal_tokens_external_scanner_create() { return NULL; }

void tree_sitter_external_and_internal_tokens_external_scanner_destroy(void *payload) {}

void tree_sitter_external_and_internal_tokens_external_scanner_reset(void *payload) {}

bool tree_sitter_external_and_internal_tokens_external_scanner_serialize(void *payload, TSExternalTokenState state) { return true; }

void tree_sitter_external_and_internal_tokens_external_scanner_deserialize(void *payload, TSExternalTokenState state) {}

bool tree_sitter_external_and_internal_tokens_external_scanner_scan(
  void *payload, TSLexer *lexer, const bool *whitelist) {

  // If a line-break is a valid lookahead token, only skip spaces.
  if (whitelist[LINE_BREAK]) {
    while (lexer->lookahead == ' ') {
      lexer->advance(lexer, true);
    }

    if (lexer->lookahead == '\n') {
      lexer->advance(lexer, false);
      lexer->result_symbol = LINE_BREAK;
      return true;
    }
  }

  // If a line-break is not a valid lookahead token, skip line breaks as well
  // as spaces.
  if (whitelist[STRING]) {
    while (lexer->lookahead == ' ' || lexer->lookahead == '\n') {
      lexer->advance(lexer, true);
    }

    if (lexer->lookahead == '\'') {
      lexer->advance(lexer, false);

      while (lexer->lookahead != '\'') {
        lexer->advance(lexer, false);
      }

      lexer->advance(lexer, false);
      lexer->result_symbol = STRING;
      return true;
    }
  }

  return false;
}
10
test/fixtures/test_grammars/external_extra_tokens/corpus.txt
vendored
Normal file
@@ -0,0 +1,10 @@
========================
extra external tokens
========================

x = # a comment
y

---

(assignment (variable) (comment) (variable))
25
test/fixtures/test_grammars/external_extra_tokens/grammar.json
vendored
Normal file
@@ -0,0 +1,25 @@
{
  "name": "external_extra_tokens",

  "externals": [
    "comment"
  ],

  "extras": [
    {"type": "PATTERN", "value": "\\s"},
    {"type": "SYMBOL", "name": "comment"}
  ],

  "rules": {
    "assignment": {
      "type": "SEQ",
      "members": [
        {"type": "SYMBOL", "name": "variable"},
        {"type": "STRING", "value": "="},
        {"type": "SYMBOL", "name": "variable"}
      ]
    },

    "variable": {"type": "PATTERN", "value": "\\a+"}
  }
}
36
test/fixtures/test_grammars/external_extra_tokens/scanner.c
vendored
Normal file
@@ -0,0 +1,36 @@
#include <tree_sitter/parser.h>

enum {
  COMMENT,
};

void *tree_sitter_external_extra_tokens_external_scanner_create() { return NULL; }

void tree_sitter_external_extra_tokens_external_scanner_destroy(void *payload) {}

void tree_sitter_external_extra_tokens_external_scanner_reset(void *payload) {}

bool tree_sitter_external_extra_tokens_external_scanner_serialize(void *payload, TSExternalTokenState state) { return true; }

void tree_sitter_external_extra_tokens_external_scanner_deserialize(void *payload, TSExternalTokenState state) {}

bool tree_sitter_external_extra_tokens_external_scanner_scan(
  void *payload, TSLexer *lexer, const bool *whitelist) {

  while (lexer->lookahead == ' ') {
    lexer->advance(lexer, true);
  }

  if (lexer->lookahead == '#') {
    lexer->advance(lexer, false);
    while (lexer->lookahead != '\n') {
      lexer->advance(lexer, false);
    }

    lexer->result_symbol = COMMENT;
    return true;
  }

  return false;
}
22
test/fixtures/test_grammars/external_tokens/corpus.txt
vendored
Normal file
@@ -0,0 +1,22 @@
========================
simple external tokens
=========================

x + %(sup (external) scanner?)

---

(expression (sum (expression (identifier)) (expression (string))))

==================================
external tokens that require state
==================================

%{sup {} #{x + y} {} scanner?}

---

(expression (string
  (expression (sum
    (expression (identifier))
    (expression (identifier))))))
57
test/fixtures/test_grammars/external_tokens/grammar.json
vendored
Normal file
@@ -0,0 +1,57 @@
{
  "name": "external_tokens",

  "externals": [
    "_percent_string",
    "_percent_string_start",
    "_percent_string_end"
  ],

  "extras": [
    {"type": "PATTERN", "value": "\\s"}
  ],

  "rules": {
    "expression": {
      "type": "CHOICE",
      "members": [
        {"type": "SYMBOL", "name": "string"},
        {"type": "SYMBOL", "name": "sum"},
        {"type": "SYMBOL", "name": "identifier"}
      ]
    },

    "sum": {
      "type": "PREC_LEFT",
      "value": 0,
      "content": {
        "type": "SEQ",
        "members": [
          {"type": "SYMBOL", "name": "expression"},
          {"type": "STRING", "value": "+"},
          {"type": "SYMBOL", "name": "expression"}
        ]
      }
    },

    "string": {
      "type": "CHOICE",
      "members": [
        {"type": "SYMBOL", "name": "_percent_string"},
        {
          "type": "SEQ",
          "members": [
            {"type": "SYMBOL", "name": "_percent_string_start"},
            {"type": "SYMBOL", "name": "expression"},
            {"type": "SYMBOL", "name": "_percent_string_end"}
          ]
        }
      ]
    },

    "identifier": {
      "type": "PATTERN",
      "value": "\\a+"
    }
  }
}
114
test/fixtures/test_grammars/external_tokens/scanner.c
vendored
Normal file
@@ -0,0 +1,114 @@
#include <tree_sitter/parser.h>
#include <stdlib.h>

enum {
  percent_string,
  percent_string_start,
  percent_string_end
};

typedef struct {
  int32_t open_delimiter;
  int32_t close_delimiter;
  uint32_t depth;
} Scanner;

void *tree_sitter_external_tokens_external_scanner_create() {
  Scanner *scanner = malloc(sizeof(Scanner));
  *scanner = (Scanner){
    .open_delimiter = 0,
    .close_delimiter = 0,
    .depth = 0
  };
  return scanner;
}

void tree_sitter_external_tokens_external_scanner_destroy(void *payload) {
  free(payload);
}

void tree_sitter_external_tokens_external_scanner_reset(void *payload) {}

bool tree_sitter_external_tokens_external_scanner_serialize(void *payload, TSExternalTokenState state) { return true; }

void tree_sitter_external_tokens_external_scanner_deserialize(void *payload, TSExternalTokenState state) {}

bool tree_sitter_external_tokens_external_scanner_scan(
  void *payload, TSLexer *lexer, const bool *whitelist) {
  Scanner *scanner = payload;

  if (whitelist[percent_string]) {
    while (lexer->lookahead == ' ' ||
           lexer->lookahead == '\t' ||
           lexer->lookahead == '\n') {
      lexer->advance(lexer, true);
    }

    if (lexer->lookahead != '%') return false;
    lexer->advance(lexer, false);

    switch (lexer->lookahead) {
      case '(':
        scanner->open_delimiter = '(';
        scanner->close_delimiter = ')';
        scanner->depth = 1;
        break;
      case '[':
        scanner->open_delimiter = '[';
        scanner->close_delimiter = ']';
        scanner->depth = 1;
        break;
      case '{':
        scanner->open_delimiter = '{';
        scanner->close_delimiter = '}';
        scanner->depth = 1;
        break;
      default:
        return false;
    }

    lexer->advance(lexer, false);

    for (;;) {
      if (scanner->depth == 0) {
        lexer->result_symbol = percent_string;
        return true;
      }

      if (lexer->lookahead == scanner->open_delimiter) {
        scanner->depth++;
      } else if (lexer->lookahead == scanner->close_delimiter) {
        scanner->depth--;
      } else if (lexer->lookahead == '#') {
        lexer->advance(lexer, false);
        if (lexer->lookahead == '{') {
          lexer->advance(lexer, false);
          lexer->result_symbol = percent_string_start;
          return true;
        }
      }

      lexer->advance(lexer, false);
    }
  } else if (whitelist[percent_string_end]) {
    if (lexer->lookahead != '}') return false;
    lexer->advance(lexer, false);

    for (;;) {
      if (scanner->depth == 0) {
        lexer->result_symbol = percent_string_end;
        return true;
      }

      if (lexer->lookahead == scanner->open_delimiter) {
        scanner->depth++;
      } else if (lexer->lookahead == scanner->close_delimiter) {
        scanner->depth--;
      }

      lexer->advance(lexer, false);
    }
  }

  return false;
}
33
test/fixtures/test_grammars/lexical_conflicts_due_to_state_merging/corpus.txt
vendored
Normal file
@@ -0,0 +1,33 @@
========================
regexes
========================

/a+/

---

(expression (regex))

========================
conditionals
========================

(if (1) /a+/)

---

(expression (parenthesized (expression (conditional
  (parenthesized (expression (number)))
  (expression (regex))))))

========================
quotients
========================

((1) / 2)

---

(expression (parenthesized (expression (quotient
  (expression (parenthesized (expression (number))))
  (expression (number))))))
65
test/fixtures/test_grammars/lexical_conflicts_due_to_state_merging/grammar.json
vendored
Normal file
@@ -0,0 +1,65 @@
{
  "name": "lexical_conflicts_due_to_state_merging",

  "extras": [
    {"type": "PATTERN", "value": "\\s"}
  ],

  "rules": {
    "expression": {
      "type": "CHOICE",
      "members": [
        {"type": "SYMBOL", "name": "conditional"},
        {"type": "SYMBOL", "name": "regex"},
        {"type": "SYMBOL", "name": "quotient"},
        {"type": "SYMBOL", "name": "number"},
        {"type": "SYMBOL", "name": "parenthesized"}
      ]
    },

    "conditional": {
      "type": "PREC_LEFT",
      "value": 1,
      "content": {
        "type": "SEQ",
        "members": [
          {"type": "STRING", "value": "if"},
          {"type": "SYMBOL", "name": "parenthesized"},
          {"type": "SYMBOL", "name": "expression"}
        ]
      }
    },

    "quotient": {
      "type": "PREC_LEFT",
      "value": 0,
      "content": {
        "type": "SEQ",
        "members": [
          {"type": "SYMBOL", "name": "expression"},
          {"type": "STRING", "value": "/"},
          {"type": "SYMBOL", "name": "expression"}
        ]
      }
    },

    "regex": {
      "type": "PATTERN",
      "value": "/[^/\n]+/"
    },

    "number": {
      "type": "PATTERN",
      "value": "\\d+"
    },

    "parenthesized": {
      "type": "SEQ",
      "members": [
        {"type": "STRING", "value": "("},
        {"type": "SYMBOL", "name": "expression"},
        {"type": "STRING", "value": ")"}
      ]
    }
  }
}
20
test/fixtures/test_grammars/lexical_conflicts_due_to_state_merging/readme.md
vendored
Normal file
@@ -0,0 +1,20 @@
This grammar has two tokens, `regex` and `/`, which conflict: when a `/` character is encountered, the lexer can't tell if it is part of a `/` token or a `regex` by looking ahead only one character. But because these tokens are never valid in the same position, this doesn't cause any problem.

When merging similar parse states in order to reduce the size of the parse table, it is important that we avoid merging states in a way that causes these two tokens to both appear as valid lookahead symbols in a given state.

If we weren't careful, this grammar would cause that to happen, because a `regex` is valid in this state:

```
(if (1) /\w+/)
        ^
```

and a `/` is valid in this state:

```
((1) / 2)
     ^
```

And these two states would otherwise be candidates for merging, because they both contain only the action `reduce(parenthesized, 3)`.
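As a small illustration of the point, here is a hedged sketch (the function `lex_slash` and its signature are invented for this example; this is not tree-sitter's lexer) of why the decision must come from the set of tokens valid in the current parse state rather than from one character of lookahead:

```c
#include <stdbool.h>

typedef enum { TOKEN_NONE, TOKEN_SLASH, TOKEN_REGEX } TokenType;

// Both tokens begin with '/', so one character of lookahead cannot
// distinguish them; the lexer must be told which token is valid in the
// current parse state. If state merging ever made both tokens valid at
// once, this dispatch would become ambiguous.
static TokenType lex_slash(const char *input, bool regex_is_valid) {
  if (input[0] != '/') return TOKEN_NONE;
  if (!regex_is_valid) return TOKEN_SLASH;
  // Try to match this grammar's regex token, /[^/\n]+/.
  int i = 1;
  while (input[i] != '\0' && input[i] != '/' && input[i] != '\n') i++;
  return (i > 1 && input[i] == '/') ? TOKEN_REGEX : TOKEN_SLASH;
}
```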
15
test/fixtures/test_grammars/precedence_on_single_child_missing/expected_error.txt
vendored
Normal file
@@ -0,0 +1,15 @@
Unresolved conflict for symbol sequence:

identifier • '{' …

Possible interpretations:

1: (expression identifier) • '{' …
2: (function_call identifier • block)

Possible resolutions:

1: Specify a higher precedence in `function_call` than in the other rules.
2: Specify a higher precedence in `expression` than in the other rules.
3: Specify a left or right associativity in `expression`
4: Add a conflict for these rules: `expression` `function_call`
63
test/fixtures/test_grammars/precedence_on_single_child_missing/grammar.json
vendored
Normal file
@@ -0,0 +1,63 @@
{
  "name": "precedence_on_single_child_missing",

  "extras": [
    {"type": "PATTERN", "value": "\\s"}
  ],

  "rules": {
    "expression": {
      "type": "CHOICE",
      "members": [
        {"type": "SYMBOL", "name": "function_call"},
        {"type": "SYMBOL", "name": "identifier"}
      ]
    },

    "function_call": {
      "type": "PREC_RIGHT",
      "value": 0,
      "content": {
        "type": "CHOICE",
        "members": [
          {
            "type": "SEQ",
            "members": [
              {"type": "SYMBOL", "name": "identifier"},
              {"type": "SYMBOL", "name": "expression"}
            ]
          },
          {
            "type": "SEQ",
            "members": [
              {"type": "SYMBOL", "name": "identifier"},
              {"type": "SYMBOL", "name": "block"}
            ]
          },
          {
            "type": "SEQ",
            "members": [
              {"type": "SYMBOL", "name": "identifier"},
              {"type": "SYMBOL", "name": "expression"},
              {"type": "SYMBOL", "name": "block"}
            ]
          }
        ]
      }
    },

    "block": {
      "type": "SEQ",
      "members": [
        {"type": "STRING", "value": "{"},
        {"type": "SYMBOL", "name": "expression"},
        {"type": "STRING", "value": "}"}
      ]
    },

    "identifier": {
      "type": "PATTERN",
      "value": "[a-zA-Z]+"
    }
  }
}
15
test/fixtures/test_grammars/precedence_on_single_child_missing/readme.md
vendored
Normal file
@@ -0,0 +1,15 @@
This language has function calls similar to Ruby's, with no parentheses required, and optional blocks.

There is a shift/reduce conflict here:

```
foo bar { baz }
        ^
```

The possible actions are:

1. `reduce(expression, 1)` - `bar` is an expression being passed to the `foo` function.
2. `shift` - `bar` is a function being called with the block `{ baz }`

The grammars `precedence_on_single_child_negative` and `precedence_on_single_child_positive` show possible resolutions to this conflict.
|
||||
12
test/fixtures/test_grammars/precedence_on_single_child_negative/corpus.txt
vendored
Normal file
@@ -0,0 +1,12 @@
===========================
function calls with blocks
===========================

foo bar { baz }

---

(expression (function_call
  (identifier)
  (expression (identifier))
  (block (expression (identifier)))))
63
test/fixtures/test_grammars/precedence_on_single_child_negative/grammar.json
vendored
Normal file
@@ -0,0 +1,63 @@
{
  "name": "precedence_on_single_child_negative",

  "extras": [
    {"type": "PATTERN", "value": "\\s"}
  ],

  "rules": {
    "expression": {
      "type": "CHOICE",
      "members": [
        {"type": "SYMBOL", "name": "function_call"},
        {"type": "SYMBOL", "name": "identifier"}
      ]
    },

    "function_call": {
      "type": "PREC_RIGHT",
      "value": -1,
      "content": {
        "type": "CHOICE",
        "members": [
          {
            "type": "SEQ",
            "members": [
              {"type": "SYMBOL", "name": "identifier"},
              {"type": "SYMBOL", "name": "expression"}
            ]
          },
          {
            "type": "SEQ",
            "members": [
              {"type": "SYMBOL", "name": "identifier"},
              {"type": "SYMBOL", "name": "block"}
            ]
          },
          {
            "type": "SEQ",
            "members": [
              {"type": "SYMBOL", "name": "identifier"},
              {"type": "SYMBOL", "name": "expression"},
              {"type": "SYMBOL", "name": "block"}
            ]
          }
        ]
      }
    },

    "block": {
      "type": "SEQ",
      "members": [
        {"type": "STRING", "value": "{"},
        {"type": "SYMBOL", "name": "expression"},
        {"type": "STRING", "value": "}"}
      ]
    },

    "identifier": {
      "type": "PATTERN",
      "value": "[a-zA-Z]+"
    }
  }
}
1
test/fixtures/test_grammars/precedence_on_single_child_negative/readme.md
vendored
Normal file
@@ -0,0 +1 @@
This grammar resolves the conflict shown in the `precedence_on_single_child_missing` grammar by giving `function_call` a negative precedence. As a result, reducing the `bar` variable to an expression is preferred over shifting the `{` token as part of `function_call`.
13
test/fixtures/test_grammars/precedence_on_single_child_positive/corpus.txt
vendored
Normal file
@@ -0,0 +1,13 @@
===========================
function calls with blocks
===========================

foo bar { baz }

---

(expression (function_call
  (identifier)
  (expression (function_call
    (identifier)
    (block (expression (identifier)))))))
63
test/fixtures/test_grammars/precedence_on_single_child_positive/grammar.json
vendored
Normal file
@@ -0,0 +1,63 @@
{
  "name": "precedence_on_single_child_positive",

  "extras": [
    {"type": "PATTERN", "value": "\\s"}
  ],

  "rules": {
    "expression": {
      "type": "CHOICE",
      "members": [
        {"type": "SYMBOL", "name": "function_call"},
        {"type": "SYMBOL", "name": "identifier"}
      ]
    },

    "function_call": {
      "type": "PREC_RIGHT",
      "value": 1,
      "content": {
        "type": "CHOICE",
        "members": [
          {
            "type": "SEQ",
            "members": [
              {"type": "SYMBOL", "name": "identifier"},
              {"type": "SYMBOL", "name": "expression"}
            ]
          },
          {
            "type": "SEQ",
            "members": [
              {"type": "SYMBOL", "name": "identifier"},
              {"type": "SYMBOL", "name": "block"}
            ]
          },
          {
            "type": "SEQ",
            "members": [
              {"type": "SYMBOL", "name": "identifier"},
              {"type": "SYMBOL", "name": "expression"},
              {"type": "SYMBOL", "name": "block"}
            ]
          }
        ]
      }
    },

    "block": {
      "type": "SEQ",
      "members": [
        {"type": "STRING", "value": "{"},
        {"type": "SYMBOL", "name": "expression"},
        {"type": "STRING", "value": "}"}
      ]
    },

    "identifier": {
      "type": "PATTERN",
      "value": "[a-zA-Z]+"
    }
  }
}
1
test/fixtures/test_grammars/precedence_on_single_child_positive/readme.md
vendored
Normal file
@@ -0,0 +1 @@
This grammar resolves the conflict shown in the `precedence_on_single_child_missing` grammar by giving `function_call` a positive precedence. As a result, shifting the `{` token as part of `function_call` is preferred over reducing the `bar` variable to an expression.
24
test/fixtures/test_grammars/precedence_on_subsequence/corpus.txt
vendored
Normal file
@@ -0,0 +1,24 @@
==========================================
curly brace blocks with high precedence
==========================================

a b {}

---

(expression (function_call
  (identifier)
  (expression (function_call (identifier) (block)))))

==========================================
do blocks with low precedence
==========================================

a b do end

---

(expression (function_call
  (identifier)
  (expression (identifier))
  (do_block)))
135
test/fixtures/test_grammars/precedence_on_subsequence/grammar.json
vendored
Normal file
@@ -0,0 +1,135 @@
{
  "name": "precedence_on_subsequence",

  "extras": [
    {"type": "PATTERN", "value": "\\s"}
  ],

  "rules": {
    "expression": {
      "type": "PREC_LEFT",
      "value": 0,
      "content": {
        "type": "CHOICE",
        "members": [
          {"type": "SYMBOL", "name": "function_call"},
          {"type": "SYMBOL", "name": "identifier"},
          {"type": "SYMBOL", "name": "scope_resolution"}
        ]
      }
    },

    "function_call": {
      "type": "CHOICE",
      "members": [
        {
          "type": "SEQ",
          "members": [
            {"type": "SYMBOL", "name": "identifier"},
            {"type": "SYMBOL", "name": "expression"}
          ]
        },

        {
          "type": "PREC",
          "value": 1,
          "content": {
            "type": "SEQ",
            "members": [
              {"type": "SYMBOL", "name": "identifier"},
              {"type": "SYMBOL", "name": "block"}
            ]
          }
        },

        {
          "type": "PREC",
          "value": -1,
          "content": {
            "type": "SEQ",
            "members": [
              {"type": "SYMBOL", "name": "identifier"},
              {"type": "SYMBOL", "name": "do_block"}
            ]
          }
        },

        {
          "type": "SEQ",
          "members": [
            {"type": "SYMBOL", "name": "identifier"},
            {
              "type": "PREC",
              "value": 1,
              "content": {
                "type": "SEQ",
                "members": [
                  {"type": "SYMBOL", "name": "expression"},
                  {"type": "SYMBOL", "name": "block"}
                ]
              }
            }
          ]
        },

        {
          "type": "SEQ",
          "members": [
            {"type": "SYMBOL", "name": "identifier"},
            {
              "type": "PREC",
              "value": -1,
              "content": {
                "type": "SEQ",
                "members": [
                  {"type": "SYMBOL", "name": "expression"},
                  {"type": "SYMBOL", "name": "do_block"}
                ]
              }
            }
          ]
        }
      ]
    },

    "scope_resolution": {
      "type": "PREC_LEFT",
      "value": 1,
      "content": {
        "type": "CHOICE",
        "members": [
          {
            "type": "SEQ",
            "members": [
              {"type": "SYMBOL", "name": "expression"},
              {"type": "STRING", "value": "::"},
              {"type": "SYMBOL", "name": "expression"}
            ]
          },
          {
            "type": "SEQ",
            "members": [
              {"type": "STRING", "value": "::"},
              {"type": "SYMBOL", "name": "expression"}
            ]
          }
        ]
      }
    },

    "block": {
      "type": "STRING",
      "value": "{}"
    },

    "do_block": {
      "type": "STRING",
      "value": "do end"
    },

    "identifier": {
      "type": "PATTERN",
      "value": "[a-zA-Z]+"
    }
  }
}
3
test/fixtures/test_grammars/readme.md
vendored
Normal file
@@ -0,0 +1,3 @@
These small grammars demonstrate specific features or guard against specific regressions.

For some of them, compilation is expected to fail with a given error message. For others, the resulting parser is expected to produce certain trees.
13
test/fixtures/test_grammars/readme_grammar/corpus.txt
vendored
Normal file
@@ -0,0 +1,13 @@
==================================
the readme example
==================================

a + b * c

---

(expression (sum
  (expression (variable))
  (expression (product
    (expression (variable))
    (expression (variable))))))
67
test/fixtures/test_grammars/readme_grammar/grammar.json
vendored
Normal file
@@ -0,0 +1,67 @@
{
  "name": "readme_grammar",

  // Things that can appear anywhere in the language, like comments
  // and whitespace, are expressed as 'extras'.
  "extras": [
    {"type": "PATTERN", "value": "\\s"},
    {"type": "SYMBOL", "name": "comment"}
  ],

  "rules": {

    // The first rule listed in the grammar becomes the 'start rule'.
    "expression": {
      "type": "CHOICE",
      "members": [
        {"type": "SYMBOL", "name": "sum"},
        {"type": "SYMBOL", "name": "product"},
        {"type": "SYMBOL", "name": "number"},
        {"type": "SYMBOL", "name": "variable"},
        {
          "type": "SEQ",
          "members": [
            {"type": "STRING", "value": "("},
            {"type": "SYMBOL", "name": "expression"},
            {"type": "STRING", "value": ")"}
          ]
        }
      ]
    },

    // Tokens like '+' and '*' are described directly within the
    // grammar's rules, as opposed to in a separate lexer description.
    "sum": {
      "type": "PREC_LEFT",
      "value": 1,
      "content": {
        "type": "SEQ",
        "members": [
          {"type": "SYMBOL", "name": "expression"},
          {"type": "STRING", "value": "+"},
          {"type": "SYMBOL", "name": "expression"}
        ]
      }
    },

    // Ambiguities can be resolved at compile time by assigning precedence
    // values to rule subtrees.
    "product": {
      "type": "PREC_LEFT",
      "value": 2,
      "content": {
        "type": "SEQ",
        "members": [
          {"type": "SYMBOL", "name": "expression"},
          {"type": "STRING", "value": "*"},
          {"type": "SYMBOL", "name": "expression"}
        ]
      }
    },

    // Tokens can be specified using ECMAScript regexps.
    "number": {"type": "PATTERN", "value": "\\d+"},
    "comment": {"type": "PATTERN", "value": "#.*"},
    "variable": {"type": "PATTERN", "value": "[a-zA-Z]\\w*"}
  }
}
7
test/fixtures/test_grammars/start_rule_is_blank/corpus.txt
vendored
Normal file
@@ -0,0 +1,7 @@
========================
the empty string
========================

---

(first_rule)
6
test/fixtures/test_grammars/start_rule_is_blank/grammar.json
vendored
Normal file
@@ -0,0 +1,6 @@
{
  "name": "start_rule_is_blank",
  "rules": {
    "first_rule": {"type": "BLANK"}
  }
}
6
test/fixtures/test_grammars/start_rule_is_token/corpus.txt
vendored
Normal file
@@ -0,0 +1,6 @@
===========================
the single token
===========================
the-value
---
(first_rule)
6
test/fixtures/test_grammars/start_rule_is_token/grammar.json
vendored
Normal file
@@ -0,0 +1,6 @@
{
  "name": "start_rule_is_token",
  "rules": {
    "first_rule": {"type": "STRING", "value": "the-value"}
  }
}
12
test/helpers/dedent.h
Normal file
@@ -0,0 +1,12 @@
#include "compiler/util/string_helpers.h"
#include <string>

static std::string dedent(std::string input) {
  size_t indent_level = input.find_first_not_of("\n ") - input.find_first_not_of("\n");
  std::string whitespace = "\n" + std::string(indent_level, ' ');
  tree_sitter::util::str_replace(&input, whitespace, "\n");
  return input.substr(
    input.find_first_not_of("\n "),
    input.find_last_not_of("\n ") + 1
  );
}
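The index arithmetic above is easiest to see on a concrete string. A minimal usage sketch (hypothetical, not part of this commit; it assumes `dedent.h` and its `string_helpers` dependency are on the include path):

```cpp
// Hypothetical usage of dedent() from test/helpers/dedent.h.
#include <iostream>
#include "helpers/dedent.h"

int main() {
  // The raw literal's lines carry a uniform 4-space indent plus
  // surrounding newlines, which dedent() strips away.
  std::string tree = dedent(R"(
    (expression
      (identifier))
  )");
  // indent_level is 4, so every "\n    " collapses to "\n", yielding
  // the bare s-expression starting at column 0.
  std::cout << tree << "\n";
}
```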
58
test/helpers/encoding_helpers.cc
Normal file
@@ -0,0 +1,58 @@
#include "helpers/encoding_helpers.h"
#include "runtime/utf16.h"
#include <assert.h>
#include "utf8proc.h"

static inline int string_iterate(TSInputEncoding encoding, const uint8_t *string, size_t length, int32_t *code_point) {
  if (encoding == TSInputEncodingUTF8)
    return utf8proc_iterate(string, length, code_point);
  else
    return utf16_iterate(string, length, code_point);
}

size_t string_char_count(TSInputEncoding encoding, const std::string &input) {
  const char *string = input.data();
  size_t size = input.size();
  size_t character = 0, byte = 0;

  while (byte < size) {
    int32_t code_point;
    byte += string_iterate(encoding, (uint8_t *)string + byte, size - byte, &code_point);
    character++;
  }

  return character;
}

long string_byte_for_character(TSInputEncoding encoding, const std::string &input, size_t byte_offset, size_t goal_character) {
  const char *string = input.data() + byte_offset;
  size_t size = input.size() - byte_offset;
  size_t character = 0, byte = 0;

  while (character < goal_character) {
    if (byte >= size)
      return -1;

    int32_t code_point;
    byte += string_iterate(encoding, (uint8_t *)string + byte, size - byte, &code_point);
    character++;
  }

  return byte;
}

size_t utf8_char_count(const std::string &input) {
  return string_char_count(TSInputEncodingUTF8, input);
}

size_t utf16_char_count(const std::string &input) {
  return string_char_count(TSInputEncodingUTF16, input);
}

long utf8_byte_for_character(const std::string &input, size_t byte_offset, size_t goal_character) {
  return string_byte_for_character(TSInputEncodingUTF8, input, byte_offset, goal_character);
}

long utf16_byte_for_character(const std::string &input, size_t byte_offset, size_t goal_character) {
  return string_byte_for_character(TSInputEncodingUTF16, input, byte_offset, goal_character);
}
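A quick illustration of the character/byte distinction these helpers exist for (a hypothetical sketch, not part of the commit):

```cpp
// Hypothetical usage of the encoding helpers above.
#include <cassert>
#include "helpers/encoding_helpers.h"

int main() {
  // "é" is one character but two bytes in UTF-8, so the byte length (6)
  // and the character count (5) disagree.
  std::string text = "h\xc3\xa9llo";
  assert(text.size() == 6);
  assert(utf8_char_count(text) == 5);

  // The byte position of character 2 (the first 'l') is 3, because of
  // the two-byte 'é'.
  assert(utf8_byte_for_character(text, 0, 2) == 3);

  // Asking for a character past the end of the string yields -1.
  assert(utf8_byte_for_character(text, 0, 10) == -1);
}
```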
15
test/helpers/encoding_helpers.h
Normal file
@@ -0,0 +1,15 @@
#ifndef HELPERS_ENCODING_HELPERS_H_
#define HELPERS_ENCODING_HELPERS_H_

#include <string>
#include "tree_sitter/runtime.h"

size_t string_char_count(TSInputEncoding, const std::string &);
size_t utf8_char_count(const std::string &);
size_t utf16_char_count(const std::string &);

long string_byte_for_character(TSInputEncoding, const std::string &, size_t byte_offset, size_t character);
long utf8_byte_for_character(const std::string &, size_t byte_offset, size_t character);
long utf16_byte_for_character(const std::string &, size_t byte_offset, size_t character);

#endif  // HELPERS_ENCODING_HELPERS_H_
37
test/helpers/equals_pointer.h
Normal file
@@ -0,0 +1,37 @@
#ifndef HELPERS_EQUALS_POINTER_H_
#define HELPERS_EQUALS_POINTER_H_

#include "bandit/bandit.h"
#include <string>

namespace snowhouse {
using namespace std;

template<typename ExpectedType>
struct EqualsPointerConstraint : Expression<EqualsPointerConstraint<ExpectedType>> {
  EqualsPointerConstraint(const ExpectedType& expected) : expected(expected) {}

  template<typename ActualType>
  bool operator()(const ActualType& actual) const {
    return *expected == *actual;
  }

  ExpectedType expected;
};

template<typename ExpectedType>
struct Stringizer<EqualsPointerConstraint<ExpectedType>> {
  static string ToString(const EqualsPointerConstraint<ExpectedType>& constraint) {
    ostringstream builder;
    builder << "pointer to " << snowhouse::Stringize(constraint.expected);
    return builder.str();
  }
};

template<typename ExpectedType>
inline EqualsPointerConstraint<ExpectedType> EqualsPointer(const ExpectedType& expected) {
  return EqualsPointerConstraint<ExpectedType>(expected);
}
}

#endif  // HELPERS_EQUALS_POINTER_H_
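Inside a bandit spec this reads as follows (a hypothetical usage sketch; `i_sym` is the rule-building helper defined further down in this commit):

```cpp
// EqualsPointer compares the pointees, not the pointer values, so two
// distinct shared_ptrs to equal rules satisfy the assertion.
rule_ptr expected = i_sym(3);
rule_ptr actual = i_sym(3);
AssertThat(actual, EqualsPointer(expected));  // passes: *actual == *expected
```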
61
test/helpers/file_helpers.cc
Normal file
@@ -0,0 +1,61 @@
#include "helpers/file_helpers.h"
#include <sys/stat.h>
#include <errno.h>
#include <stdio.h>
#include <fstream>
#include <dirent.h>

using std::string;
using std::ifstream;
using std::istreambuf_iterator;
using std::ofstream;
using std::vector;

bool file_exists(const string &path) {
  struct stat file_stat;
  return stat(path.c_str(), &file_stat) == 0;
}

int get_modified_time(const string &path) {
  struct stat file_stat;
  if (stat(path.c_str(), &file_stat) != 0) {
    if (errno != ENOENT)
      fprintf(stderr, "Error in stat() for path: %s\n", path.c_str());
    return 0;
  }
  return file_stat.st_mtime;
}

string read_file(const string &path) {
  ifstream file(path);
  istreambuf_iterator<char> file_iterator(file), end_iterator;
  string content(file_iterator, end_iterator);
  file.close();
  return content;
}

void write_file(const string &path, const string &content) {
  ofstream file(path);
  file << content;
  file.close();
}

vector<string> list_directory(const string &path) {
  vector<string> result;

  DIR *dir = opendir(path.c_str());
  if (!dir) {
    printf("\nTest error - no such directory '%s'", path.c_str());
    return result;
  }

  struct dirent *dir_entry;
  while ((dir_entry = readdir(dir))) {
    string name(dir_entry->d_name);
    if (name != "." && name != "..") {
      result.push_back(name);
    }
  }

  closedir(dir);
  return result;
}
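A hypothetical round-trip through these helpers, mainly to flag the one non-obvious convention: `get_modified_time` treats a missing file as mtime 0 rather than an error, which is what lets callers treat "missing" as "infinitely stale".

```cpp
// Hypothetical usage sketch (assumes the out/tmp directory already exists).
#include <cassert>
#include "helpers/file_helpers.h"

int main() {
  write_file("out/tmp/example.txt", "hello");
  assert(file_exists("out/tmp/example.txt"));
  assert(read_file("out/tmp/example.txt") == "hello");

  // Missing paths report mtime 0, so any real file compares as newer.
  assert(get_modified_time("out/tmp/no-such-file") == 0);
}
```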
14
test/helpers/file_helpers.h
Normal file
@@ -0,0 +1,14 @@
#ifndef HELPERS_FILE_HELPERS_H_
#define HELPERS_FILE_HELPERS_H_

#include <string>
#include <vector>
#include <sys/stat.h>

bool file_exists(const std::string &path);
int get_modified_time(const std::string &path);
std::string read_file(const std::string &path);
void write_file(const std::string &path, const std::string &content);
std::vector<std::string> list_directory(const std::string &path);

#endif  // HELPERS_FILE_HELPERS_H_
185
test/helpers/load_language.cc
Normal file
@@ -0,0 +1,185 @@
#include "test_helper.h"
#include "helpers/load_language.h"
#include "helpers/file_helpers.h"
#include <unistd.h>
#include <dlfcn.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <map>
#include <string>
#include <vector>
#include <fstream>
#include <stdlib.h>
#include "tree_sitter/compiler.h"

using std::map;
using std::string;
using std::to_string;
using std::vector;
using std::ifstream;
using std::ofstream;
using std::istreambuf_iterator;

map<string, const TSLanguage *> loaded_languages;
int libcompiler_mtime = -1;
int compile_result_count = 0;

const char *libcompiler_path =
#if defined(__linux)
  "out/Test/obj.target/libcompiler.a";
#else
  "out/Test/libcompiler.a";
#endif

static std::string run_command(const char *cmd, const char *args[]) {
  int child_pid = fork();
  if (child_pid < 0) {
    return "fork failed";
  }

  if (child_pid == 0) {
    close(0);
    dup2(1, 0);
    dup2(2, 1);
    dup2(1, 2);
    execvp(cmd, (char * const *)args);
  }

  int status;
  do {
    waitpid(child_pid, &status, 0);
  } while (!WIFEXITED(status));

  if (WEXITSTATUS(status) == 0) {
    return "";
  } else {
    return "command failed";
  }
}

static const TSLanguage *load_language(const string &source_filename,
                                       const string &lib_filename,
                                       const string &language_name,
                                       string external_scanner_filename = "") {
  string language_function_name = "tree_sitter_" + language_name;
  string header_dir = getenv("PWD") + string("/include");
  int source_mtime = get_modified_time(source_filename);
  int header_mtime = get_modified_time(header_dir + "/tree_sitter/parser.h");
  int lib_mtime = get_modified_time(lib_filename);
  int external_scanner_mtime = get_modified_time(external_scanner_filename);

  if (!header_mtime || lib_mtime < header_mtime || lib_mtime < source_mtime ||
      lib_mtime < external_scanner_mtime) {
    const char *compiler_name = getenv("CXX");
    if (!compiler_name) compiler_name = "c++";

    vector<const char *> compile_args = {
      compiler_name,
      "-shared",
      "-fPIC",
      "-I", header_dir.c_str(),
      "-o", lib_filename.c_str(),
      "-x", "c",
      source_filename.c_str()
    };

    if (!external_scanner_filename.empty()) {
      compile_args.push_back("-g");
      string extension = external_scanner_filename.substr(external_scanner_filename.rfind("."));
      if (extension == ".c") {
        compile_args.push_back("-xc");
      } else {
        compile_args.push_back("-xc++");
      }
      compile_args.push_back(external_scanner_filename.c_str());
    }

    compile_args.push_back(nullptr);

    string compile_error = run_command(compiler_name, compile_args.data());
    if (!compile_error.empty()) {
      AssertThat(string(compile_error), IsEmpty());
      return nullptr;
    }
  }

  void *parser_lib = dlopen(lib_filename.c_str(), RTLD_NOW);
  if (!parser_lib) {
    std::string message(dlerror());
    AssertThat(message, IsEmpty());
    return nullptr;
  }

  void *language_function = dlsym(parser_lib, language_function_name.c_str());
  if (!language_function) {
    std::string message(dlerror());
    AssertThat(message, IsEmpty());
    return nullptr;
  }

  return reinterpret_cast<TSLanguage *(*)()>(language_function)();
}

const TSLanguage *load_test_language(const string &name,
                                     const TSCompileResult &compile_result,
                                     string external_scanner_path) {
  if (compile_result.error_type != TSCompileErrorTypeNone) {
    Assert::Failure(string("Compilation failed ") + compile_result.error_message);
    return nullptr;
  }

  mkdir("out/tmp", 0777);
  string source_filename = "out/tmp/compile-result-" + to_string(compile_result_count) + ".c";
  string lib_filename = source_filename + ".so";
  compile_result_count++;

  ofstream source_file;
  source_file.open(source_filename);
  source_file << compile_result.code;
  source_file.close();

  auto language = load_language(source_filename, lib_filename, name, external_scanner_path);
  free(compile_result.code);
  return language;
}

const TSLanguage *load_real_language(const string &language_name) {
  if (loaded_languages[language_name])
    return loaded_languages[language_name];

  string language_dir = string("test/fixtures/grammars/") + language_name;
  string grammar_filename = language_dir + "/src/grammar.json";
  string parser_filename = language_dir + "/src/parser.c";
  string external_scanner_filename = language_dir + "/src/scanner.cc";
  if (!file_exists(external_scanner_filename)) {
    external_scanner_filename = "";
  }

  int grammar_mtime = get_modified_time(grammar_filename);
  if (!grammar_mtime)
    return nullptr;

  if (libcompiler_mtime == -1) {
    libcompiler_mtime = get_modified_time(libcompiler_path);
    if (!libcompiler_mtime)
      return nullptr;
  }

  int parser_mtime = get_modified_time(parser_filename);

  if (parser_mtime < grammar_mtime || parser_mtime < libcompiler_mtime) {
    printf("\n" "Regenerating the %s parser...\n", language_name.c_str());

    string grammar_json = read_file(grammar_filename);
    TSCompileResult result = ts_compile_grammar(grammar_json.c_str());
    if (result.error_type != TSCompileErrorTypeNone) {
      fprintf(stderr, "Failed to compile %s grammar: %s\n", language_name.c_str(), result.error_message);
      return nullptr;
    }

    write_file(parser_filename, result.code);
  }

  mkdir("out/tmp", 0777);
  string lib_filename = "out/tmp/" + language_name + ".so";
  const TSLanguage *language = load_language(parser_filename, lib_filename, language_name, external_scanner_filename);
  loaded_languages[language_name] = language;
  return language;
}
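The net effect, as a test sees it, is one call that compiles (if stale), dynamically loads, and caches a parser. A hypothetical sketch using the TSDocument runtime API of this era; the grammar name and document functions are assumptions, not part of this commit:

```cpp
// Hypothetical test usage: load (and, if stale, rebuild) the C parser for a
// fixture grammar, then parse a string with it.
#include "helpers/load_language.h"

void example() {
  // "javascript" stands in for whatever directory exists under
  // test/fixtures/grammars/.
  const TSLanguage *language = load_real_language("javascript");

  TSDocument *document = ts_document_new();
  ts_document_set_language(document, language);
  ts_document_set_input_string(document, "a + b * c;");
  ts_document_parse(document);
  // ... assert on ts_document_root_node(document) ...
  ts_document_free(document);
}
```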
14
test/helpers/load_language.h
Normal file
@@ -0,0 +1,14 @@
#ifndef HELPERS_LOAD_LANGUAGE_H_
#define HELPERS_LOAD_LANGUAGE_H_

#include "tree_sitter/compiler.h"
#include "tree_sitter/runtime.h"
#include <string>

const TSLanguage *load_real_language(const std::string &name);

const TSLanguage *load_test_language(const std::string &name,
                                     const TSCompileResult &compile_result,
                                     std::string external_scanner_path = "");

#endif  // HELPERS_LOAD_LANGUAGE_H_
45
test/helpers/point_helpers.cc
Normal file
@@ -0,0 +1,45 @@
#include "./point_helpers.h"
#include <string>
#include <ostream>
#include "runtime/length.h"
#include "tree_sitter/runtime.h"

using namespace std;

bool operator==(const TSPoint &left, const TSPoint &right) {
  return left.row == right.row && left.column == right.column;
}

bool operator==(const TSRange &left, const TSRange &right) {
  return left.start == right.start && left.end == right.end;
}

bool operator==(const Length &left, const Length &right) {
  return left.bytes == right.bytes &&
         left.chars == right.chars &&
         left.extent == right.extent;
}

bool operator<(const TSPoint &left, const TSPoint &right) {
  if (left.row < right.row) return true;
  if (left.row > right.row) return false;

  return left.column < right.column;
}

bool operator>(const TSPoint &left, const TSPoint &right) {
  return right < left;
}

std::ostream &operator<<(std::ostream &stream, const TSPoint &point) {
  return stream << "{" << point.row << ", " << point.column << "}";
}

std::ostream &operator<<(std::ostream &stream, const TSRange &range) {
  return stream << "{" << range.start << ", " << range.end << "}";
}

ostream &operator<<(ostream &stream, const Length &length) {
  return stream << "{chars:" << length.chars << ", bytes:" <<
    length.bytes << ", extent:" << length.extent << "}";
}
23
test/helpers/point_helpers.h
Normal file
@@ -0,0 +1,23 @@
#ifndef HELPERS_POINT_HELPERS_H_
#define HELPERS_POINT_HELPERS_H_

#include "runtime/length.h"
#include <ostream>

bool operator==(const TSPoint &left, const TSPoint &right);

bool operator<(const TSPoint &left, const TSPoint &right);

bool operator>(const TSPoint &left, const TSPoint &right);

bool operator==(const TSRange &left, const TSRange &right);

bool operator==(const Length &left, const Length &right);

std::ostream &operator<<(std::ostream &stream, const TSPoint &point);

std::ostream &operator<<(std::ostream &stream, const TSRange &range);

std::ostream &operator<<(std::ostream &stream, const Length &length);

#endif  // HELPERS_POINT_HELPERS_H_
35
test/helpers/random_helpers.cc
Normal file
@@ -0,0 +1,35 @@
#include <string>
#include <stdlib.h>

using std::string;

static string random_string(char min, char max) {
  string result;
  size_t length = random() % 12;
  for (size_t i = 0; i < length; i++) {
    char inserted_char = min + (random() % (max - min));
    result += inserted_char;
  }
  return result;
}

static string random_char(string characters) {
  size_t index = random() % characters.size();
  return string() + characters[index];
}

string random_words(size_t count) {
  string result;
  bool just_inserted_word = false;
  for (size_t i = 0; i < count; i++) {
    if (random() % 10 < 6) {
      result += random_char("!(){}[]<>+-=");
    } else {
      if (just_inserted_word)
        result += " ";
      result += random_string('a', 'z');
      just_inserted_word = true;
    }
  }
  return result;
}
8
test/helpers/random_helpers.h
Normal file
@@ -0,0 +1,8 @@
#ifndef HELPERS_RANDOM_HELPERS_H_
#define HELPERS_RANDOM_HELPERS_H_

#include <string>

std::string random_words(size_t count);

#endif  // HELPERS_RANDOM_HELPERS_H_
94
test/helpers/read_test_entries.cc
Normal file
@@ -0,0 +1,94 @@
#include "helpers/read_test_entries.h"
#include <assert.h>
#include <string>
#include <regex>
#include "helpers/file_helpers.h"

using std::regex;
using std::regex_search;
using std::regex_replace;
using std::regex_constants::extended;
using std::smatch;
using std::string;
using std::vector;

string fixtures_dir = "test/fixtures/";

static string trim_output(const string &input) {
  string result(input);
  result = regex_replace(result, regex("[\n\t ]+", extended), string(" "));
  result = regex_replace(result, regex("^ ", extended), string(""));
  result = regex_replace(result, regex(" $", extended), string(""));
  result = regex_replace(result, regex("\\) \\)", extended), string("))"));
  return result;
}

static vector<TestEntry> parse_test_entries(string content) {
  regex header_pattern("===+\n" "([^=]+)\n" "===+\n", extended);
  regex separator_pattern("---+\r?\n", extended);
  vector<string> descriptions;
  vector<string> bodies;

  for (;;) {
    smatch matches;
    if (!regex_search(content, matches, header_pattern) || matches.empty())
      break;

    string description = matches[1].str();
    descriptions.push_back(description);

    if (!bodies.empty())
      bodies.back().erase(matches.position());
    content.erase(0, matches.position() + matches[0].length());
    bodies.push_back(content);
  }

  vector<TestEntry> result;
  for (size_t i = 0; i < descriptions.size(); i++) {
    string body = bodies[i];
    smatch matches;
    if (regex_search(body, matches, separator_pattern)) {
      result.push_back({
        descriptions[i],
        body.substr(0, matches.position() - 1),
        trim_output(body.substr(matches.position() + matches[0].length()))
      });
    } else {
      puts(("Invalid corpus entry with description: " + descriptions[i]).c_str());
      abort();
    }
  }

  return result;
}

vector<TestEntry> read_real_language_corpus(string language_name) {
  vector<TestEntry> result;

  string test_directory = fixtures_dir + "grammars/" + language_name + "/grammar_test";
  for (string &test_filename : list_directory(test_directory)) {
    for (TestEntry &entry : parse_test_entries(read_file(test_directory + "/" + test_filename))) {
      result.push_back(entry);
    }
  }

  string error_test_filename = fixtures_dir + "/error_corpus/" + language_name + "_errors.txt";
  for (TestEntry &entry : parse_test_entries(read_file(error_test_filename))) {
    result.push_back(entry);
  }

  return result;
}

vector<TestEntry> read_test_language_corpus(string language_name) {
  vector<TestEntry> result;

  string test_directory = fixtures_dir + "test_grammars/" + language_name;
  for (string &test_filename : list_directory(test_directory)) {
    for (TestEntry &entry : parse_test_entries(read_file(test_directory + "/" + test_filename))) {
      result.push_back(entry);
    }
  }

  return result;
}
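This is the parser for the `===` / `---` corpus format used by the fixture files earlier in this commit. A hypothetical usage sketch tying the two together:

```cpp
// Hypothetical usage: read the corpus for one of the test grammars above.
#include "helpers/read_test_entries.h"

void example() {
  std::vector<TestEntry> entries = read_test_language_corpus("start_rule_is_token");
  // For the corpus.txt shown earlier, this yields one entry:
  //   description: "the single token"
  //   input:       "the-value"
  //   tree_string: "(first_rule)"   (whitespace normalized by trim_output)
}
```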
16
test/helpers/read_test_entries.h
Normal file
@@ -0,0 +1,16 @@
#ifndef HELPERS_READ_TEST_ENTRIES_H_
#define HELPERS_READ_TEST_ENTRIES_H_

#include <string>
#include <vector>

struct TestEntry {
  std::string description;
  std::string input;
  std::string tree_string;
};

std::vector<TestEntry> read_real_language_corpus(std::string name);
std::vector<TestEntry> read_test_language_corpus(std::string name);

#endif  // HELPERS_READ_TEST_ENTRIES_H_
84
test/helpers/record_alloc.cc
Normal file
@@ -0,0 +1,84 @@
#include <stdlib.h>
#include <map>
#include <set>
#include "bandit/bandit.h"

using std::map;
using std::set;

static bool _enabled = false;
static size_t _allocation_count = 0;
static map<void *, size_t> _outstanding_allocations;

namespace record_alloc {

void start() {
  _enabled = true;
  _allocation_count = 0;
  _outstanding_allocations.clear();
}

void stop() {
  _enabled = false;
}

set<size_t> outstanding_allocation_indices() {
  set<size_t> result;
  for (const auto &entry : _outstanding_allocations) {
    result.insert(entry.second);
  }
  return result;
}

size_t allocation_count() {
  return _allocation_count;
}

}  // namespace record_alloc

extern "C" {

static void *record_allocation(void *result) {
  if (!_enabled)
    return result;

  _outstanding_allocations[result] = _allocation_count;
  _allocation_count++;
  return result;
}

static void record_deallocation(void *pointer) {
  if (!_enabled)
    return;

  auto entry = _outstanding_allocations.find(pointer);
  if (entry != _outstanding_allocations.end()) {
    _outstanding_allocations.erase(entry);
  }
}

void *ts_record_malloc(size_t size) {
  return record_allocation(malloc(size));
}

void *ts_record_realloc(void *pointer, size_t size) {
  record_deallocation(pointer);
  return record_allocation(realloc(pointer, size));
}

void *ts_record_calloc(size_t count, size_t size) {
  return record_allocation(calloc(count, size));
}

void ts_record_free(void *pointer) {
  free(pointer);
  record_deallocation(pointer);
}

bool ts_record_allocations_toggle(bool value) {
  bool previous_value = _enabled;
  _enabled = value;
  return previous_value;
}

}
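The intended shape of a leak check built on this, as a hypothetical sketch inside a bandit spec (the code under test must route its allocations through the `ts_record_*` wrappers for the bookkeeping to see them):

```cpp
// Hypothetical leak check: every allocation made between start() and stop()
// must have been freed again by the code under test.
#include "helpers/record_alloc.h"

void check_for_leaks() {
  record_alloc::start();
  // ... exercise code whose allocations go through ts_record_malloc,
  //     ts_record_realloc, ts_record_calloc, and ts_record_free ...
  record_alloc::stop();

  // Any surviving entry's index pinpoints which allocation leaked.
  AssertThat(record_alloc::outstanding_allocation_indices(), IsEmpty());
}
```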
16
test/helpers/record_alloc.h
Normal file
@@ -0,0 +1,16 @@
#ifndef HELPERS_RECORD_ALLOC_H_
#define HELPERS_RECORD_ALLOC_H_

#include <set>

namespace record_alloc {

void start();
void stop();
void fail_at_allocation_index(size_t failure_index);
std::set<size_t> outstanding_allocation_indices();
size_t allocation_count();

}  // namespace record_alloc

#endif  // HELPERS_RECORD_ALLOC_H_
62
test/helpers/rule_helpers.cc
Normal file
@@ -0,0 +1,62 @@
#include "rule_helpers.h"
#include <memory>
#include "compiler/rules/symbol.h"
#include "compiler/variable.h"
#include "compiler/lexical_grammar.h"

namespace tree_sitter {
using std::make_shared;
using std::set;
using std::map;
using std::ostream;
using std::string;
using std::to_string;
using rules::Symbol;

rule_ptr character(const set<uint32_t> &ranges) {
  return character(ranges, true);
}

rule_ptr character(const set<uint32_t> &chars, bool sign) {
  rules::CharacterSet result;
  if (sign) {
    for (uint32_t c : chars)
      result.include(c);
  } else {
    result.include_all();
    for (uint32_t c : chars)
      result.exclude(c);
  }
  return result.copy();
}

rule_ptr i_sym(size_t index) {
  return make_shared<Symbol>(index, Symbol::NonTerminal);
}

rule_ptr i_token(size_t index) {
  return make_shared<Symbol>(index, Symbol::Terminal);
}

rule_ptr metadata(rule_ptr rule, rules::MetadataParams params) {
  return rules::Metadata::build(rule, params);
}

rule_ptr active_prec(int precedence, rule_ptr rule) {
  rules::MetadataParams params;
  params.precedence = precedence;
  params.has_precedence = true;
  params.is_active = true;
  return rules::Metadata::build(rule, params);
}

bool operator==(const Variable &left, const Variable &right) {
  return left.name == right.name && left.rule->operator==(*right.rule) &&
         left.type == right.type;
}

bool operator==(const LexicalVariable &left, const LexicalVariable &right) {
  return left.name == right.name && left.rule->operator==(*right.rule) &&
         left.type == right.type && left.is_string == right.is_string;
}
}
25
test/helpers/rule_helpers.h
Normal file
@@ -0,0 +1,25 @@
#ifndef HELPERS_RULE_HELPERS_H_
#define HELPERS_RULE_HELPERS_H_

#include "tree_sitter/compiler.h"
#include "compiler/rules.h"
#include "compiler/rules/character_set.h"
#include "compiler/rules/metadata.h"
#include "compiler/variable.h"

namespace tree_sitter {
rule_ptr metadata(rule_ptr, rules::MetadataParams params);
rule_ptr character(const std::set<uint32_t> &);
rule_ptr character(const std::set<uint32_t> &, bool sign);
rule_ptr i_sym(size_t index);
rule_ptr i_token(size_t index);
rule_ptr active_prec(int precedence, rule_ptr);

struct Variable;
struct LexicalVariable;

bool operator==(const Variable &left, const Variable &right);
bool operator==(const LexicalVariable &left, const LexicalVariable &right);
}

#endif  // HELPERS_RULE_HELPERS_H_
106
test/helpers/scope_sequence.cc
Normal file
@@ -0,0 +1,106 @@
#include "./scope_sequence.h"

#include "bandit/bandit.h"
#include <sstream>
#include "helpers/stream_methods.h"
#include "helpers/point_helpers.h"

using std::string;
using std::cout;

static void append_text_to_scope_sequence(ScopeSequence *sequence,
                                          ScopeStack *current_scopes,
                                          const std::string &text,
                                          size_t length) {
  for (size_t i = 0; i < length; i++) {
    string character(1, text[sequence->size()]);
    sequence->push_back(*current_scopes);
    sequence->back().push_back("'" + character + "'");
  }
}

static void append_to_scope_sequence(ScopeSequence *sequence,
                                     ScopeStack *current_scopes,
                                     TSNode node, TSDocument *document,
                                     const std::string &text) {
  append_text_to_scope_sequence(
    sequence, current_scopes, text, ts_node_start_byte(node) - sequence->size()
  );

  current_scopes->push_back(ts_node_type(node, document));

  for (size_t i = 0, n = ts_node_child_count(node); i < n; i++) {
    TSNode child = ts_node_child(node, i);
    append_to_scope_sequence(sequence, current_scopes, child, document, text);
  }

  append_text_to_scope_sequence(
    sequence, current_scopes, text, ts_node_end_byte(node) - sequence->size()
  );

  current_scopes->pop_back();
}

ScopeSequence build_scope_sequence(TSDocument *document, const std::string &text) {
  ScopeSequence sequence;
  ScopeStack current_scopes;
  TSNode node = ts_document_root_node(document);
  append_to_scope_sequence(&sequence, &current_scopes, node, document, text);
  return sequence;
}

bool operator<=(const TSPoint &left, const TSPoint &right) {
  if (left.row < right.row)
    return true;
  else if (left.row == right.row)
    return left.column <= right.column;
  else
    return false;
}

void verify_changed_ranges(const ScopeSequence &old_sequence, const ScopeSequence &new_sequence,
                           const string &text, TSRange *ranges, size_t range_count) {
  TSPoint current_position = {0, 0};
  for (size_t i = 0; i < old_sequence.size(); i++) {
    if (text[i] == '\n') {
      current_position.row++;
      current_position.column = 0;
      continue;
    }

    const ScopeStack &old_scopes = old_sequence[i];
    const ScopeStack &new_scopes = new_sequence[i];
    if (old_scopes != new_scopes) {
      bool found_containing_range = false;
      for (size_t j = 0; j < range_count; j++) {
        TSRange range = ranges[j];
        if (range.start <= current_position && current_position <= range.end) {
          found_containing_range = true;
          break;
        }
      }

      if (!found_containing_range) {
        std::stringstream message_stream;
        message_stream << "Found changed scope outside of any invalidated range;\n";
        message_stream << "Position: " << current_position << "\n";
        message_stream << "Byte index: " << i << "\n";
        size_t line_start_index = i - current_position.column;
        size_t line_end_index = text.find_first_of('\n', i);
        message_stream << "Line: " << text.substr(line_start_index, line_end_index - line_start_index) << "\n";
        for (size_t j = 0; j < current_position.column + string("Line: ").size(); j++)
          message_stream << " ";
        message_stream << "^\n";
        message_stream << "Old scopes: " << old_scopes << "\n";
        message_stream << "New scopes: " << new_scopes << "\n";
        message_stream << "Invalidated ranges:\n";
        for (size_t j = 0; j < range_count; j++) {
          message_stream << "  " << ranges[j] << "\n";
        }
        Assert::Failure(message_stream.str());
      }
    }

    current_position.column++;
  }
}
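The intended flow, as a hypothetical sketch (the `document`, `text`, `edited_text`, `changed_ranges`, and `range_count` variables are assumed to be set up by the surrounding test, which obtains the changed ranges from the parser after an incremental re-parse):

```cpp
// Hypothetical usage: snapshot the scope stack at every byte before an edit,
// re-parse, and verify that every scope change falls inside a range the
// parser reported as invalidated.
ScopeSequence old_sequence = build_scope_sequence(document, text);

// ... apply an edit to the input, notify the document, re-parse, and
//     collect the TSRanges the parser reports as changed ...

ScopeSequence new_sequence = build_scope_sequence(document, edited_text);
verify_changed_ranges(old_sequence, new_sequence, edited_text,
                      changed_ranges, range_count);
```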
16
test/helpers/scope_sequence.h
Normal file
@@ -0,0 +1,16 @@
#ifndef HELPERS_SCOPE_SEQUENCE_H_
#define HELPERS_SCOPE_SEQUENCE_H_

#include <string>
#include <vector>
#include "tree_sitter/runtime.h"

typedef std::string Scope;
typedef std::vector<Scope> ScopeStack;
typedef std::vector<ScopeStack> ScopeSequence;

ScopeSequence build_scope_sequence(TSDocument *document, const std::string &text);

void verify_changed_ranges(const ScopeSequence &old, const ScopeSequence &new_sequence, const std::string &text, TSRange *ranges, size_t range_count);

#endif  // HELPERS_SCOPE_SEQUENCE_H_
133
test/helpers/spy_input.cc
Normal file
@@ -0,0 +1,133 @@
#include "helpers/spy_input.h"
#include "helpers/encoding_helpers.h"
#include <string.h>
#include <algorithm>
#include <assert.h>

using std::pair;
using std::string;

static const size_t UTF8_MAX_CHAR_SIZE = 4;

SpyInput::SpyInput(string content, size_t chars_per_chunk) :
  chars_per_chunk(chars_per_chunk),
  buffer_size(UTF8_MAX_CHAR_SIZE * chars_per_chunk),
  buffer(new char[buffer_size]),
  byte_offset(0),
  content(content),
  encoding(TSInputEncodingUTF8),
  strings_read({""}) {}

SpyInput::~SpyInput() {
  delete[] buffer;
}

const char * SpyInput::read(void *payload, uint32_t *bytes_read) {
  auto spy = static_cast<SpyInput *>(payload);

  if (spy->byte_offset > spy->content.size()) {
    *bytes_read = 0;
    return "";
  }

  long byte_count = string_byte_for_character(spy->encoding, spy->content, spy->byte_offset, spy->chars_per_chunk);
  if (byte_count < 0)
    byte_count = spy->content.size() - spy->byte_offset;

  string result = spy->content.substr(spy->byte_offset, byte_count);
  *bytes_read = byte_count;
  spy->strings_read.back() += result;
  spy->byte_offset += byte_count;

  /*
   * This class stores its entire `content` in a contiguous buffer, but we want
   * to ensure that the code under test cannot accidentally read more than
   * `*bytes_read` bytes past the returned pointer. To make sure that this type
   * of error does not fly, we copy the chunk into a zeroed-out buffer and
   * return a reference to that buffer, rather than a pointer into the main
   * content.
   */
  memset(spy->buffer, 0, spy->buffer_size);
  memcpy(spy->buffer, result.data(), byte_count);
  return spy->buffer;
}

int SpyInput::seek(void *payload, uint32_t character, uint32_t byte) {
  auto spy = static_cast<SpyInput *>(payload);
  if (spy->strings_read.size() == 0 || spy->strings_read.back().size() > 0)
    spy->strings_read.push_back("");
  spy->byte_offset = byte;
  return 0;
}

TSInput SpyInput::input() {
  TSInput result;
  result.payload = this;
  result.encoding = encoding;
  result.seek = seek;
  result.read = read;
  return result;
}

static TSPoint get_extent(string text) {
  TSPoint result = {0, 0};
  for (auto i = text.begin(); i != text.end(); i++) {
    if (*i == '\n') {
      result.row++;
      result.column = 0;
    } else {
      result.column++;
    }
  }
  return result;
}

TSInputEdit SpyInput::replace(size_t start_byte, size_t bytes_removed, string text) {
  auto swap = swap_substr(start_byte, bytes_removed, text);
  size_t bytes_added = text.size();
  undo_stack.push_back(SpyInputEdit{start_byte, bytes_added, swap.first});
  TSInputEdit result = {};
  result.start_byte = start_byte;
  result.bytes_added = bytes_added;
  result.bytes_removed = bytes_removed;
  result.start_point = swap.second;
  result.extent_removed = get_extent(swap.first);
  result.extent_added = get_extent(text);
  return result;
}

TSInputEdit SpyInput::undo() {
  SpyInputEdit entry = undo_stack.back();
  undo_stack.pop_back();
  auto swap = swap_substr(entry.start_byte, entry.bytes_removed, entry.text_inserted);
  TSInputEdit result;
  result.start_byte = entry.start_byte;
  result.bytes_removed = entry.bytes_removed;
  result.bytes_added = entry.text_inserted.size();
  result.start_point = swap.second;
  result.extent_removed = get_extent(swap.first);
  result.extent_added = get_extent(entry.text_inserted);
  return result;
}

pair<string, TSPoint> SpyInput::swap_substr(size_t start_byte, size_t bytes_removed, string text) {
  TSPoint start_position = {0, 0};
  for (auto i = content.begin(), n = content.begin() + start_byte; i < n; i++) {
    if (*i == '\n') {
      start_position.row++;
      start_position.column = 0;
    } else {
      start_position.column++;
    }
  }

  string text_removed = content.substr(start_byte, bytes_removed);
  content.erase(start_byte, bytes_removed);
  content.insert(start_byte, text);

  return {text_removed, start_position};
}

void SpyInput::clear() {
  strings_read.clear();
}
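A hypothetical sketch of the edit/undo cycle, which is the part worth spelling out: `replace` records the removed text on an undo stack, so `undo` can replay the inverse edit and hand back a matching `TSInputEdit`.

```cpp
// Hypothetical usage: feed a parser 3 characters per read, edit, then undo.
SpyInput input("abcdef", 3);

// Replace the 2 bytes "cd" at offset 2 with "xyz". The returned TSInputEdit
// describes the change in bytes and extents, ready to hand to the parser.
TSInputEdit edit = input.replace(2, 2, "xyz");
// input.content is now "abxyzef"

// undo() pops the recorded edit and applies its inverse.
TSInputEdit inverse = input.undo();
// input.content is back to "abcdef"
```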
39
test/helpers/spy_input.h
Normal file
@@ -0,0 +1,39 @@
#ifndef HELPERS_SPY_INPUT_H_
#define HELPERS_SPY_INPUT_H_

#include <string>
#include <vector>
#include "tree_sitter/runtime.h"

struct SpyInputEdit {
  size_t start_byte;
  size_t bytes_removed;
  std::string text_inserted;
};

class SpyInput {
  uint32_t chars_per_chunk;
  uint32_t buffer_size;
  char *buffer;
  uint32_t byte_offset;
  std::vector<SpyInputEdit> undo_stack;

  static const char * read(void *, uint32_t *);
  static int seek(void *, uint32_t, uint32_t);
  std::pair<std::string, TSPoint> swap_substr(size_t, size_t, std::string);

 public:
  SpyInput(std::string content, size_t chars_per_chunk);
  ~SpyInput();

  TSInput input();
  void clear();
  TSInputEdit replace(size_t start_char, size_t chars_removed, std::string text);
  TSInputEdit undo();

  std::string content;
  TSInputEncoding encoding;
  std::vector<std::string> strings_read;
};

#endif  // HELPERS_SPY_INPUT_H_
22
test/helpers/spy_logger.cc
Normal file
@@ -0,0 +1,22 @@
#include "helpers/spy_logger.h"
#include <string>
#include <vector>

using std::string;
using std::vector;

static void spy_log(void *data, TSLogType type, const char *msg) {
  SpyLogger *logger = static_cast<SpyLogger *>(data);
  logger->messages.push_back(msg);
}

TSLogger SpyLogger::logger() {
  TSLogger result;
  result.payload = (void *)this;
  result.log = spy_log;
  return result;
}

void SpyLogger::clear() {
  messages.clear();
}
15
test/helpers/spy_logger.h
Normal file
@@ -0,0 +1,15 @@
#ifndef HELPERS_SPY_LOGGER_H_
#define HELPERS_SPY_LOGGER_H_

#include <string>
#include <vector>
#include "tree_sitter/runtime.h"

class SpyLogger {
 public:
  void clear();
  TSLogger logger();
  std::vector<std::string> messages;
};

#endif  // HELPERS_SPY_LOGGER_H_
22
test/helpers/stderr_logger.cc
Normal file
@@ -0,0 +1,22 @@
#include "tree_sitter/runtime.h"
#include <stdio.h>

static void log(void *payload, TSLogType type, const char *msg) {
  bool include_lexing = (bool)payload;
  switch (type) {
    case TSLogTypeParse:
      fprintf(stderr, "* %s\n", msg);
      break;
    case TSLogTypeLex:
      if (include_lexing)
        fprintf(stderr, "  %s\n", msg);
      break;
  }
}

TSLogger stderr_logger_new(bool include_lexing) {
  TSLogger result;
  result.payload = (void *)include_lexing;
  result.log = log;
  return result;
}
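A hypothetical sketch of how this gets wired up while debugging a test; the `ts_document_set_logger` call is the document-API function of this era and is an assumption here, not part of this commit:

```cpp
// Hypothetical usage: stream the parser's log to stderr during a test run,
// with lexer messages included (pass false to see only parse actions).
ts_document_set_logger(document, stderr_logger_new(true));
ts_document_parse(document);
```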
8
test/helpers/stderr_logger.h
Normal file
@@ -0,0 +1,8 @@
#ifndef HELPERS_STDERR_LOGGER_H_
#define HELPERS_STDERR_LOGGER_H_

#include "tree_sitter/runtime.h"

TSLogger stderr_logger_new(bool include_lexing);

#endif  // HELPERS_STDERR_LOGGER_H_
146
test/helpers/stream_methods.cc
Normal file
146
test/helpers/stream_methods.cc
Normal file
|
|
@ -0,0 +1,146 @@
#include "helpers/stream_methods.h"
#include "test_helper.h"
#include "tree_sitter/compiler.h"
#include "compiler/parse_table.h"
#include "compiler/syntax_grammar.h"
#include "compiler/lexical_grammar.h"
#include "compiler/build_tables/parse_item.h"
#include "compiler/build_tables/lex_item.h"

namespace tree_sitter {

ostream &operator<<(ostream &stream, const Grammar &grammar) {
  stream << string("#<grammar");
  stream << " rules: " << grammar.rules;
  return stream << string("}>");
}

ostream &operator<<(ostream &stream, const CompileError &error) {
  if (error.type)
    return stream << (string("#<compile-error '") + error.message + "'>");
  else
    return stream << string("#<no-compile-error>");
}

ostream &operator<<(ostream &stream, const Rule &rule) {
  return stream << rule.to_string();
}

ostream &operator<<(ostream &stream, const rule_ptr &rule) {
  if (rule.get())
    stream << *rule;
  else
    stream << string("(null-rule)");
  return stream;
}

ostream &operator<<(ostream &stream, const Variable &variable) {
  return stream << string("{") << variable.name << string(", ") << variable.rule << string(", ") << to_string(variable.type) << string("}");
}

ostream &operator<<(ostream &stream, const SyntaxVariable &variable) {
  return stream << string("{") << variable.name << string(", ") << variable.productions << string(", ") << to_string(variable.type) << string("}");
}

ostream &operator<<(ostream &stream, const LexicalVariable &variable) {
  return stream << "{" << variable.name << ", " << variable.rule << ", " <<
    to_string(variable.type) << ", " << to_string(variable.is_string) << "}";
}

std::ostream &operator<<(std::ostream &stream, const AdvanceAction &action) {
  return stream << string("#<advance ") + to_string(action.state_index) + ">";
}

std::ostream &operator<<(std::ostream &stream, const AcceptTokenAction &action) {
  return stream << string("#<accept ") + to_string(action.symbol.index) + ">";
}

ostream &operator<<(ostream &stream, const ParseAction &action) {
  switch (action.type) {
    case ParseActionTypeError:
      return stream << string("#<error>");
    case ParseActionTypeAccept:
      return stream << string("#<accept>");
    case ParseActionTypeShift:
      return stream << string("#<shift state:") << to_string(action.state_index) << ">";
    case ParseActionTypeReduce:
      return stream << ("#<reduce sym" + to_string(action.symbol.index) + " " +
                        to_string(action.consumed_symbol_count) + ">");
    default:
      return stream;
  }
}

ostream &operator<<(ostream &stream, const ParseTableEntry &entry) {
  return stream << entry.actions;
}

ostream &operator<<(ostream &stream, const ParseState &state) {
  stream << string("#<parse_state terminal_entries:");
  stream << state.terminal_entries;
  stream << " nonterminal_entries: " << state.nonterminal_entries;
  return stream << string(">");
}

ostream &operator<<(ostream &stream, const ExternalToken &external_token) {
  return stream << "{" << external_token.name << ", " << external_token.type <<
    "," << external_token.corresponding_internal_token << "}";
}

ostream &operator<<(ostream &stream, const ProductionStep &step) {
  stream << "(symbol: " << step.symbol << ", precedence:" << to_string(step.precedence);
  stream << ", associativity: ";
  switch (step.associativity) {
    case rules::AssociativityLeft:
      return stream << "left)";
    case rules::AssociativityRight:
      return stream << "right)";
    default:
      return stream << "none)";
  }
}

ostream &operator<<(ostream &stream, const PrecedenceRange &range) {
  if (range.empty)
    return stream << string("{empty}");
  else
    return stream << string("{") << to_string(range.min) << string(", ") << to_string(range.max) << string("}");
}

namespace build_tables {

ostream &operator<<(ostream &stream, const LexItem &item) {
  return stream << string("(item ") << item.lhs << string(" ") << *item.rule
                << string(")");
}

ostream &operator<<(ostream &stream, const LexItemSet &item_set) {
  return stream << item_set.entries;
}

ostream &operator<<(ostream &stream, const LexItemSet::Transition &transition) {
  return stream << "{dest: " << transition.destination << ", prec: " << transition.precedence << "}";
}

ostream &operator<<(ostream &stream, const ParseItem &item) {
  return stream << string("(item variable:") << to_string(item.variable_index)
                << string(" production:") << to_string((size_t)item.production % 1000)
                << string(" step:") << to_string(item.step_index)
                << string(")");
}

std::ostream &operator<<(std::ostream &stream, const ParseItemSet &item_set) {
  return stream << item_set.entries;
}

std::ostream &operator<<(std::ostream &stream, const LookaheadSet &set) {
  if (set.entries.get()) {
    return stream << *set.entries;
  } else {
    return stream << "{}";
  }
}

}  // namespace build_tables

}  // namespace tree_sitter
138
test/helpers/stream_methods.h
Normal file
@ -0,0 +1,138 @@
#ifndef HELPERS_STREAM_METHODS_H_
#define HELPERS_STREAM_METHODS_H_

#include <iostream>
#include <set>
#include <unordered_map>
#include <map>
#include <unordered_set>
#include <vector>
#include "compiler/grammar.h"
#include "compiler/compile_error.h"
#include "compiler/build_tables/lex_item.h"

using std::cout;

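// Generic operator<< overloads for standard containers, so Bandit assertion
// failures can print vectors, sets, maps, and pairs readably.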
namespace std {

template<typename T>
inline std::ostream& operator<<(std::ostream &stream, const std::vector<T> &vector) {
  stream << std::string("(vector: ");
  bool started = false;
  for (auto item : vector) {
    if (started) stream << std::string(", ");
    stream << item;
    started = true;
  }
  return stream << ")";
}

template<typename T>
inline std::ostream& operator<<(std::ostream &stream, const std::set<T> &set) {
  stream << std::string("(set: ");
  bool started = false;
  for (auto item : set) {
    if (started) stream << std::string(", ");
    stream << item;
    started = true;
  }
  return stream << ")";
}

template<typename T, typename H, typename E>
inline std::ostream& operator<<(std::ostream &stream, const std::unordered_set<T, H, E> &set) {
  stream << std::string("(set: ");
  bool started = false;
  for (auto item : set) {
    if (started) stream << std::string(", ");
    stream << item;
    started = true;
  }
  return stream << ")";
}

template<typename TKey, typename TValue>
inline std::ostream& operator<<(std::ostream &stream, const std::map<TKey, TValue> &map) {
  stream << std::string("(map: ");
  bool started = false;
  for (auto pair : map) {
    if (started) stream << std::string(", ");
    stream << pair.first;
    stream << std::string(" => ");
    stream << pair.second;
    started = true;
  }
  return stream << ")";
}

template<typename TKey, typename TValue>
inline std::ostream& operator<<(std::ostream &stream, const std::unordered_map<TKey, TValue> &map) {
  stream << std::string("(map: ");
  bool started = false;
  for (auto pair : map) {
    if (started) stream << std::string(", ");
    stream << pair.first;
    stream << std::string(" => ");
    stream << pair.second;
    started = true;
  }
  return stream << ")";
}

template<typename T1, typename T2>
inline std::ostream& operator<<(std::ostream &stream, const std::pair<T1, T2> &pair) {
  return stream << "{" << pair.first << ", " << pair.second << "}";
}

}  // namespace std

namespace tree_sitter {

using std::ostream;
using std::string;
using std::to_string;
struct Variable;
struct SyntaxVariable;
struct LexicalVariable;
struct AdvanceAction;
struct AcceptTokenAction;
struct ParseAction;
struct ParseState;
struct ExternalToken;
struct ProductionStep;
struct PrecedenceRange;

ostream &operator<<(ostream &, const Grammar &);
ostream &operator<<(ostream &, const CompileError &);
ostream &operator<<(ostream &, const Rule &);
ostream &operator<<(ostream &, const rule_ptr &);
ostream &operator<<(ostream &, const Variable &);
ostream &operator<<(ostream &, const SyntaxVariable &);
ostream &operator<<(ostream &, const LexicalVariable &);
ostream &operator<<(ostream &, const AdvanceAction &);
ostream &operator<<(ostream &, const AcceptTokenAction &);
ostream &operator<<(ostream &, const ParseAction &);
ostream &operator<<(ostream &, const ParseState &);
ostream &operator<<(ostream &, const ExternalToken &);
ostream &operator<<(ostream &, const ProductionStep &);
ostream &operator<<(ostream &, const PrecedenceRange &);

namespace build_tables {

class LexItem;
class LexItemSet;
struct ParseItem;
struct ParseItemSet;
class LookaheadSet;

ostream &operator<<(ostream &, const LexItem &);
ostream &operator<<(ostream &, const LexItemSet &);
ostream &operator<<(ostream &, const LexItemSet::Transition &);
ostream &operator<<(ostream &, const ParseItem &);
ostream &operator<<(ostream &, const ParseItemSet &);
ostream &operator<<(ostream &, const LookaheadSet &);

}  // namespace build_tables
}  // namespace tree_sitter

#endif  // HELPERS_STREAM_METHODS_H_
50
test/helpers/tree_helpers.cc
Normal file
@ -0,0 +1,50 @@
#include "helpers/tree_helpers.h"
#include "runtime/document.h"
#include "runtime/node.h"
#include <ostream>

using std::string;
using std::to_string;
using std::ostream;

const char *symbol_names[24] = {
  "ERROR", "END", "two", "three", "four", "five", "six", "seven", "eight",
  "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
  "sixteen", "seventeen", "eighteen", "nineteen", "twenty", "twenty-one",
  "twenty-two", "twenty-three"
};

Tree ** tree_array(std::vector<Tree *> trees) {
  Tree ** result = (Tree **)calloc(trees.size(), sizeof(Tree *));
  for (size_t i = 0; i < trees.size(); i++)
    result[i] = trees[i];
  return result;
}

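// Prints a Tree without a full parse: a static dummy TSDocument/TSLanguage
// pair supplies the symbol names that ts_node_string needs to render nodes.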
ostream &operator<<(std::ostream &stream, const Tree *tree) {
  static TSLanguage DUMMY_LANGUAGE = {};
  static TSDocument DUMMY_DOCUMENT = {};
  DUMMY_DOCUMENT.parser.language = &DUMMY_LANGUAGE;
  DUMMY_LANGUAGE.symbol_names = symbol_names;
  TSNode node;
  node.data = tree;
  return stream << string(ts_node_string(node, &DUMMY_DOCUMENT));
}

ostream &operator<<(ostream &stream, const TSNode &node) {
  return stream << string("{") << (const Tree *)node.data <<
    string(", ") << to_string(ts_node_start_char(node)) << string("}");
}

bool operator==(const TSNode &left, const TSNode &right) {
  return ts_node_eq(left, right);
}

bool operator==(const std::vector<Tree *> &vec, const TreeArray &array) {
  if (vec.size() != array.size)
    return false;
  for (size_t i = 0; i < array.size; i++)
    if (array.contents[i] != vec[i])
      return false;
  return true;
}
16
test/helpers/tree_helpers.h
Normal file
@ -0,0 +1,16 @@
#ifndef HELPERS_TREE_HELPERS_H_
#define HELPERS_TREE_HELPERS_H_

#include "runtime/tree.h"
#include <vector>
#include <string>

extern const char *symbol_names[24];
Tree ** tree_array(std::vector<Tree *> trees);

std::ostream &operator<<(std::ostream &stream, const Tree *tree);
std::ostream &operator<<(std::ostream &stream, const TSNode &node);
bool operator==(const TSNode &left, const TSNode &right);
bool operator==(const std::vector<Tree *> &right, const TreeArray &array);

#endif  // HELPERS_TREE_HELPERS_H_
181
test/integration/real_grammars.cc
Normal file
@ -0,0 +1,181 @@
#include "test_helper.h"
#include "runtime/alloc.h"
#include "helpers/load_language.h"
#include "helpers/read_test_entries.h"
#include "helpers/spy_input.h"
#include "helpers/stderr_logger.h"
#include "helpers/point_helpers.h"
#include "helpers/encoding_helpers.h"
#include "helpers/record_alloc.h"
#include "helpers/random_helpers.h"
#include "helpers/scope_sequence.h"
#include <set>

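// Walks the tree recursively, checking that every node's byte and point
// ranges are ordered, contain all of its children in order, and that a
// child's has-changes flag propagates to its parent.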
static void assert_consistent_sizes(TSNode node) {
  size_t child_count = ts_node_child_count(node);
  size_t start_byte = ts_node_start_byte(node);
  size_t end_byte = ts_node_end_byte(node);
  TSPoint start_point = ts_node_start_point(node);
  TSPoint end_point = ts_node_end_point(node);
  bool some_child_has_changes = false;

  AssertThat(start_byte, !IsGreaterThan(end_byte));
  AssertThat(start_point, !IsGreaterThan(end_point));

  size_t last_child_end_byte = start_byte;
  TSPoint last_child_end_point = start_point;

  for (size_t i = 0; i < child_count; i++) {
    TSNode child = ts_node_child(node, i);
    size_t child_start_byte = ts_node_start_byte(child);
    TSPoint child_start_point = ts_node_start_point(child);

    AssertThat(child_start_byte, !IsLessThan(last_child_end_byte));
    AssertThat(child_start_point, !IsLessThan(last_child_end_point));
    assert_consistent_sizes(child);
    if (ts_node_has_changes(child))
      some_child_has_changes = true;

    last_child_end_byte = ts_node_end_byte(child);
    last_child_end_point = ts_node_end_point(child);
  }

  if (child_count > 0) {
    AssertThat(end_byte, !IsLessThan(last_child_end_byte));
    AssertThat(end_point, !IsLessThan(last_child_end_point));
  }

  if (some_child_has_changes) {
    AssertThat(ts_node_has_changes(node), IsTrue());
  }
}

static void assert_correct_tree_size(TSDocument *document, string content) {
  TSNode root_node = ts_document_root_node(document);
  size_t expected_size = content.size();

  // In the JSON grammar, the start rule (`_value`) is hidden, so the node
  // returned from `ts_document_root_node` (e.g. an `object` node), does not
  // actually point to the root of the tree. In this weird case, trailing
  // whitespace is not included in the root node's size.
  //
  // TODO: Fix this inconsistency. Maybe disallow the start rule being hidden?
  if (ts_document_language(document) == load_real_language("json") &&
      string(ts_node_type(root_node, document)) != "ERROR")
    expected_size = content.find_last_not_of("\n ") + 1;

  AssertThat(ts_node_end_byte(root_node), Equals(expected_size));
  assert_consistent_sizes(root_node);
}

START_TEST

vector<string> test_languages({
  "javascript",
  "json",
  "c",
  "cpp",
  "python",
});

for (auto &language_name : test_languages) {
  describe(("the " + language_name + " language").c_str(), [&]() {
    TSDocument *document;

    before_each([&]() {
      record_alloc::start();
      document = ts_document_new();
      ts_document_set_language(document, load_real_language(language_name));

      // ts_document_set_logger(document, stderr_logger_new(true));
      // ts_document_print_debugging_graphs(document, true);
    });

    after_each([&]() {
      ts_document_free(document);
      AssertThat(record_alloc::outstanding_allocation_indices(), IsEmpty());
    });

    for (auto &entry : read_real_language_corpus(language_name)) {
      SpyInput *input;

      auto it_handles_edit_sequence = [&](string name, std::function<void()> edit_sequence) {
        it(("parses " + entry.description + ": " + name).c_str(), [&]() {
          input = new SpyInput(entry.input, 3);
          ts_document_set_input(document, input->input());
          edit_sequence();

          TSNode root_node = ts_document_root_node(document);
          const char *node_string = ts_node_string(root_node, document);
          string result(node_string);
          ts_free((void *)node_string);
          AssertThat(result, Equals(entry.tree_string));

          assert_correct_tree_size(document, input->content);
          delete input;
        });
      };

      it_handles_edit_sequence("initial parse", [&]() {
        ts_document_parse(document);
      });

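      // Fuzz incremental parsing: generate up to 60 random insertions and
      // deletions (deduplicated via the sets below), and exercise each one as
      // an edit/undo cycle with changed-range verification.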
      std::set<std::pair<size_t, size_t>> deletions;
      std::set<std::pair<size_t, string>> insertions;

      for (size_t i = 0; i < 60; i++) {
        size_t edit_position = random() % utf8_char_count(entry.input);
        size_t deletion_size = random() % (utf8_char_count(entry.input) - edit_position);
        string inserted_text = random_words(random() % 4 + 1);

        if (insertions.insert({edit_position, inserted_text}).second) {
          string description = "\"" + inserted_text + "\" at " + to_string(edit_position);

          it_handles_edit_sequence("repairing an insertion of " + description, [&]() {
            ts_document_edit(document, input->replace(edit_position, 0, inserted_text));
            ts_document_parse(document);
            assert_correct_tree_size(document, input->content);

            ts_document_edit(document, input->undo());
            assert_correct_tree_size(document, input->content);

            TSRange *ranges;
            uint32_t range_count;
            ScopeSequence old_scope_sequence = build_scope_sequence(document, input->content);
            ts_document_parse_and_get_changed_ranges(document, &ranges, &range_count);

            ScopeSequence new_scope_sequence = build_scope_sequence(document, input->content);
            verify_changed_ranges(old_scope_sequence, new_scope_sequence,
                                  input->content, ranges, range_count);
            ts_free(ranges);
          });
        }

        if (deletions.insert({edit_position, deletion_size}).second) {
          string description = to_string(edit_position) + "-" + to_string(edit_position + deletion_size);

          it_handles_edit_sequence("repairing a deletion of " + description, [&]() {
            ts_document_edit(document, input->replace(edit_position, deletion_size, ""));
            ts_document_parse(document);
            assert_correct_tree_size(document, input->content);

            ts_document_edit(document, input->undo());
            assert_correct_tree_size(document, input->content);

            TSRange *ranges;
            uint32_t range_count;
            ScopeSequence old_scope_sequence = build_scope_sequence(document, input->content);
            ts_document_parse_and_get_changed_ranges(document, &ranges, &range_count);

            ScopeSequence new_scope_sequence = build_scope_sequence(document, input->content);
            verify_changed_ranges(old_scope_sequence, new_scope_sequence,
                                  input->content, ranges, range_count);
            ts_free(ranges);
          });
        }
      }
    }
  });
}

END_TEST
78
test/integration/test_grammars.cc
Normal file
@ -0,0 +1,78 @@
#include "test_helper.h"
#include "helpers/read_test_entries.h"
#include "helpers/load_language.h"
#include "helpers/stderr_logger.h"
#include "helpers/file_helpers.h"
#include "runtime/alloc.h"

START_TEST

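// Each subdirectory of test_grammars supplies a grammar.json, optionally a
// scanner.c for an external scanner, and either a corpus.txt of examples or
// an expected_error.txt describing the expected compile failure.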
string grammars_dir_path = "test/fixtures/test_grammars";
vector<string> test_languages = list_directory(grammars_dir_path);

for (auto &language_name : test_languages) {
  if (language_name == "readme.md") continue;

  describe(("test language: " + language_name).c_str(), [&]() {
    string directory_path = grammars_dir_path + "/" + language_name;
    string grammar_path = directory_path + "/grammar.json";
    string external_scanner_path = directory_path + "/scanner.c";
    string expected_error_path = directory_path + "/expected_error.txt";
    string corpus_path = directory_path + "/corpus.txt";

    if (!file_exists(external_scanner_path)) {
      external_scanner_path = "";
    }

    string grammar_json = read_file(grammar_path);
    TSCompileResult compile_result = ts_compile_grammar(grammar_json.c_str());

    if (file_exists(expected_error_path)) {
      it("fails with the correct error message", [&]() {
        string expected_error = read_file(expected_error_path);
        AssertThat((void *)compile_result.error_message, !IsNull());
        AssertThat(compile_result.error_message, Equals(expected_error));
      });

      return;
    } else {
      TSDocument *document = nullptr;
      const TSLanguage *language = nullptr;

      before_each([&]() {
        if (!language) {
          language = load_test_language(
            language_name,
            compile_result,
            external_scanner_path
          );
        }

        document = ts_document_new();
        ts_document_set_language(document, language);

        // ts_document_set_logger(document, stderr_logger_new(true));
        // ts_document_print_debugging_graphs(document, true);
      });

      after_each([&]() {
        if (document) ts_document_free(document);
      });

      for (auto &entry : read_test_language_corpus(language_name)) {
        it(("parses " + entry.description).c_str(), [&]() {
          ts_document_set_input_string_with_length(document, entry.input.c_str(), entry.input.size());
          ts_document_parse(document);

          TSNode root_node = ts_document_root_node(document);
          const char *node_string = ts_node_string(root_node, document);
          string result(node_string);
          ts_free((void *)node_string);
          AssertThat(result, Equals(entry.tree_string));
        });
      }
    }
  });
}

END_TEST
372
test/runtime/document_test.cc
Normal file
@ -0,0 +1,372 @@
#include "test_helper.h"
#include "runtime/alloc.h"
#include "helpers/record_alloc.h"
#include "helpers/stream_methods.h"
#include "helpers/tree_helpers.h"
#include "helpers/point_helpers.h"
#include "helpers/spy_logger.h"
#include "helpers/stderr_logger.h"
#include "helpers/spy_input.h"
#include "helpers/load_language.h"

TSPoint point(size_t row, size_t column) {
  return TSPoint{static_cast<uint32_t>(row), static_cast<uint32_t>(column)};
}

START_TEST

describe("Document", [&]() {
  TSDocument *document;
  TSNode root;

  before_each([&]() {
    record_alloc::start();
    document = ts_document_new();
  });

  after_each([&]() {
    ts_document_free(document);
    record_alloc::stop();
    AssertThat(record_alloc::outstanding_allocation_indices(), IsEmpty());
  });

  auto assert_node_string_equals = [&](TSNode node, const string &expected) {
    char *str = ts_node_string(node, document);
    string actual(str);
    ts_free(str);
    AssertThat(actual, Equals(expected));
  };

  describe("set_input(input)", [&]() {
    SpyInput *spy_input;

    before_each([&]() {
      spy_input = new SpyInput("{\"key\": [null, 2]}", 3);

      ts_document_set_language(document, load_real_language("json"));
      ts_document_set_input_string(document, "{\"key\": [1, 2]}");
      ts_document_parse(document);

      root = ts_document_root_node(document);
      assert_node_string_equals(
        root,
        "(object (pair (string) (array (number) (number))))");
    });

    after_each([&]() {
      delete spy_input;
    });

    it("handles both UTF8 and UTF16 encodings", [&]() {
      const char16_t content[] = u"[true, false]";
      spy_input->content = string((const char *)content, sizeof(content));
      spy_input->encoding = TSInputEncodingUTF16;

      ts_document_set_input(document, spy_input->input());
      ts_document_invalidate(document);
      ts_document_parse(document);

      root = ts_document_root_node(document);
      assert_node_string_equals(
        root,
        "(array (true) (false))");
    });

    it("allows columns to be measured in either bytes or characters", [&]() {
      const char16_t content[] = u"[true, false]";
      spy_input->content = string((const char *)content, sizeof(content));
      spy_input->encoding = TSInputEncodingUTF16;
      // spy_input->measure_columns_in_bytes

      ts_document_set_input(document, spy_input->input());
      ts_document_invalidate(document);
      ts_document_parse(document);
    });

    it("allows the input to be retrieved later", [&]() {
      ts_document_set_input(document, spy_input->input());
      AssertThat(ts_document_input(document).payload, Equals<void *>(spy_input));
      AssertThat(ts_document_input(document).read, Equals(spy_input->input().read));
      AssertThat(ts_document_input(document).seek, Equals(spy_input->input().seek));
    });

    it("does not assume that the document's text has changed", [&]() {
      ts_document_set_input(document, spy_input->input());
      AssertThat(ts_document_root_node(document), Equals<TSNode>(root));
      AssertThat(ts_node_has_changes(root), IsFalse());
      AssertThat(spy_input->strings_read, Equals(vector<string>({ "" })));
    });

    it("reads text from the new input for future parses", [&]() {
      ts_document_set_input(document, spy_input->input());

      // Insert 'null', delete '1'.
      TSInputEdit edit = {};
      edit.start_point.column = edit.start_byte = strlen("{\"key\": [");
      edit.extent_added.column = edit.bytes_added = 4;
      edit.extent_removed.column = edit.bytes_removed = 1;

      ts_document_edit(document, edit);
      ts_document_parse(document);

      TSNode new_root = ts_document_root_node(document);
      assert_node_string_equals(
        new_root,
        "(object (pair (string) (array (null) (number))))");
      AssertThat(spy_input->strings_read, Equals(vector<string>({" [null, 2" })));
    });

    it("allows setting input string with length", [&]() {
      const char content[] = { '1' };
      ts_document_set_input_string_with_length(document, content, 1);
      ts_document_parse(document);
      TSNode new_root = ts_document_root_node(document);
      AssertThat(ts_node_end_char(new_root), Equals<size_t>(1));
      assert_node_string_equals(
        new_root,
        "(number)");
    });

    it("reads from the new input correctly when the old input was blank", [&]() {
      ts_document_set_input_string(document, "");
      ts_document_parse(document);
      TSNode new_root = ts_document_root_node(document);
      AssertThat(ts_node_end_char(new_root), Equals<size_t>(0));
      assert_node_string_equals(
        new_root,
        "(ERROR)");

      ts_document_set_input_string(document, "1");
      ts_document_parse(document);
      new_root = ts_document_root_node(document);
      AssertThat(ts_node_end_char(new_root), Equals<size_t>(1));
      assert_node_string_equals(
        new_root,
        "(number)");
    });
  });

  describe("set_language(language)", [&]() {
    before_each([&]() {
      ts_document_set_input_string(document, "{\"key\": [1, 2]}\n");
    });

    it("uses the given language for future parses", [&]() {
      ts_document_set_language(document, load_real_language("json"));
      ts_document_parse(document);

      root = ts_document_root_node(document);
      assert_node_string_equals(
        root,
        "(object (pair (string) (array (number) (number))))");
    });

    it("clears out any previous tree", [&]() {
      ts_document_set_language(document, load_real_language("json"));
      ts_document_parse(document);

      ts_document_set_language(document, load_real_language("javascript"));
      AssertThat(ts_document_root_node(document).data, Equals<void *>(nullptr));

      ts_document_parse(document);
      root = ts_document_root_node(document);
      assert_node_string_equals(
        root,
        "(program (expression_statement "
        "(object (pair (string) (array (number) (number))))))");
    });

    it("does not allow setting a language with a different version number", [&]() {
      TSLanguage language = *load_real_language("json");
      AssertThat(ts_language_version(&language), Equals<uint32_t>(TREE_SITTER_LANGUAGE_VERSION));

      language.version++;
      AssertThat(ts_language_version(&language), !Equals<uint32_t>(TREE_SITTER_LANGUAGE_VERSION));

      ts_document_set_language(document, &language);
      AssertThat(ts_document_language(document), IsNull());
    });
  });

  describe("set_logger(TSLogger)", [&]() {
    SpyLogger *logger;

    before_each([&]() {
      logger = new SpyLogger();
      ts_document_set_language(document, load_real_language("json"));
      ts_document_set_input_string(document, "[1, 2]");
    });

    after_each([&]() {
      delete logger;
    });

    it("calls the debugger with a message for each parse action", [&]() {
      ts_document_set_logger(document, logger->logger());
      ts_document_parse(document);

      AssertThat(logger->messages, Contains("new_parse"));
      AssertThat(logger->messages, Contains("skip character:' '"));
      AssertThat(logger->messages, Contains("consume character:'['"));
      AssertThat(logger->messages, Contains("consume character:'1'"));
      AssertThat(logger->messages, Contains("reduce sym:array, child_count:4"));
      AssertThat(logger->messages, Contains("accept"));
    });

    it("allows the debugger to be retrieved later", [&]() {
      ts_document_set_logger(document, logger->logger());
      AssertThat(ts_document_logger(document).payload, Equals(logger));
    });

    describe("disabling debugging", [&]() {
      before_each([&]() {
        ts_document_set_logger(document, logger->logger());
        ts_document_set_logger(document, {NULL, NULL});
      });

      it("does not call the debugger any more", [&]() {
        ts_document_parse(document);
        AssertThat(logger->messages, IsEmpty());
      });
    });
  });

  describe("parse_and_get_changed_ranges()", [&]() {
    SpyInput *input;

    before_each([&]() {
      ts_document_set_language(document, load_real_language("javascript"));
      input = new SpyInput("{a: null};", 3);
      ts_document_set_input(document, input->input());
      ts_document_parse(document);
      assert_node_string_equals(
        ts_document_root_node(document),
        "(program (expression_statement (object (pair (identifier) (null)))))");
    });

    after_each([&]() {
      delete input;
    });

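    // Applies the edit produced by `callback`, reparses, and returns the
    // ranges that ts_document_parse_and_get_changed_ranges reports as having
    // been invalidated.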
    auto get_invalidated_ranges_for_edit = [&](std::function<TSInputEdit()> callback) -> vector<TSRange> {
      TSInputEdit edit = callback();
      ts_document_edit(document, edit);

      TSRange *ranges;
      uint32_t range_count = 0;
      ts_document_parse_and_get_changed_ranges(document, &ranges, &range_count);

      vector<TSRange> result;
      for (size_t i = 0; i < range_count; i++) {
        result.push_back(ranges[i]);
      }
      ts_free(ranges);
      return result;
    };

    it("reports changes when one token has been updated", [&]() {
      // Replace `null` with `nothing`
      auto ranges = get_invalidated_ranges_for_edit([&]() {
        return input->replace(input->content.find("ull"), 1, "othing");
      });

      AssertThat(ranges, Equals(vector<TSRange>({
        TSRange{
          point(0, input->content.find("nothing")),
          point(0, input->content.find("}"))
        },
      })));

      // Replace `nothing` with `null` again
      ranges = get_invalidated_ranges_for_edit([&]() {
        return input->undo();
      });

      AssertThat(ranges, Equals(vector<TSRange>({
        TSRange{
          point(0, input->content.find("null")),
          point(0, input->content.find("}"))
        },
      })));
    });

    it("reports changes when tokens have been appended", [&]() {
      // Add a second key-value pair
      auto ranges = get_invalidated_ranges_for_edit([&]() {
        return input->replace(input->content.find("}"), 0, ", b: false");
      });

      AssertThat(ranges, Equals(vector<TSRange>({
        TSRange{
          point(0, input->content.find(",")),
          point(0, input->content.find("}"))
        },
      })));

      // Add a third key-value pair in between the first two
      ranges = get_invalidated_ranges_for_edit([&]() {
        return input->replace(input->content.find(", b"), 0, ", c: 1");
      });

      assert_node_string_equals(
        ts_document_root_node(document),
        "(program (expression_statement (object "
        "(pair (identifier) (null)) "
        "(pair (identifier) (number)) "
        "(pair (identifier) (false)))))");

      AssertThat(ranges, Equals(vector<TSRange>({
        TSRange{
          point(0, input->content.find(", c")),
          point(0, input->content.find(", b"))
        },
      })));

      // Delete the middle pair.
      ranges = get_invalidated_ranges_for_edit([&]() {
        return input->undo();
      });

      assert_node_string_equals(
        ts_document_root_node(document),
        "(program (expression_statement (object "
        "(pair (identifier) (null)) "
        "(pair (identifier) (false)))))");

      AssertThat(ranges, IsEmpty());

      // Delete the second pair.
      ranges = get_invalidated_ranges_for_edit([&]() {
        return input->undo();
      });

      assert_node_string_equals(
        ts_document_root_node(document),
        "(program (expression_statement (object "
        "(pair (identifier) (null)))))");

      AssertThat(ranges, IsEmpty());
    });

    it("reports changes when trees have been wrapped", [&]() {
      // Wrap the object in an assignment expression.
      auto ranges = get_invalidated_ranges_for_edit([&]() {
        return input->replace(input->content.find("null"), 0, "b === ");
      });

      assert_node_string_equals(
        ts_document_root_node(document),
        "(program (expression_statement (object "
        "(pair (identifier) (rel_op (identifier) (null))))))");

      AssertThat(ranges, Equals(vector<TSRange>({
        TSRange{
          point(0, input->content.find("b ===")),
          point(0, input->content.find("}"))
        },
      })));
    });
  });
});

END_TEST
436
test/runtime/node_test.cc
Normal file
@ -0,0 +1,436 @@
#include "test_helper.h"
#include "runtime/alloc.h"
#include "helpers/tree_helpers.h"
#include "helpers/point_helpers.h"
#include "helpers/load_language.h"
#include "helpers/record_alloc.h"
#include "helpers/stream_methods.h"

START_TEST

describe("Node", []() {
  TSDocument *document;
  TSNode array_node;
  string input_string =
    "\n"
    "\n"
    "[\n"
    "  123,\n"
    "  false,\n"
    "  {\n"
    "    \"x\": null\n"
    "  }\n"
    "]";

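  // Byte/character offsets of notable tokens in input_string, used to verify
  // node positions throughout this suite.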
  size_t array_index = input_string.find("[\n");
  size_t array_end_index = input_string.find("]") + 1;
  size_t number_index = input_string.find("123");
  size_t number_end_index = number_index + string("123").size();
  size_t false_index = input_string.find("false");
  size_t false_end_index = false_index + string("false").size();
  size_t object_index = input_string.find("{");
  size_t object_end_index = input_string.find("}") + 1;
  size_t string_index = input_string.find("\"x\"");
  size_t string_end_index = string_index + 3;
  size_t colon_index = input_string.find(":");
  size_t null_index = input_string.find("null");
  size_t null_end_index = null_index + string("null").size();

  before_each([&]() {
    record_alloc::start();

    document = ts_document_new();
    ts_document_set_language(document, load_real_language("json"));
    ts_document_set_input_string(document, input_string.c_str());
    ts_document_parse(document);

    array_node = ts_document_root_node(document);
    char *node_string = ts_node_string(array_node, document);
    AssertThat(node_string, Equals(
      "(array "
        "(number) "
        "(false) "
        "(object (pair (string) (null))))"));
    ts_free(node_string);
  });

  after_each([&]() {
    ts_document_free(document);

    record_alloc::stop();
    AssertThat(record_alloc::outstanding_allocation_indices(), IsEmpty());
  });

  describe("named_child_count(), named_child(i)", [&]() {
    it("returns the named child node at the given index", [&]() {
      AssertThat(ts_node_type(array_node, document), Equals("array"));

      AssertThat(ts_node_named_child_count(array_node), Equals<size_t>(3));
      AssertThat(ts_node_start_byte(array_node), Equals(array_index));
      AssertThat(ts_node_end_byte(array_node), Equals(array_end_index));
      AssertThat(ts_node_start_char(array_node), Equals(array_index));
      AssertThat(ts_node_end_char(array_node), Equals(array_end_index));
      AssertThat(ts_node_start_point(array_node), Equals<TSPoint>({ 2, 0 }));
      AssertThat(ts_node_end_point(array_node), Equals<TSPoint>({ 8, 1 }));

      TSNode number_node = ts_node_named_child(array_node, 0);
      TSNode false_node = ts_node_named_child(array_node, 1);
      TSNode object_node = ts_node_named_child(array_node, 2);

      AssertThat(ts_node_type(number_node, document), Equals("number"));
      AssertThat(ts_node_type(false_node, document), Equals("false"));
      AssertThat(ts_node_type(object_node, document), Equals("object"));

      AssertThat(ts_node_start_byte(number_node), Equals(number_index));
      AssertThat(ts_node_end_byte(number_node), Equals(number_end_index));
      AssertThat(ts_node_start_char(number_node), Equals(number_index));
      AssertThat(ts_node_end_char(number_node), Equals(number_end_index));
      AssertThat(ts_node_start_point(number_node), Equals<TSPoint>({ 3, 2 }));
      AssertThat(ts_node_end_point(number_node), Equals<TSPoint>({ 3, 5 }));

      AssertThat(ts_node_start_byte(false_node), Equals(false_index));
      AssertThat(ts_node_end_byte(false_node), Equals(false_end_index));
      AssertThat(ts_node_start_point(false_node), Equals<TSPoint>({ 4, 2 }));
      AssertThat(ts_node_end_point(false_node), Equals<TSPoint>({ 4, 7 }));

      AssertThat(ts_node_start_byte(object_node), Equals(object_index));
      AssertThat(ts_node_end_byte(object_node), Equals(object_end_index));
      AssertThat(ts_node_start_point(object_node), Equals<TSPoint>({ 5, 2 }));
      AssertThat(ts_node_end_point(object_node), Equals<TSPoint>({ 7, 3 }));
      AssertThat(ts_node_named_child_count(object_node), Equals<size_t>(1));

      TSNode pair_node = ts_node_named_child(object_node, 0);

      AssertThat(ts_node_type(pair_node, document), Equals("pair"));
      AssertThat(ts_node_start_byte(pair_node), Equals(string_index));
      AssertThat(ts_node_end_byte(pair_node), Equals(null_end_index));
      AssertThat(ts_node_start_point(pair_node), Equals<TSPoint>({ 6, 4 }));
      AssertThat(ts_node_end_point(pair_node), Equals<TSPoint>({ 6, 13 }));
      AssertThat(ts_node_named_child_count(pair_node), Equals<size_t>(2));

      TSNode string_node = ts_node_named_child(pair_node, 0);
      TSNode null_node = ts_node_named_child(pair_node, 1);

      AssertThat(ts_node_type(string_node, document), Equals("string"));
      AssertThat(ts_node_type(null_node, document), Equals("null"));

      AssertThat(ts_node_start_byte(string_node), Equals(string_index));
      AssertThat(ts_node_end_byte(string_node), Equals(string_end_index));
      AssertThat(ts_node_start_point(string_node), Equals<TSPoint>({ 6, 4 }));
      AssertThat(ts_node_end_point(string_node), Equals<TSPoint>({ 6, 7 }));

      AssertThat(ts_node_start_byte(null_node), Equals(null_index));
      AssertThat(ts_node_end_byte(null_node), Equals(null_end_index));
      AssertThat(ts_node_start_point(null_node), Equals<TSPoint>({ 6, 9 }));
      AssertThat(ts_node_end_point(null_node), Equals<TSPoint>({ 6, 13 }));

      AssertThat(ts_node_parent(string_node), Equals(pair_node));
      AssertThat(ts_node_parent(null_node), Equals(pair_node));
      AssertThat(ts_node_parent(pair_node), Equals(object_node));
      AssertThat(ts_node_parent(number_node), Equals(array_node));
      AssertThat(ts_node_parent(false_node), Equals(array_node));
      AssertThat(ts_node_parent(object_node), Equals(array_node));
      AssertThat(ts_node_parent(array_node).data, Equals<void *>(nullptr));
    });
  });

  describe("symbols()", [&]() {
    it("returns an iterator that yields each of the node's symbols", [&]() {
      const TSLanguage *language = ts_document_language(document);

      TSNode false_node = ts_node_descendant_for_char_range(array_node, false_index, false_index + 1);
      TSSymbolIterator iterator = ts_node_symbols(false_node);
      AssertThat(iterator.done, Equals(false));
      AssertThat(ts_language_symbol_name(language, iterator.value), Equals("false"));

      ts_symbol_iterator_next(&iterator);
      AssertThat(iterator.done, Equals(false));
      AssertThat(ts_language_symbol_name(language, iterator.value), Equals("_value"));

      ts_symbol_iterator_next(&iterator);
      AssertThat(iterator.done, Equals(true));

      TSNode comma_node = ts_node_descendant_for_char_range(array_node, number_end_index, number_end_index);
      iterator = ts_node_symbols(comma_node);
      AssertThat(iterator.done, Equals(false));
      AssertThat(ts_language_symbol_name(language, iterator.value), Equals(","));

      ts_symbol_iterator_next(&iterator);
      AssertThat(iterator.done, Equals(true));
    });
  });

  describe("child_count(), child(i)", [&]() {
    it("returns the child node at the given index, including anonymous nodes", [&]() {
      AssertThat(ts_node_child_count(array_node), Equals<size_t>(7));
      TSNode child1 = ts_node_child(array_node, 0);
      TSNode child2 = ts_node_child(array_node, 1);
      TSNode child3 = ts_node_child(array_node, 2);
      TSNode child4 = ts_node_child(array_node, 3);
      TSNode child5 = ts_node_child(array_node, 4);
      TSNode child6 = ts_node_child(array_node, 5);
      TSNode child7 = ts_node_child(array_node, 6);

      AssertThat(ts_node_type(array_node, document), Equals("array"));
      AssertThat(ts_node_type(child1, document), Equals("["));
      AssertThat(ts_node_type(child2, document), Equals("number"));
      AssertThat(ts_node_type(child3, document), Equals(","));
      AssertThat(ts_node_type(child4, document), Equals("false"));
      AssertThat(ts_node_type(child5, document), Equals(","));
      AssertThat(ts_node_type(child6, document), Equals("object"));
      AssertThat(ts_node_type(child7, document), Equals("]"));

      AssertThat(ts_node_is_named(array_node), IsTrue());
      AssertThat(ts_node_is_named(child1), IsFalse());
      AssertThat(ts_node_is_named(child2), IsTrue());
      AssertThat(ts_node_is_named(child3), IsFalse());
      AssertThat(ts_node_is_named(child4), IsTrue());
      AssertThat(ts_node_is_named(child5), IsFalse());
      AssertThat(ts_node_is_named(child6), IsTrue());
      AssertThat(ts_node_is_named(child7), IsFalse());

      AssertThat(ts_node_start_byte(child1), Equals(array_index));
      AssertThat(ts_node_end_byte(child1), Equals(array_index + 1));
      AssertThat(ts_node_start_point(child1), Equals<TSPoint>({ 2, 0 }));
      AssertThat(ts_node_end_point(child1), Equals<TSPoint>({ 2, 1 }));

      AssertThat(ts_node_start_byte(child3), Equals(number_end_index));
      AssertThat(ts_node_end_byte(child3), Equals(number_end_index + 1));
      AssertThat(ts_node_start_point(child3), Equals<TSPoint>({ 3, 5 }));
      AssertThat(ts_node_end_point(child3), Equals<TSPoint>({ 3, 6 }));

      AssertThat(ts_node_start_byte(child5), Equals(false_end_index));
      AssertThat(ts_node_end_byte(child5), Equals(false_end_index + 1));
      AssertThat(ts_node_start_point(child5), Equals<TSPoint>({ 4, 7 }));
      AssertThat(ts_node_end_point(child5), Equals<TSPoint>({ 4, 8 }));

      AssertThat(ts_node_start_byte(child7), Equals(array_end_index - 1));
      AssertThat(ts_node_end_byte(child7), Equals(array_end_index));
      AssertThat(ts_node_start_point(child7), Equals<TSPoint>({ 8, 0 }));
      AssertThat(ts_node_end_point(child7), Equals<TSPoint>({ 8, 1 }));

      AssertThat(ts_node_child_count(child6), Equals<size_t>(3));

      TSNode left_brace = ts_node_child(child6, 0);
      TSNode pair = ts_node_child(child6, 1);
      TSNode right_brace = ts_node_child(child6, 2);

      TSNode grandchild2 = ts_node_child(pair, 0);
      TSNode grandchild3 = ts_node_child(pair, 1);
      TSNode grandchild4 = ts_node_child(pair, 2);

      AssertThat(ts_node_type(left_brace, document), Equals("{"));
      AssertThat(ts_node_type(pair, document), Equals("pair"));
      AssertThat(ts_node_type(right_brace, document), Equals("}"));

      AssertThat(ts_node_type(grandchild2, document), Equals("string"));
      AssertThat(ts_node_type(grandchild3, document), Equals(":"));
      AssertThat(ts_node_type(grandchild4, document), Equals("null"));

      AssertThat(ts_node_parent(grandchild2), Equals(pair));
      AssertThat(ts_node_parent(grandchild3), Equals(pair));
      AssertThat(ts_node_parent(grandchild4), Equals(pair));
      AssertThat(ts_node_parent(left_brace), Equals(child6));
      AssertThat(ts_node_parent(pair), Equals(child6));
      AssertThat(ts_node_parent(right_brace), Equals(child6));
      AssertThat(ts_node_parent(child1), Equals(array_node));
      AssertThat(ts_node_parent(child2), Equals(array_node));
      AssertThat(ts_node_parent(child3), Equals(array_node));
      AssertThat(ts_node_parent(child4), Equals(array_node));
      AssertThat(ts_node_parent(child5), Equals(array_node));
      AssertThat(ts_node_parent(child6), Equals(array_node));
      AssertThat(ts_node_parent(child7), Equals(array_node));
      AssertThat(ts_node_parent(array_node).data, Equals<void *>(nullptr));
    });
  });

  describe("next_sibling(), prev_sibling()", [&]() {
    it("returns the node's next and previous sibling, including anonymous nodes", [&]() {
      TSNode bracket_node1 = ts_node_child(array_node, 0);
      TSNode number_node = ts_node_child(array_node, 1);
      TSNode array_comma_node1 = ts_node_child(array_node, 2);
      TSNode false_node = ts_node_child(array_node, 3);
      TSNode array_comma_node2 = ts_node_child(array_node, 4);
      TSNode object_node = ts_node_child(array_node, 5);
      TSNode brace_node1 = ts_node_child(object_node, 0);
      TSNode pair_node = ts_node_child(object_node, 1);
      TSNode string_node = ts_node_child(pair_node, 0);
      TSNode colon_node = ts_node_child(pair_node, 1);
      TSNode null_node = ts_node_child(pair_node, 2);
      TSNode brace_node2 = ts_node_child(object_node, 2);
      TSNode bracket_node2 = ts_node_child(array_node, 6);

      AssertThat(ts_node_next_sibling(bracket_node1), Equals(number_node));
      AssertThat(ts_node_next_sibling(number_node), Equals(array_comma_node1));
      AssertThat(ts_node_next_sibling(array_comma_node1), Equals(false_node));
      AssertThat(ts_node_next_sibling(false_node), Equals(array_comma_node2));
      AssertThat(ts_node_next_sibling(array_comma_node2), Equals(object_node));
      AssertThat(ts_node_next_sibling(object_node), Equals(bracket_node2));
      AssertThat(ts_node_next_sibling(bracket_node2).data, Equals<void *>(nullptr));

      AssertThat(ts_node_prev_sibling(bracket_node1).data, Equals<void *>(nullptr));
      AssertThat(ts_node_prev_sibling(number_node), Equals(bracket_node1));
      AssertThat(ts_node_prev_sibling(array_comma_node1), Equals(number_node));
      AssertThat(ts_node_prev_sibling(false_node), Equals(array_comma_node1));
      AssertThat(ts_node_prev_sibling(array_comma_node2), Equals(false_node));
      AssertThat(ts_node_prev_sibling(object_node), Equals(array_comma_node2));
      AssertThat(ts_node_prev_sibling(bracket_node2), Equals(object_node));

      AssertThat(ts_node_next_sibling(brace_node1), Equals(pair_node));
      AssertThat(ts_node_next_sibling(pair_node), Equals(brace_node2));
      AssertThat(ts_node_next_sibling(brace_node2).data, Equals<void *>(nullptr));

      AssertThat(ts_node_prev_sibling(brace_node1).data, Equals<void *>(nullptr));
      AssertThat(ts_node_prev_sibling(pair_node), Equals(brace_node1));
      AssertThat(ts_node_prev_sibling(brace_node2), Equals(pair_node));

      AssertThat(ts_node_next_sibling(string_node), Equals(colon_node));
      AssertThat(ts_node_next_sibling(colon_node), Equals(null_node));
      AssertThat(ts_node_next_sibling(null_node).data, Equals<void *>(nullptr));

      AssertThat(ts_node_prev_sibling(string_node).data, Equals<void *>(nullptr));
      AssertThat(ts_node_prev_sibling(colon_node), Equals(string_node));
      AssertThat(ts_node_prev_sibling(null_node), Equals(colon_node));
    });

    it("returns null when the node has no parent", [&]() {
      AssertThat(ts_node_next_named_sibling(array_node).data, Equals<void *>(nullptr));
      AssertThat(ts_node_prev_named_sibling(array_node).data, Equals<void *>(nullptr));
    });
  });

  describe("next_named_sibling(), prev_named_sibling()", [&]() {
    it("returns the node's next and previous siblings", [&]() {
      TSNode number_node = ts_node_named_child(array_node, 0);
      TSNode false_node = ts_node_named_child(array_node, 1);
      TSNode object_node = ts_node_named_child(array_node, 2);
      TSNode pair_node = ts_node_named_child(object_node, 0);
      TSNode string_node = ts_node_named_child(pair_node, 0);
      TSNode null_node = ts_node_named_child(pair_node, 1);

      AssertThat(ts_node_next_named_sibling(number_node), Equals(false_node));
      AssertThat(ts_node_next_named_sibling(false_node), Equals(object_node));
      AssertThat(ts_node_next_named_sibling(string_node), Equals(null_node));
      AssertThat(ts_node_prev_named_sibling(object_node), Equals(false_node));
      AssertThat(ts_node_prev_named_sibling(false_node), Equals(number_node));
      AssertThat(ts_node_prev_named_sibling(null_node), Equals(string_node));
    });

    it("returns null when the node has no parent", [&]() {
      AssertThat(ts_node_next_named_sibling(array_node).data, Equals<void *>(nullptr));
      AssertThat(ts_node_prev_named_sibling(array_node).data, Equals<void *>(nullptr));
    });
  });

  describe("named_descendant_for_char_range(start, end)", [&]() {
    describe("when there is a leaf node that spans the given range exactly", [&]() {
      it("returns that leaf node", [&]() {
        TSNode leaf = ts_node_named_descendant_for_char_range(array_node, string_index, string_end_index - 1);
        AssertThat(ts_node_type(leaf, document), Equals("string"));
        AssertThat(ts_node_start_byte(leaf), Equals(string_index));
        AssertThat(ts_node_end_byte(leaf), Equals(string_end_index));
        AssertThat(ts_node_start_point(leaf), Equals<TSPoint>({ 6, 4 }));
        AssertThat(ts_node_end_point(leaf), Equals<TSPoint>({ 6, 7 }));

        leaf = ts_node_named_descendant_for_char_range(array_node, number_index, number_end_index - 1);
        AssertThat(ts_node_type(leaf, document), Equals("number"));
        AssertThat(ts_node_start_byte(leaf), Equals(number_index));
        AssertThat(ts_node_end_byte(leaf), Equals(number_end_index));
        AssertThat(ts_node_start_point(leaf), Equals<TSPoint>({ 3, 2 }));
        AssertThat(ts_node_end_point(leaf), Equals<TSPoint>({ 3, 5 }));
      });
    });

    describe("when there is a leaf node that extends beyond the given range", [&]() {
      it("returns that leaf node", [&]() {
        TSNode leaf = ts_node_named_descendant_for_char_range(array_node, string_index, string_index + 1);
        AssertThat(ts_node_type(leaf, document), Equals("string"));
        AssertThat(ts_node_start_byte(leaf), Equals(string_index));
        AssertThat(ts_node_end_byte(leaf), Equals(string_end_index));
        AssertThat(ts_node_start_point(leaf), Equals<TSPoint>({ 6, 4 }));
        AssertThat(ts_node_end_point(leaf), Equals<TSPoint>({ 6, 7 }));

        leaf = ts_node_named_descendant_for_char_range(array_node, string_index + 1, string_index + 2);
        AssertThat(ts_node_type(leaf, document), Equals("string"));
        AssertThat(ts_node_start_byte(leaf), Equals(string_index));
        AssertThat(ts_node_end_byte(leaf), Equals(string_end_index));
        AssertThat(ts_node_start_point(leaf), Equals<TSPoint>({ 6, 4 }));
        AssertThat(ts_node_end_point(leaf), Equals<TSPoint>({ 6, 7 }));
      });
    });

    describe("when there is no leaf node that spans the given range", [&]() {
      it("returns the smallest node that does span the range", [&]() {
        TSNode pair_node = ts_node_named_descendant_for_char_range(array_node, string_index, string_index + 3);
        AssertThat(ts_node_type(pair_node, document), Equals("pair"));
        AssertThat(ts_node_start_byte(pair_node), Equals(string_index));
        AssertThat(ts_node_end_byte(pair_node), Equals(null_end_index));
        AssertThat(ts_node_start_point(pair_node), Equals<TSPoint>({ 6, 4 }));
        AssertThat(ts_node_end_point(pair_node), Equals<TSPoint>({ 6, 13 }));
      });

      it("does not return invisible nodes (repeats)", [&]() {
        TSNode node = ts_node_named_descendant_for_char_range(array_node, number_end_index, number_end_index + 1);
        AssertThat(ts_node_type(node, document), Equals("array"));
        AssertThat(ts_node_start_byte(node), Equals(array_index));
        AssertThat(ts_node_end_byte(node), Equals(array_end_index));
        AssertThat(ts_node_start_point(node), Equals<TSPoint>({ 2, 0 }));
        AssertThat(ts_node_end_point(node), Equals<TSPoint>({ 8, 1 }));
      });
    });
  });

  describe("descendant_for_char_range(start, end)", [&]() {
    it("returns the smallest node that spans the given range", [&]() {
      TSNode node1 = ts_node_descendant_for_char_range(array_node, colon_index, colon_index);
      AssertThat(ts_node_type(node1, document), Equals(":"));
      AssertThat(ts_node_start_byte(node1), Equals(colon_index));
      AssertThat(ts_node_end_byte(node1), Equals(colon_index + 1));
      AssertThat(ts_node_start_point(node1), Equals<TSPoint>({ 6, 7 }));
      AssertThat(ts_node_end_point(node1), Equals<TSPoint>({ 6, 8 }));

      TSNode node2 = ts_node_descendant_for_char_range(array_node, string_index + 2, string_index + 4);
      AssertThat(ts_node_type(node2, document), Equals("pair"));
      AssertThat(ts_node_start_byte(node2), Equals(string_index));
      AssertThat(ts_node_end_byte(node2), Equals(null_end_index));
      AssertThat(ts_node_start_point(node2), Equals<TSPoint>({ 6, 4 }));
      AssertThat(ts_node_end_point(node2), Equals<TSPoint>({ 6, 13 }));
    });
  });

  describe("descendant_for_byte_range(start, end)", [&]() {
    it("returns the smallest concrete node that spans the given range", [&]() {
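      // Each Greek letter below is two bytes in UTF-8, so byte offsets and
      // character offsets diverge in this fixture.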
ts_document_set_input_string(document, "[\"αβγδ\", \"αβγδ\"]");
|
||||
ts_document_parse(document);
|
||||
TSNode array_node = ts_document_root_node(document);
|
||||
|
||||
TSNode node1 = ts_node_descendant_for_char_range(array_node, 7, 7);
AssertThat(ts_node_type(node1, document), Equals(","));

TSNode node2 = ts_node_descendant_for_byte_range(array_node, 6, 10);
AssertThat(ts_node_type(node2, document), Equals("string"));
AssertThat(ts_node_start_byte(node2), Equals<size_t>(1));
AssertThat(ts_node_end_byte(node2), Equals<size_t>(11));
});
});

describe("descendant_for_point_range(start, end)", [&]() {
it("returns the smallest concrete node that spans the given range", [&]() {
TSNode node1 = ts_node_descendant_for_point_range(array_node, {6, 7}, {6, 7});
AssertThat(ts_node_type(node1, document), Equals(":"));
AssertThat(ts_node_start_byte(node1), Equals(colon_index));
AssertThat(ts_node_end_byte(node1), Equals(colon_index + 1));
AssertThat(ts_node_start_point(node1), Equals<TSPoint>({ 6, 7 }));
AssertThat(ts_node_end_point(node1), Equals<TSPoint>({ 6, 8 }));

TSNode node2 = ts_node_descendant_for_point_range(array_node, {6, 6}, {6, 8});
AssertThat(ts_node_type(node2, document), Equals("pair"));
AssertThat(ts_node_start_byte(node2), Equals(string_index));
AssertThat(ts_node_end_byte(node2), Equals(null_end_index));
AssertThat(ts_node_start_point(node2), Equals<TSPoint>({ 6, 4 }));
AssertThat(ts_node_end_point(node2), Equals<TSPoint>({ 6, 13 }));
});
});
});

END_TEST

479
test/runtime/parser_test.cc
Normal file

@@ -0,0 +1,479 @@

#include "test_helper.h"
|
||||
#include "runtime/alloc.h"
|
||||
#include "helpers/record_alloc.h"
|
||||
#include "helpers/spy_input.h"
|
||||
#include "helpers/load_language.h"
|
||||
#include "helpers/record_alloc.h"
|
||||
#include "helpers/point_helpers.h"
|
||||
#include "helpers/stderr_logger.h"
|
||||
#include "helpers/dedent.h"
|
||||
|
||||
START_TEST
|
||||
|
||||
describe("Parser", [&]() {
|
||||
TSDocument *document;
|
||||
SpyInput *input;
|
||||
TSNode root;
|
||||
size_t chunk_size;
|
||||
|
||||
before_each([&]() {
|
||||
record_alloc::start();
|
||||
|
||||
chunk_size = 3;
|
||||
input = nullptr;
|
||||
document = ts_document_new();
|
||||
});
|
||||
|
||||
after_each([&]() {
|
||||
if (document) ts_document_free(document);
|
||||
if (input) delete input;
|
||||
|
||||
record_alloc::stop();
|
||||
AssertThat(record_alloc::outstanding_allocation_indices(), IsEmpty());
|
||||
});
|
||||
|
||||
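// Feeds `text` to the parser through a SpyInput, which serves the text in
// fixed-size chunks and records every substring the parser reads.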
auto set_text = [&](string text) {
input = new SpyInput(text, chunk_size);
ts_document_set_input(document, input->input());
ts_document_parse(document);

root = ts_document_root_node(document);
AssertThat(ts_node_end_byte(root), Equals(text.size()));
input->clear();
};

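// Applies an edit to the spy input, re-parses incrementally, and checks that
// the root node's new size is consistent with the edit.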
auto replace_text = [&](size_t position, size_t length, string new_text) {
size_t prev_size = ts_node_end_byte(root);

ts_document_edit(document, input->replace(position, length, new_text));
ts_document_parse(document);

root = ts_document_root_node(document);
size_t new_size = ts_node_end_byte(root);
AssertThat(new_size, Equals(prev_size - length + new_text.size()));
};

auto insert_text = [&](size_t position, string text) {
replace_text(position, 0, text);
};

auto delete_text = [&](size_t position, size_t length) {
replace_text(position, length, "");
};

auto undo = [&]() {
ts_document_edit(document, input->undo());
ts_document_parse(document);
};

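// Renders the current root node as an S-expression and compares it against
// the expected string.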
auto assert_root_node = [&](const string &expected) {
TSNode node = ts_document_root_node(document);
char *node_string = ts_node_string(node, document);
string actual(node_string);
ts_free(node_string);
AssertThat(actual, Equals(expected));
};

auto get_node_text = [&](TSNode node) {
size_t start = ts_node_start_byte(node);
size_t end = ts_node_end_byte(node);
return input->content.substr(start, end - start);
};

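// In the trees below, an ERROR node wraps the input that could not be parsed,
// and UNEXPECTED(...) marks the first offending character.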
describe("handling errors", [&]() {
|
||||
describe("when there is an invalid substring right before a valid token", [&]() {
|
||||
it("computes the error node's size and position correctly", [&]() {
|
||||
ts_document_set_language(document, load_real_language("json"));
|
||||
set_text(" [123, @@@@@, true]");
|
||||
|
||||
assert_root_node(
|
||||
"(array (number) (ERROR (UNEXPECTED '@')) (true))");
|
||||
|
||||
TSNode error = ts_node_named_child(root, 1);
|
||||
AssertThat(ts_node_type(error, document), Equals("ERROR"));
|
||||
AssertThat(get_node_text(error), Equals(", @@@@@"));
|
||||
AssertThat(ts_node_child_count(error), Equals<size_t>(2));
|
||||
|
||||
TSNode comma = ts_node_child(error, 0);
|
||||
AssertThat(get_node_text(comma), Equals(","));
|
||||
|
||||
TSNode garbage = ts_node_child(error, 1);
|
||||
AssertThat(get_node_text(garbage), Equals("@@@@@"));
|
||||
|
||||
TSNode node_after_error = ts_node_named_child(root, 2);
|
||||
AssertThat(ts_node_type(node_after_error, document), Equals("true"));
|
||||
AssertThat(get_node_text(node_after_error), Equals("true"));
|
||||
});
|
||||
});
|
||||
|
||||
describe("when there is an unexpected string in the middle of a token", [&]() {
|
||||
it("computes the error node's size and position correctly", [&]() {
|
||||
ts_document_set_language(document, load_real_language("json"));
|
||||
set_text(" [123, faaaaalse, true]");
|
||||
|
||||
assert_root_node(
|
||||
"(array (number) (ERROR (UNEXPECTED 'a')) (true))");
|
||||
|
||||
TSNode error = ts_node_named_child(root, 1);
|
||||
AssertThat(ts_node_type(error, document), Equals("ERROR"));
|
||||
AssertThat(ts_node_child_count(error), Equals<size_t>(2));
|
||||
|
||||
TSNode comma = ts_node_child(error, 0);
|
||||
AssertThat(ts_node_type(comma, document), Equals(","));
|
||||
AssertThat(get_node_text(comma), Equals(","));
|
||||
|
||||
TSNode garbage = ts_node_child(error, 1);
|
||||
AssertThat(ts_node_type(garbage, document), Equals("ERROR"));
|
||||
AssertThat(get_node_text(garbage), Equals("faaaaalse"));
|
||||
|
||||
TSNode last = ts_node_named_child(root, 2);
|
||||
AssertThat(ts_node_type(last, document), Equals("true"));
|
||||
AssertThat(ts_node_start_byte(last), Equals(strlen(" [123, faaaaalse, ")));
|
||||
});
|
||||
});
|
||||
|
||||
describe("when there is one unexpected token between two valid tokens", [&]() {
|
||||
it("computes the error node's size and position correctly", [&]() {
|
||||
ts_document_set_language(document, load_real_language("json"));
|
||||
set_text(" [123, true false, true]");
|
||||
|
||||
assert_root_node(
|
||||
"(array (number) (true) (ERROR (false)) (true))");
|
||||
|
||||
TSNode error = ts_node_named_child(root, 2);
|
||||
AssertThat(ts_node_type(error, document), Equals("ERROR"));
|
||||
AssertThat(get_node_text(error), Equals("false"));
|
||||
AssertThat(ts_node_child_count(error), Equals<size_t>(1));
|
||||
|
||||
TSNode last = ts_node_named_child(root, 1);
|
||||
AssertThat(ts_node_type(last, document), Equals("true"));
|
||||
AssertThat(get_node_text(last), Equals("true"));
|
||||
});
|
||||
});
|
||||
|
||||
describe("when there is an unexpected string at the end of a token", [&]() {
|
||||
it("computes the error's size and position correctly", [&]() {
|
||||
ts_document_set_language(document, load_real_language("json"));
|
||||
set_text(" [123, \"hi\n, true]");
|
||||
|
||||
assert_root_node(
|
||||
"(array (number) (ERROR (UNEXPECTED '\\n')) (true))");
|
||||
});
|
||||
});
|
||||
|
||||
describe("when there is an unterminated error", [&]() {
|
||||
it("maintains a consistent tree", [&]() {
|
||||
ts_document_set_language(document, load_real_language("javascript"));
|
||||
set_text("a; /* b");
|
||||
assert_root_node(
|
||||
"(ERROR (program (expression_statement (identifier))) (UNEXPECTED EOF))");
|
||||
});
|
||||
});
|
||||
|
||||
describe("when there are extra tokens at the end of the viable prefix", [&]() {
|
||||
it("does not include them in the error node", [&]() {
|
||||
ts_document_set_language(document, load_real_language("javascript"));
|
||||
set_text(
|
||||
"var x;\n"
|
||||
"\n"
|
||||
"if\n"
|
||||
"\n"
|
||||
"var y;"
|
||||
);
|
||||
|
||||
TSNode error = ts_node_named_child(root, 1);
|
||||
AssertThat(ts_node_type(error, document), Equals("ERROR"));
|
||||
AssertThat(ts_node_start_point(error), Equals<TSPoint>({2, 0}));
|
||||
AssertThat(ts_node_end_point(error), Equals<TSPoint>({2, 2}));
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
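// "Extra" tokens (e.g. comments) are permitted by the grammar to appear
// anywhere, so they should be attached to the tree wherever they occur.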
describe("handling extra tokens", [&]() {
|
||||
describe("when the token appears as part of a grammar rule", [&]() {
|
||||
it("incorporates it into the tree", [&]() {
|
||||
ts_document_set_language(document, load_real_language("javascript"));
|
||||
set_text("fn()\n");
|
||||
|
||||
assert_root_node(
|
||||
"(program (expression_statement (function_call (identifier) (arguments))))");
|
||||
});
|
||||
});
|
||||
|
||||
describe("when the token appears somewhere else", [&]() {
|
||||
it("incorporates it into the tree", [&]() {
|
||||
ts_document_set_language(document, load_real_language("javascript"));
|
||||
set_text(
|
||||
"fn()\n"
|
||||
" .otherFn();");
|
||||
|
||||
assert_root_node(
|
||||
"(program (expression_statement (function_call "
|
||||
"(member_access "
|
||||
"(function_call (identifier) (arguments)) "
|
||||
"(identifier)) "
|
||||
"(arguments))))");
|
||||
});
|
||||
});
|
||||
|
||||
describe("when several extra tokens appear in a row", [&]() {
|
||||
it("incorporates them into the tree", [&]() {
|
||||
ts_document_set_language(document, load_real_language("javascript"));
|
||||
set_text(
|
||||
"fn()\n\n"
|
||||
"// This is a comment"
|
||||
"\n\n"
|
||||
".otherFn();");
|
||||
|
||||
assert_root_node(
|
||||
"(program (expression_statement (function_call "
|
||||
"(member_access "
|
||||
"(function_call (identifier) (arguments)) "
|
||||
"(comment) "
|
||||
"(identifier)) "
|
||||
"(arguments))))");
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
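// The edits below go through ts_document_edit followed by an incremental
// re-parse; input->strings_read shows how much text was actually re-read.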
describe("editing", [&]() {
|
||||
describe("creating new tokens near the end of the input", [&]() {
|
||||
it("updates the parse tree and re-reads only the changed portion of the text", [&]() {
|
||||
ts_document_set_language(document, load_real_language("javascript"));
|
||||
set_text("x * (100 + abc);");
|
||||
|
||||
assert_root_node(
|
||||
"(program (expression_statement (math_op "
|
||||
"(identifier) "
|
||||
"(math_op (number) (identifier)))))");
|
||||
|
||||
insert_text(strlen("x * (100 + abc"), ".d");
|
||||
|
||||
assert_root_node(
|
||||
"(program (expression_statement (math_op "
|
||||
"(identifier) "
|
||||
"(math_op (number) (member_access (identifier) (identifier))))))");
|
||||
|
||||
AssertThat(input->strings_read, Equals(vector<string>({ " + abc.d)" })));
|
||||
});
|
||||
});
|
||||
|
||||
describe("creating new tokens near the beginning of the input", [&]() {
|
||||
it("updates the parse tree and re-reads only the changed portion of the input", [&]() {
|
||||
chunk_size = 2;
|
||||
|
||||
ts_document_set_language(document, load_real_language("javascript"));
|
||||
set_text("123 + 456 * (10 + x);");
|
||||
|
||||
assert_root_node(
|
||||
"(program (expression_statement (math_op "
|
||||
"(number) "
|
||||
"(math_op (number) (math_op (number) (identifier))))))");
|
||||
|
||||
insert_text(strlen("123"), " || 5");
|
||||
|
||||
assert_root_node(
|
||||
"(program (expression_statement (bool_op "
|
||||
"(number) "
|
||||
"(math_op "
|
||||
"(number) "
|
||||
"(math_op (number) (math_op (number) (identifier)))))))");
|
||||
|
||||
AssertThat(input->strings_read, Equals(vector<string>({ "123 || 5 +" })));
|
||||
});
|
||||
});
|
||||
|
||||
describe("introducing an error", [&]() {
|
||||
it("gives the error the right size", [&]() {
|
||||
ts_document_set_language(document, load_real_language("javascript"));
|
||||
set_text("var x = y;");
|
||||
|
||||
assert_root_node(
|
||||
"(program (var_declaration (var_assignment "
|
||||
"(identifier) (identifier))))");
|
||||
|
||||
insert_text(strlen("var x = y"), " *");
|
||||
|
||||
assert_root_node(
|
||||
"(program (var_declaration (var_assignment "
|
||||
"(identifier) (identifier)) (ERROR)))");
|
||||
|
||||
insert_text(strlen("var x = y *"), " z");
|
||||
|
||||
assert_root_node(
|
||||
"(program (var_declaration (var_assignment "
|
||||
"(identifier) (math_op (identifier) (identifier)))))");
|
||||
});
|
||||
});
|
||||
|
||||
describe("into the middle of an existing token", [&]() {
|
||||
it("updates the parse tree", [&]() {
|
||||
ts_document_set_language(document, load_real_language("javascript"));
|
||||
set_text("abc * 123;");
|
||||
|
||||
assert_root_node(
|
||||
"(program (expression_statement (math_op (identifier) (number))))");
|
||||
|
||||
insert_text(strlen("ab"), "XYZ");
|
||||
|
||||
assert_root_node(
|
||||
"(program (expression_statement (math_op (identifier) (number))))");
|
||||
|
||||
TSNode node = ts_node_named_descendant_for_char_range(root, 1, 1);
|
||||
AssertThat(ts_node_type(node, document), Equals("identifier"));
|
||||
AssertThat(ts_node_end_byte(node), Equals(strlen("abXYZc")));
|
||||
});
|
||||
});
|
||||
|
||||
describe("at the end of an existing token", [&]() {
|
||||
it("updates the parse tree", [&]() {
|
||||
ts_document_set_language(document, load_real_language("javascript"));
|
||||
set_text("abc * 123;");
|
||||
|
||||
assert_root_node(
|
||||
"(program (expression_statement (math_op (identifier) (number))))");
|
||||
|
||||
insert_text(strlen("abc"), "XYZ");
|
||||
|
||||
assert_root_node(
|
||||
"(program (expression_statement (math_op (identifier) (number))))");
|
||||
|
||||
TSNode node = ts_node_named_descendant_for_char_range(root, 1, 1);
|
||||
AssertThat(ts_node_type(node, document), Equals("identifier"));
|
||||
AssertThat(ts_node_end_byte(node), Equals(strlen("abcXYZ")));
|
||||
});
|
||||
});
|
||||
|
||||
describe("inserting text into a node containing a extra token", [&]() {
|
||||
it("updates the parse tree", [&]() {
|
||||
ts_document_set_language(document, load_real_language("javascript"));
|
||||
set_text("123 *\n"
|
||||
"// a-comment\n"
|
||||
"abc;");
|
||||
|
||||
assert_root_node(
|
||||
"(program (expression_statement (math_op "
|
||||
"(number) "
|
||||
"(comment) "
|
||||
"(identifier))))");
|
||||
|
||||
insert_text(
|
||||
strlen("123 *\n"
|
||||
"// a-comment\n"
|
||||
"abc"),
|
||||
"XYZ");
|
||||
|
||||
assert_root_node(
|
||||
"(program (expression_statement (math_op "
|
||||
"(number) "
|
||||
"(comment) "
|
||||
"(identifier))))");
|
||||
});
|
||||
});
|
||||
|
||||
describe("when a critical token is removed", [&]() {
|
||||
it("updates the parse tree, creating an error", [&]() {
|
||||
ts_document_set_language(document, load_real_language("javascript"));
|
||||
set_text("123 * 456; 789 * 123;");
|
||||
|
||||
assert_root_node(
|
||||
"(program "
|
||||
"(expression_statement (math_op (number) (number))) "
|
||||
"(expression_statement (math_op (number) (number))))");
|
||||
|
||||
delete_text(strlen("123 "), 2);
|
||||
|
||||
assert_root_node(
|
||||
"(program "
|
||||
"(expression_statement (number) (ERROR (number))) "
|
||||
"(expression_statement (math_op (number) (number))))");
|
||||
});
|
||||
});
|
||||
|
||||
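// Python's indentation-based blocks are tokenized by an external scanner, so
// this exercises saving and restoring external scanner state across edits.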
describe("with external tokens", [&]() {
|
||||
it("maintains the external scanner's state during incremental parsing", [&]() {
|
||||
ts_document_set_language(document, load_real_language("python"));
|
||||
string text = dedent(R"PYTHON(
|
||||
if a:
|
||||
print b
|
||||
return c
|
||||
)PYTHON");
|
||||
|
||||
set_text(text);
|
||||
assert_root_node("(module "
|
||||
"(if_statement (identifier) "
|
||||
"(print_statement (identifier))) "
|
||||
"(return_statement (expression_list (identifier))))");
|
||||
|
||||
replace_text(text.find("return"), 0, " ");
|
||||
assert_root_node("(module "
|
||||
"(if_statement (identifier) "
|
||||
"(print_statement (identifier)) "
|
||||
"(return_statement (expression_list (identifier)))))");
|
||||
|
||||
undo();
|
||||
assert_root_node("(module "
|
||||
"(if_statement (identifier) "
|
||||
"(print_statement (identifier))) "
|
||||
"(return_statement (expression_list (identifier))))");
|
||||
});
|
||||
});
|
||||
|
||||
it("does not try to re-use nodes that are within the edited region", [&]() {
|
||||
ts_document_set_language(document, load_real_language("javascript"));
|
||||
set_text("{ x: (b.c) };");
|
||||
|
||||
assert_root_node(
|
||||
"(program (expression_statement (object (pair "
|
||||
"(identifier) (member_access (identifier) (identifier))))))");
|
||||
|
||||
replace_text(strlen("{ x: "), strlen("(b.c)"), "b.c");
|
||||
|
||||
assert_root_node(
|
||||
"(program (expression_statement (object (pair "
|
||||
"(identifier) (member_access (identifier) (identifier))))))");
|
||||
});
|
||||
|
||||
it("updates the document's parse count", [&]() {
|
||||
ts_document_set_language(document, load_real_language("javascript"));
|
||||
AssertThat(ts_document_parse_count(document), Equals<size_t>(0));
|
||||
|
||||
set_text("{ x: (b.c) };");
|
||||
AssertThat(ts_document_parse_count(document), Equals<size_t>(1));
|
||||
|
||||
insert_text(strlen("{ x"), "yz");
|
||||
AssertThat(ts_document_parse_count(document), Equals<size_t>(2));
|
||||
});
|
||||
});
|
||||
|
||||
describe("lexing", [&]() {
|
||||
describe("handling tokens containing wildcard patterns (e.g. comments)", [&]() {
|
||||
it("terminates them at the end of the document", [&]() {
|
||||
ts_document_set_language(document, load_real_language("javascript"));
|
||||
set_text("x; // this is a comment");
|
||||
|
||||
assert_root_node(
|
||||
"(program (expression_statement (identifier)) (comment))");
|
||||
|
||||
TSNode comment = ts_node_named_child(root, 1);
|
||||
|
||||
AssertThat(ts_node_start_byte(comment), Equals(strlen("x; ")));
|
||||
AssertThat(ts_node_end_byte(comment), Equals(strlen("x; // this is a comment")));
|
||||
});
|
||||
});
|
||||
|
||||
it("recognizes UTF8 characters as single characters", [&]() {
|
||||
// 'ΩΩΩ — ΔΔ';
|
||||
ts_document_set_language(document, load_real_language("javascript"));
|
||||
set_text("'\u03A9\u03A9\u03A9 \u2014 \u0394\u0394';");
|
||||
|
||||
assert_root_node(
|
||||
"(program (expression_statement (string)))");
|
||||
|
||||
AssertThat(ts_node_end_char(root), Equals(strlen("'OOO - DD';")));
|
||||
AssertThat(ts_node_end_byte(root), Equals(strlen("'\u03A9\u03A9\u03A9 \u2014 \u0394\u0394';")));
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
END_TEST
|
||||

571
test/runtime/stack_test.cc
Normal file

@@ -0,0 +1,571 @@

#include "test_helper.h"
|
||||
#include "helpers/tree_helpers.h"
|
||||
#include "helpers/point_helpers.h"
|
||||
#include "helpers/record_alloc.h"
|
||||
#include "helpers/stream_methods.h"
|
||||
#include "runtime/stack.h"
|
||||
#include "runtime/tree.h"
|
||||
#include "runtime/length.h"
|
||||
#include "runtime/alloc.h"
|
||||
|
||||
enum {
|
||||
stateA = 2,
|
||||
stateB,
|
||||
stateC, stateD, stateE, stateF, stateG, stateH, stateI, stateJ
|
||||
};
|
||||
|
||||
enum {
|
||||
symbol0, symbol1, symbol2, symbol3, symbol4, symbol5, symbol6, symbol7, symbol8,
|
||||
symbol9, symbol10
|
||||
};
|
||||
|
||||
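// Scales a Length (bytes, chars, extent) by a factor, for computing the
// expected stack position after pushing the same-sized tree repeatedly.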
Length operator*(const Length &length, uint32_t factor) {
return {length.bytes * factor, length.chars * factor, {0, length.extent.column * factor}};
}

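// Releases the trees in each popped slice, taking care not to double-free
// tree arrays that are shared between slices.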
void free_slice_array(StackSliceArray *slices) {
for (size_t i = 0; i < slices->size; i++) {
StackSlice slice = slices->contents[i];

bool matches_prior_trees = false;
for (size_t j = 0; j < i; j++) {
StackSlice prior_slice = slices->contents[j];
if (slice.trees.contents == prior_slice.trees.contents) {
matches_prior_trees = true;
break;
}
}

if (!matches_prior_trees) {
for (size_t j = 0; j < slice.trees.size; j++)
ts_tree_release(slice.trees.contents[j]);
array_delete(&slice.trees);
}
}
}

struct StackEntry {
TSStateId state;
size_t depth;
};

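// Walks one version of the stack with ts_stack_iterate, collecting each
// distinct {state, depth} pair, where depth is the number of trees between
// that entry and the head.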
vector<StackEntry> get_stack_entries(Stack *stack, StackVersion version) {
|
||||
vector<StackEntry> result;
|
||||
ts_stack_iterate(
|
||||
stack,
|
||||
version,
|
||||
[](void *payload, TSStateId state, TreeArray *trees, uint32_t tree_count, bool is_done, bool is_pending) -> StackIterateAction {
|
||||
auto entries = static_cast<vector<StackEntry> *>(payload);
|
||||
StackEntry entry = {state, tree_count};
|
||||
if (find(entries->begin(), entries->end(), entry) == entries->end())
|
||||
entries->push_back(entry);
|
||||
return StackIterateNone;
|
||||
}, &result);
|
||||
return result;
|
||||
}
|
||||
|
||||
START_TEST
|
||||
|
||||
describe("Stack", [&]() {
|
||||
Stack *stack;
|
||||
const size_t tree_count = 11;
|
||||
Tree *trees[tree_count];
|
||||
Length tree_len = {2, 3, {0, 3}};
|
||||
|
||||
before_each([&]() {
|
||||
record_alloc::start();
|
||||
|
||||
stack = ts_stack_new();
|
||||
|
||||
for (size_t i = 0; i < tree_count; i++)
|
||||
trees[i] = ts_tree_make_leaf(i, length_zero(), tree_len, {
|
||||
true, true, false, true,
|
||||
});
|
||||
});
|
||||
|
||||
after_each([&]() {
|
||||
ts_stack_delete(stack);
|
||||
for (size_t i = 0; i < tree_count; i++)
|
||||
ts_tree_release(trees[i]);
|
||||
|
||||
record_alloc::stop();
|
||||
AssertThat(record_alloc::outstanding_allocation_indices(), IsEmpty());
|
||||
});
|
||||
|
||||
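// Diagram convention in the tests below (as read from the assertions):
// '.' is the bottom of the stack, letters are parse states, numbered arrows
// are the trees connecting them, and '*' marks a current head.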
describe("push(version, tree, is_pending, state)", [&]() {
|
||||
it("adds entries to the given version of the stack", [&]() {
|
||||
AssertThat(ts_stack_version_count(stack), Equals<size_t>(1));
|
||||
AssertThat(ts_stack_top_state(stack, 0), Equals(1));
|
||||
AssertThat(ts_stack_top_position(stack, 0), Equals(length_zero()));
|
||||
|
||||
// . <──0── A*
|
||||
ts_stack_push(stack, 0, trees[0], false, stateA);
|
||||
AssertThat(ts_stack_top_state(stack, 0), Equals(stateA));
|
||||
AssertThat(ts_stack_top_position(stack, 0), Equals(tree_len));
|
||||
|
||||
// . <──0── A <──1── B*
|
||||
ts_stack_push(stack, 0, trees[1], false, stateB);
|
||||
AssertThat(ts_stack_top_state(stack, 0), Equals(stateB));
|
||||
AssertThat(ts_stack_top_position(stack, 0), Equals(tree_len * 2));
|
||||
|
||||
// . <──0── A <──1── B <──2── C*
|
||||
ts_stack_push(stack, 0, trees[2], false, stateC);
|
||||
AssertThat(ts_stack_top_state(stack, 0), Equals(stateC));
|
||||
AssertThat(ts_stack_top_position(stack, 0), Equals(tree_len * 3));
|
||||
|
||||
AssertThat(get_stack_entries(stack, 0), Equals(vector<StackEntry>({
|
||||
{stateC, 0},
|
||||
{stateB, 1},
|
||||
{stateA, 2},
|
||||
{1, 3},
|
||||
})));
|
||||
});
|
||||
|
||||
it("increments the version's push count", [&]() {
|
||||
AssertThat(ts_stack_push_count(stack, 0), Equals<unsigned>(0));
|
||||
ts_stack_push(stack, 0, trees[0], false, stateA);
|
||||
AssertThat(ts_stack_push_count(stack, 0), Equals<unsigned>(1));
|
||||
});
|
||||
});
|
||||
|
||||
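// merge() combines two stack versions only when their heads have the same
// state and the same position in the input.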
describe("merge()", [&]() {
|
||||
before_each([&]() {
|
||||
// . <──0── A <──1── B*
|
||||
// ↑
|
||||
// └───2─── C*
|
||||
ts_stack_push(stack, 0, trees[0], false, stateA);
|
||||
ts_stack_copy_version(stack, 0);
|
||||
ts_stack_push(stack, 0, trees[1], false, stateB);
|
||||
ts_stack_push(stack, 1, trees[2], false, stateC);
|
||||
});
|
||||
|
||||
it("combines versions that have the same top states and positions", [&]() {
|
||||
// . <──0── A <──1── B <──3── D*
|
||||
// ↑
|
||||
// └───2─── C <──4── D*
|
||||
ts_stack_push(stack, 0, trees[3], false, stateD);
|
||||
ts_stack_push(stack, 1, trees[4], false, stateD);
|
||||
|
||||
// . <──0── A <──1── B <──3── D*
|
||||
// ↑ |
|
||||
// └───2─── C <──4───┘
|
||||
AssertThat(ts_stack_merge(stack, 0, 1), IsTrue());
|
||||
AssertThat(ts_stack_version_count(stack), Equals<size_t>(1));
|
||||
AssertThat(get_stack_entries(stack, 0), Equals(vector<StackEntry>({
|
||||
{stateD, 0},
|
||||
{stateB, 1},
|
||||
{stateC, 1},
|
||||
{stateA, 2},
|
||||
{1, 3},
|
||||
})));
|
||||
});
|
||||
|
||||
it("does not combine versions that have different states", [&]() {
|
||||
AssertThat(ts_stack_merge(stack, 0, 1), IsFalse());
|
||||
AssertThat(ts_stack_version_count(stack), Equals<size_t>(2));
|
||||
});
|
||||
|
||||
it("does not combine versions that have different positions", [&]() {
|
||||
// . <──0── A <──1── B <────3──── D*
|
||||
// ↑
|
||||
// └───2─── C <──4── D*
|
||||
trees[3]->size = tree_len * 3;
|
||||
ts_stack_push(stack, 0, trees[3], false, stateD);
|
||||
ts_stack_push(stack, 1, trees[4], false, stateD);
|
||||
|
||||
AssertThat(ts_stack_merge(stack, 0, 1), IsFalse());
|
||||
AssertThat(ts_stack_version_count(stack), Equals<size_t>(2));
|
||||
});
|
||||
|
||||
describe("when the merged versions have more than one common entry", [&]() {
|
||||
it("combines all of the top common entries", [&]() {
|
||||
// . <──0── A <──1── B <──3── D <──5── E*
|
||||
// ↑
|
||||
// └───2─── C <──4── D <──5── E*
|
||||
ts_stack_push(stack, 0, trees[3], false, stateD);
|
||||
ts_stack_push(stack, 0, trees[5], false, stateE);
|
||||
ts_stack_push(stack, 1, trees[4], false, stateD);
|
||||
ts_stack_push(stack, 1, trees[5], false, stateE);
|
||||
|
||||
// . <──0── A <──1── B <──3── D <──5── E*
|
||||
// ↑ |
|
||||
// └───2─── C <──4───┘
|
||||
AssertThat(ts_stack_merge(stack, 0, 1), IsTrue());
|
||||
AssertThat(ts_stack_version_count(stack), Equals<size_t>(1));
|
||||
AssertThat(get_stack_entries(stack, 0), Equals(vector<StackEntry>({
|
||||
{stateE, 0},
|
||||
{stateD, 1},
|
||||
{stateB, 2},
|
||||
{stateC, 2},
|
||||
{stateA, 3},
|
||||
{1, 4},
|
||||
})));
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
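// pop_count(version, count) removes `count` subtrees, returning slices that
// carry the popped trees; each stack position revealed by the pop becomes a
// new version.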
describe("pop_count(version, count)", [&]() {
|
||||
before_each([&]() {
|
||||
// . <──0── A <──1── B <──2── C*
|
||||
ts_stack_push(stack, 0, trees[0], false, stateA);
|
||||
ts_stack_push(stack, 0, trees[1], false, stateB);
|
||||
ts_stack_push(stack, 0, trees[2], false, stateC);
|
||||
});
|
||||
|
||||
it("creates a new version with the given number of entries removed", [&]() {
|
||||
// . <──0── A <──1── B <──2── C*
|
||||
// ↑
|
||||
// └─*
|
||||
StackPopResult pop = ts_stack_pop_count(stack, 0, 2);
|
||||
AssertThat(pop.stopped_at_error, Equals(false));
|
||||
AssertThat(pop.slices.size, Equals<size_t>(1));
|
||||
AssertThat(ts_stack_version_count(stack), Equals<size_t>(2));
|
||||
|
||||
StackSlice slice = pop.slices.contents[0];
|
||||
AssertThat(slice.version, Equals<StackVersion>(1));
|
||||
AssertThat(slice.trees, Equals(vector<Tree *>({ trees[1], trees[2] })));
|
||||
AssertThat(ts_stack_top_state(stack, 1), Equals(stateA));
|
||||
|
||||
free_slice_array(&pop.slices);
|
||||
});
|
||||
|
||||
it("does not count 'extra' trees toward the given count", [&]() {
|
||||
trees[1]->extra = true;
|
||||
|
||||
// . <──0── A <──1── B <──2── C*
|
||||
// ↑
|
||||
// └─*
|
||||
StackPopResult pop = ts_stack_pop_count(stack, 0, 2);
|
||||
AssertThat(pop.stopped_at_error, Equals(false));
|
||||
AssertThat(pop.slices.size, Equals<size_t>(1));
|
||||
|
||||
StackSlice slice = pop.slices.contents[0];
|
||||
AssertThat(slice.trees, Equals(vector<Tree *>({ trees[0], trees[1], trees[2] })));
|
||||
AssertThat(ts_stack_top_state(stack, 1), Equals(1));
|
||||
|
||||
free_slice_array(&pop.slices);
|
||||
});
|
||||
|
||||
it("stops popping entries early if it reaches an error tree", [&]() {
|
||||
// . <──0── A <──1── B <──2── C <──3── ERROR <──4── D*
|
||||
ts_stack_push(stack, 0, trees[3], false, ERROR_STATE);
|
||||
ts_stack_push(stack, 0, trees[4], false, stateD);
|
||||
|
||||
// . <──0── A <──1── B <──2── C <──3── ERROR <──4── D*
|
||||
// ↑
|
||||
// └─*
|
||||
StackPopResult pop = ts_stack_pop_count(stack, 0, 3);
|
||||
AssertThat(pop.stopped_at_error, Equals(true));
|
||||
|
||||
AssertThat(ts_stack_version_count(stack), Equals<size_t>(2));
|
||||
AssertThat(ts_stack_top_state(stack, 1), Equals(ERROR_STATE));
|
||||
|
||||
AssertThat(pop.slices.size, Equals<size_t>(1));
|
||||
StackSlice slice = pop.slices.contents[0];
|
||||
AssertThat(slice.version, Equals<StackVersion>(1));
|
||||
AssertThat(slice.trees, Equals(vector<Tree *>({ trees[4] })));
|
||||
|
||||
free_slice_array(&pop.slices);
|
||||
});
|
||||
|
||||
it("preserves the push count of the popped version", [&]() {
|
||||
// . <──0── A <──1── B <──2── C*
|
||||
// ↑
|
||||
// └─*
|
||||
StackPopResult pop = ts_stack_pop_count(stack, 0, 2);
|
||||
|
||||
AssertThat(ts_stack_push_count(stack, 0), Equals<unsigned>(3));
|
||||
AssertThat(ts_stack_push_count(stack, 1), Equals<unsigned>(3));
|
||||
|
||||
free_slice_array(&pop.slices);
|
||||
});
|
||||
|
||||
describe("when the version has been merged", [&]() {
|
||||
before_each([&]() {
|
||||
// . <──0── A <──1── B <──2── C <──3── D <──10── I*
|
||||
// ↑ |
|
||||
// └───4─── E <──5── F <──6───┘
|
||||
ts_stack_push(stack, 0, trees[3], false, stateD);
|
||||
StackPopResult pop = ts_stack_pop_count(stack, 0, 3);
|
||||
free_slice_array(&pop.slices);
|
||||
ts_stack_push(stack, 1, trees[4], false, stateE);
|
||||
ts_stack_push(stack, 1, trees[5], false, stateF);
|
||||
ts_stack_push(stack, 1, trees[6], false, stateD);
|
||||
ts_stack_merge(stack, 0, 1);
|
||||
ts_stack_push(stack, 0, trees[10], false, stateI);
|
||||
|
||||
AssertThat(ts_stack_version_count(stack), Equals<size_t>(1));
|
||||
AssertThat(get_stack_entries(stack, 0), Equals(vector<StackEntry>({
|
||||
{stateI, 0},
|
||||
{stateD, 1},
|
||||
{stateC, 2},
|
||||
{stateF, 2},
|
||||
{stateB, 3},
|
||||
{stateE, 3},
|
||||
{stateA, 4},
|
||||
{1, 5},
|
||||
})));
|
||||
});
|
||||
|
||||
describe("when there are two paths that reveal different versions", [&]() {
|
||||
it("returns an entry for each revealed version", [&]() {
|
||||
// . <──0── A <──1── B <──2── C <──3── D <──10── I*
|
||||
// ↑ ↑
|
||||
// | └*
|
||||
// |
|
||||
// └───4─── E*
|
||||
StackPopResult pop = ts_stack_pop_count(stack, 0, 3);
|
||||
AssertThat(pop.slices.size, Equals<size_t>(2));
|
||||
|
||||
StackSlice slice1 = pop.slices.contents[0];
|
||||
AssertThat(slice1.version, Equals<StackVersion>(1));
|
||||
AssertThat(slice1.trees, Equals(vector<Tree *>({ trees[2], trees[3], trees[10] })));
|
||||
|
||||
StackSlice slice2 = pop.slices.contents[1];
|
||||
AssertThat(slice2.version, Equals<StackVersion>(2));
|
||||
AssertThat(slice2.trees, Equals(vector<Tree *>({ trees[5], trees[6], trees[10] })));
|
||||
|
||||
AssertThat(ts_stack_version_count(stack), Equals<size_t>(3));
|
||||
AssertThat(get_stack_entries(stack, 0), Equals(vector<StackEntry>({
|
||||
{stateI, 0},
|
||||
{stateD, 1},
|
||||
{stateC, 2},
|
||||
{stateF, 2},
|
||||
{stateB, 3},
|
||||
{stateE, 3},
|
||||
{stateA, 4},
|
||||
{1, 5},
|
||||
})));
|
||||
AssertThat(get_stack_entries(stack, 1), Equals(vector<StackEntry>({
|
||||
{stateB, 0},
|
||||
{stateA, 1},
|
||||
{1, 2},
|
||||
})));
|
||||
AssertThat(get_stack_entries(stack, 2), Equals(vector<StackEntry>({
|
||||
{stateE, 0},
|
||||
{stateA, 1},
|
||||
{1, 2},
|
||||
})));
|
||||
|
||||
free_slice_array(&pop.slices);
|
||||
});
|
||||
});
|
||||
|
||||
describe("when there is one path that ends at a merged version", [&]() {
|
||||
it("returns a single entry", [&]() {
|
||||
// . <──0── A <──1── B <──2── C <──3── D <──10── I*
|
||||
// | |
|
||||
// └───5─── F <──6── G <──7───┘
|
||||
// |
|
||||
// └*
|
||||
StackPopResult pop = ts_stack_pop_count(stack, 0, 1);
|
||||
AssertThat(pop.slices.size, Equals<size_t>(1));
|
||||
|
||||
StackSlice slice1 = pop.slices.contents[0];
|
||||
AssertThat(slice1.version, Equals<StackVersion>(1));
|
||||
AssertThat(slice1.trees, Equals(vector<Tree *>({ trees[10] })));
|
||||
|
||||
AssertThat(ts_stack_version_count(stack), Equals<size_t>(2));
|
||||
AssertThat(ts_stack_top_state(stack, 0), Equals(stateI));
|
||||
AssertThat(ts_stack_top_state(stack, 1), Equals(stateD));
|
||||
|
||||
free_slice_array(&pop.slices);
|
||||
});
|
||||
});
|
||||
|
||||
describe("when there are two paths that converge on one version", [&]() {
|
||||
it("returns two slices with the same version", [&]() {
|
||||
// . <──0── A <──1── B <──2── C <──3── D <──10── I*
|
||||
// ↑ |
|
||||
// ├───4─── E <──5── F <──6───┘
|
||||
// |
|
||||
// └*
|
||||
StackPopResult pop = ts_stack_pop_count(stack, 0, 4);
|
||||
AssertThat(pop.slices.size, Equals<size_t>(2));
|
||||
|
||||
StackSlice slice1 = pop.slices.contents[0];
|
||||
AssertThat(slice1.version, Equals<StackVersion>(1));
|
||||
AssertThat(slice1.trees, Equals(vector<Tree *>({ trees[1], trees[2], trees[3], trees[10] })));
|
||||
|
||||
StackSlice slice2 = pop.slices.contents[1];
|
||||
AssertThat(slice2.version, Equals<StackVersion>(1));
|
||||
AssertThat(slice2.trees, Equals(vector<Tree *>({ trees[4], trees[5], trees[6], trees[10] })))
|
||||
|
||||
AssertThat(ts_stack_version_count(stack), Equals<size_t>(2));
|
||||
AssertThat(ts_stack_top_state(stack, 0), Equals(stateI));
|
||||
AssertThat(ts_stack_top_state(stack, 1), Equals(stateA));
|
||||
|
||||
free_slice_array(&pop.slices);
|
||||
});
|
||||
});
|
||||
|
||||
describe("when there are three paths that lead to three different versions", [&]() {
|
||||
it("returns three entries with different arrays of trees", [&]() {
|
||||
// . <──0── A <──1── B <──2── C <──3── D <──10── I*
|
||||
// ↑ |
|
||||
// ├───4─── E <──5── F <──6───┘
|
||||
// | |
|
||||
// └───7─── G <──8── H <──9───┘
|
||||
StackPopResult pop = ts_stack_pop_count(stack, 0, 4);
|
||||
free_slice_array(&pop.slices);
|
||||
ts_stack_push(stack, 1, trees[7], false, stateG);
|
||||
ts_stack_push(stack, 1, trees[8], false, stateH);
|
||||
ts_stack_push(stack, 1, trees[9], false, stateD);
|
||||
ts_stack_push(stack, 1, trees[10], false, stateI);
|
||||
ts_stack_merge(stack, 0, 1);
|
||||
|
||||
AssertThat(ts_stack_version_count(stack), Equals<size_t>(1));
|
||||
AssertThat(get_stack_entries(stack, 0), Equals(vector<StackEntry>({
|
||||
{stateI, 0},
|
||||
{stateD, 1},
|
||||
{stateC, 2},
|
||||
{stateF, 2},
|
||||
{stateH, 2},
|
||||
{stateB, 3},
|
||||
{stateE, 3},
|
||||
{stateG, 3},
|
||||
{stateA, 4},
|
||||
{1, 5},
|
||||
})));
|
||||
|
||||
// . <──0── A <──1── B <──2── C <──3── D <──10── I*
|
||||
// ↑ ↑
|
||||
// | └*
|
||||
// |
|
||||
// ├───4─── E <──5── F*
|
||||
// |
|
||||
// └───7─── G <──8── H*
|
||||
pop = ts_stack_pop_count(stack, 0, 2);
|
||||
AssertThat(pop.slices.size, Equals<size_t>(3));
|
||||
|
||||
StackSlice slice1 = pop.slices.contents[0];
|
||||
AssertThat(slice1.version, Equals<StackVersion>(1));
|
||||
AssertThat(slice1.trees, Equals(vector<Tree *>({ trees[3], trees[10] })))
|
||||
|
||||
StackSlice slice2 = pop.slices.contents[1];
|
||||
AssertThat(slice2.version, Equals<StackVersion>(2));
|
||||
AssertThat(slice2.trees, Equals(vector<Tree *>({ trees[6], trees[10] })))
|
||||
|
||||
StackSlice slice3 = pop.slices.contents[2];
|
||||
AssertThat(slice3.version, Equals<StackVersion>(3));
|
||||
AssertThat(slice3.trees, Equals(vector<Tree *>({ trees[9], trees[10] })))
|
||||
|
||||
AssertThat(ts_stack_version_count(stack), Equals<size_t>(4));
|
||||
AssertThat(ts_stack_top_state(stack, 0), Equals(stateI));
|
||||
AssertThat(ts_stack_top_state(stack, 1), Equals(stateC));
|
||||
AssertThat(ts_stack_top_state(stack, 2), Equals(stateF));
|
||||
AssertThat(ts_stack_top_state(stack, 3), Equals(stateH));
|
||||
|
||||
free_slice_array(&pop.slices);
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
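// pop_pending(version) pops the top subtree only if it was pushed with
// is_pending = true, skipping over any 'extra' trees above it.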
describe("pop_pending(version)", [&]() {
|
||||
before_each([&]() {
|
||||
ts_stack_push(stack, 0, trees[0], false, stateA);
|
||||
});
|
||||
|
||||
it("removes the top node from the stack if it was pushed in pending mode", [&]() {
|
||||
ts_stack_push(stack, 0, trees[1], true, stateB);
|
||||
|
||||
StackPopResult pop = ts_stack_pop_pending(stack, 0);
|
||||
AssertThat(pop.stopped_at_error, Equals(false));
|
||||
AssertThat(pop.slices.size, Equals<size_t>(1));
|
||||
|
||||
AssertThat(get_stack_entries(stack, 0), Equals(vector<StackEntry>({
|
||||
{stateA, 0},
|
||||
{1, 1},
|
||||
})));
|
||||
|
||||
free_slice_array(&pop.slices);
|
||||
});
|
||||
|
||||
it("skips entries whose trees are extra", [&]() {
|
||||
ts_stack_push(stack, 0, trees[1], true, stateB);
|
||||
|
||||
trees[2]->extra = true;
|
||||
trees[3]->extra = true;
|
||||
|
||||
ts_stack_push(stack, 0, trees[2], false, stateB);
|
||||
ts_stack_push(stack, 0, trees[3], false, stateB);
|
||||
|
||||
StackPopResult pop = ts_stack_pop_pending(stack, 0);
|
||||
AssertThat(pop.stopped_at_error, Equals(false));
|
||||
AssertThat(pop.slices.size, Equals<size_t>(1));
|
||||
|
||||
AssertThat(pop.slices.contents[0].trees, Equals(vector<Tree *>({ trees[1], trees[2], trees[3] })));
|
||||
|
||||
AssertThat(get_stack_entries(stack, 0), Equals(vector<StackEntry>({
|
||||
{stateA, 0},
|
||||
{1, 1},
|
||||
})));
|
||||
|
||||
free_slice_array(&pop.slices);
|
||||
});
|
||||
|
||||
it("does nothing if the top node was not pushed in pending mode", [&]() {
|
||||
ts_stack_push(stack, 0, trees[1], false, stateB);
|
||||
|
||||
StackPopResult pop = ts_stack_pop_pending(stack, 0);
|
||||
AssertThat(pop.stopped_at_error, Equals(false));
|
||||
AssertThat(pop.slices.size, Equals<size_t>(0));
|
||||
|
||||
AssertThat(get_stack_entries(stack, 0), Equals(vector<StackEntry>({
|
||||
{stateB, 0},
|
||||
{stateA, 1},
|
||||
{1, 2},
|
||||
})));
|
||||
|
||||
free_slice_array(&pop.slices);
|
||||
});
|
||||
});
|
||||
|
||||
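// An external token state is opaque scanner state attached to a stack
// version; versions carrying different states must never be merged.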
describe("setting external token state", [&]() {
|
||||
TSExternalTokenState external_token_state1, external_token_state2;
|
||||
|
||||
it("allows the state to be retrieved", [&]() {
|
||||
AssertThat(ts_stack_external_token_state(stack, 0), Equals(nullptr));
|
||||
|
||||
ts_stack_set_external_token_state(stack, 0, &external_token_state1);
|
||||
AssertThat(ts_stack_external_token_state(stack, 0), Equals(&external_token_state1));
|
||||
|
||||
ts_stack_copy_version(stack, 0);
|
||||
AssertThat(ts_stack_external_token_state(stack, 0), Equals(&external_token_state1));
|
||||
});
|
||||
|
||||
it("does not merge stack versions with different external token states", [&]() {
|
||||
ts_stack_copy_version(stack, 0);
|
||||
ts_stack_push(stack, 0, trees[0], false, 5);
|
||||
ts_stack_push(stack, 1, trees[0], false, 5);
|
||||
|
||||
ts_stack_set_external_token_state(stack, 0, &external_token_state1);
|
||||
ts_stack_set_external_token_state(stack, 0, &external_token_state2);
|
||||
|
||||
AssertThat(ts_stack_merge(stack, 0, 1), IsFalse());
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
END_TEST

bool operator==(const StackEntry &left, const StackEntry &right) {
return left.state == right.state && left.depth == right.depth;
}

std::ostream &operator<<(std::ostream &stream, const StackEntry &entry) {
return stream << "{" << entry.state << ", " << entry.depth << "}";
}

std::ostream &operator<<(std::ostream &stream, const TreeArray &array) {
stream << "[";
bool first = true;
for (size_t i = 0; i < array.size; i++) {
if (!first)
stream << ", ";
first = false;
stream << array.contents[i];
}
return stream << "]";
}
Some files were not shown because too many files have changed in this diff