Rename spec -> test

'Test' is a much more straightforward name.
This commit is contained in:
Max Brunsfeld 2017-03-09 20:40:01 -08:00
parent 7d8daf573e
commit 6dc0ff359d
109 changed files with 44 additions and 44 deletions

View file

@ -0,0 +1,89 @@
#include "test_helper.h"
#include "helpers/rule_helpers.h"
#include "helpers/stream_methods.h"
#include "compiler/rules/built_in_symbols.h"
#include "compiler/parse_table.h"
#include "compiler/build_tables/lex_conflict_manager.h"
#include "compiler/build_tables/lex_item.h"
using namespace rules;
using namespace build_tables;
START_TEST
// Unit tests for LexConflictManager::resolve, which arbitrates between two
// conflicting lexer actions (advance vs. accept-token, or two accept-token
// actions) and records bookkeeping about tokens that can shadow one another
// ("possible_homonyms") or be extended into longer tokens
// ("possible_extensions").
describe("LexConflictManager::resolve(new_action, old_action)", []() {
LexConflictManager conflict_manager;
// `resolve` returns true iff the *new* (first) action should win.
bool update;
// Four distinct terminal symbols used as stand-in tokens throughout.
Symbol sym1(0, Symbol::Terminal);
Symbol sym2(1, Symbol::Terminal);
Symbol sym3(2, Symbol::Terminal);
Symbol sym4(3, Symbol::Terminal);
// An item set whose only in-progress token is sym4; used by the
// advance/accept-token examples below.
LexItemSet item_set({ LexItem(sym4, blank() )});
before_each([&]() {
// Start each example with a fresh manager so recorded state can't leak
// between examples.
conflict_manager = LexConflictManager();
});
it("favors advance actions over empty accept token actions", [&]() {
update = conflict_manager.resolve(item_set, AdvanceAction(2, {0, 0}, true), AcceptTokenAction());
AssertThat(update, IsTrue());
});
describe("accept-token/accept-token conflicts", [&]() {
describe("when the tokens' precedence values differ", [&]() {
it("favors the token with higher precedence", [&]() {
// AcceptTokenAction(symbol, precedence, is_string): sym2 with
// precedence 1 loses to sym1 with precedence 2, in either order.
update = conflict_manager.resolve(AcceptTokenAction(sym2, 1, false), AcceptTokenAction(sym1, 2, false));
AssertThat(update, IsFalse());
update = conflict_manager.resolve(AcceptTokenAction(sym1, 2, false), AcceptTokenAction(sym2, 1, false));
AssertThat(update, IsTrue());
});
it("adds the preferred token as a possible homonym for the discarded one", [&]() {
conflict_manager.resolve(AcceptTokenAction(sym2, 1, false), AcceptTokenAction(sym1, 2, false));
AssertThat(conflict_manager.possible_homonyms[sym2.index], Contains(sym1.index));
});
});
describe("when one token is string-based and the other is regexp-based", [&]() {
it("favors the string-based token", [&]() {
// The trailing boolean marks the accept action as string-based —
// the `true` action wins in both directions here.
update = conflict_manager.resolve(AcceptTokenAction(sym1, 0, false), AcceptTokenAction(sym2, 0, true));
AssertThat(update, IsFalse());
update = conflict_manager.resolve(AcceptTokenAction(sym2, 0, true), AcceptTokenAction(sym1, 0, false));
AssertThat(update, IsTrue());
});
});
describe("when the tokens have equal precedence", [&]() {
it("favors the token listed earlier in the grammar", [&]() {
// Ties are broken by symbol index: sym1 (index 0) beats sym2 (index 1).
update = conflict_manager.resolve(AcceptTokenAction(sym2, 0, false), AcceptTokenAction(sym1, 0, false));
AssertThat(update, IsFalse());
update = conflict_manager.resolve(AcceptTokenAction(sym1, 0, false), AcceptTokenAction(sym2, 0, false));
AssertThat(update, IsTrue());
});
});
});
describe("advance/accept-token conflicts", [&]() {
describe("when the token to accept has higher precedence", [&]() {
it("prefers the accept-token action", [&]() {
AssertThat(conflict_manager.possible_extensions, IsEmpty());
// Accepting sym3 at precedence 3 beats an advance whose precedence
// range is {1, 2}, so nothing is recorded as a possible extension.
update = conflict_manager.resolve(item_set, AdvanceAction(1, { 1, 2 }, true), AcceptTokenAction(sym3, 3, true));
AssertThat(update, IsFalse());
AssertThat(conflict_manager.possible_extensions, IsEmpty());
});
});
describe("when the token to accept does not have a higher precedence", [&]() {
it("favors the advance action and adds the in-progress tokens as possible extensions of the discarded token", [&]() {
// The item set's in-progress token (sym4) may extend the discarded
// token sym3, so that relationship is recorded.
update = conflict_manager.resolve(item_set, AdvanceAction(1, { 1, 2 }, true), AcceptTokenAction(sym3, 2, true));
AssertThat(update, IsTrue());
AssertThat(conflict_manager.possible_extensions[sym3.index], Contains(sym4.index));
});
});
});
});
END_TEST

View file

@ -0,0 +1,514 @@
#include "test_helper.h"
#include "compiler/build_tables/lex_item.h"
#include "compiler/rules/metadata.h"
#include "compiler/rules.h"
#include "helpers/rule_helpers.h"
#include "helpers/stream_methods.h"
using namespace rules;
using namespace build_tables;
typedef LexItemSet::Transition Transition;
START_TEST
// Tests for LexItem::completion_status(), which reports whether a lex item's
// rule can match the empty string at its current position ("done") and, if
// so, with what precedence range.
describe("LexItem", []() {
describe("completion_status()", [&]() {
it("indicates whether the item is done and its precedence", [&]() {
// A bare character set still requires input, so the item is not done.
LexItem item1(Symbol(0, Symbol::Terminal), character({ 'a', 'b', 'c' }));
AssertThat(item1.completion_status().is_done, IsFalse());
AssertThat(item1.completion_status().precedence, Equals(PrecedenceRange()));
MetadataParams params;
params.precedence = 3;
params.has_precedence = true;
params.is_string = 1;
// A choice with a precedence-annotated blank branch can complete
// immediately, and reports that branch's precedence.
LexItem item2(Symbol(0, Symbol::Terminal), choice({
metadata(blank(), params),
character({ 'a', 'b', 'c' })
}));
AssertThat(item2.completion_status().is_done, IsTrue());
AssertThat(item2.completion_status().precedence, Equals(PrecedenceRange(3)));
// A repeat may match zero occurrences, so it is also complete, with no
// precedence applied.
LexItem item3(Symbol(0, Symbol::Terminal), repeat(character({ ' ', '\t' })));
AssertThat(item3.completion_status().is_done, IsTrue());
AssertThat(item3.completion_status().precedence, Equals(PrecedenceRange()));
});
});
});
// Tests for LexItemSet::transitions(): given a set of in-progress lex items,
// compute a map from *disjoint* character sets to a Transition holding the
// successor item set, the precedence range applied when consuming those
// characters, and whether the transition is within the main token (as
// opposed to a separator).
describe("LexItemSet::transitions()", [&]() {
it("handles single characters", [&]() {
LexItemSet item_set({
LexItem(Symbol(1, Symbol::NonTerminal), character({ 'x' })),
});
AssertThat(
item_set.transitions(),
Equals(LexItemSet::TransitionMap({
{
CharacterSet().include('x'),
Transition{
LexItemSet({
// Consuming 'x' leaves a completed (blank) item.
LexItem(Symbol(1, Symbol::NonTerminal), blank()),
}),
PrecedenceRange(),
false
}
}
})));
});
it("marks transitions that are within the main token (as opposed to separators)", [&]() {
MetadataParams params;
params.is_main_token = true;
LexItemSet item_set({
LexItem(Symbol(1, Symbol::NonTerminal), metadata(character({ 'x' }), params)),
});
AssertThat(
item_set.transitions(),
Equals(LexItemSet::TransitionMap({
{
CharacterSet().include('x'),
Transition{
LexItemSet({
LexItem(Symbol(1, Symbol::NonTerminal), metadata(blank(), params)),
}),
PrecedenceRange(),
// The in-main-token flag is propagated from the metadata rule.
true
}
}
})));
});
it("handles sequences", [&]() {
LexItemSet item_set({
LexItem(Symbol(1, Symbol::NonTerminal), seq({
character({ 'w' }),
character({ 'x' }),
character({ 'y' }),
character({ 'z' }),
})),
});
AssertThat(
item_set.transitions(),
Equals(LexItemSet::TransitionMap({
{
CharacterSet().include('w'),
Transition{
LexItemSet({
// Only the head of the sequence is consumed.
LexItem(Symbol(1, Symbol::NonTerminal), seq({
character({ 'x' }),
character({ 'y' }),
character({ 'z' }),
})),
}),
PrecedenceRange(),
false
}
}
})));
});
it("handles sequences with nested precedence", [&]() {
LexItemSet item_set({
LexItem(Symbol(1, Symbol::NonTerminal), seq({
prec(3, seq({
character({ 'v' }),
prec(4, seq({
character({ 'w' }),
character({ 'x' }) })),
character({ 'y' }) })),
character({ 'z' }),
})),
});
// This example walks the item set through four consecutive transitions,
// checking at each step which precedence becomes 'active' and which is
// applied to the transition itself.
auto transitions = item_set.transitions();
AssertThat(
transitions,
Equals(LexItemSet::TransitionMap({
{
CharacterSet().include('v'),
Transition{
// The outer precedence is now 'active', because we are within its
// contained rule.
LexItemSet({
LexItem(Symbol(1, Symbol::NonTerminal), seq({
active_prec(3, seq({
prec(4, seq({
character({ 'w' }),
character({ 'x' }) })),
character({ 'y' }) })),
character({ 'z' }),
})),
}),
// No precedence is applied upon entering a rule.
PrecedenceRange(),
false
}
}
})));
LexItemSet item_set2 = transitions[CharacterSet().include('v')].destination;
transitions = item_set2.transitions();
AssertThat(
transitions,
Equals(LexItemSet::TransitionMap({
{
CharacterSet().include('w'),
Transition{
// The inner precedence is now 'active'
LexItemSet({
LexItem(Symbol(1, Symbol::NonTerminal), seq({
active_prec(3, seq({
active_prec(4, character({ 'x' })),
character({ 'y' }) })),
character({ 'z' }),
})),
}),
// The outer precedence is applied.
PrecedenceRange(3),
false
}
}
})));
LexItemSet item_set3 = transitions[CharacterSet().include('w')].destination;
transitions = item_set3.transitions();
AssertThat(
transitions,
Equals(LexItemSet::TransitionMap({
{
CharacterSet().include('x'),
Transition{
LexItemSet({
LexItem(Symbol(1, Symbol::NonTerminal), seq({
active_prec(3, character({ 'y' })),
character({ 'z' }),
})),
}),
// The inner precedence is applied.
PrecedenceRange(4),
false
}
}
})));
LexItemSet item_set4 = transitions[CharacterSet().include('x')].destination;
transitions = item_set4.transitions();
AssertThat(
transitions,
Equals(LexItemSet::TransitionMap({
{
CharacterSet().include('y'),
Transition{
LexItemSet({
LexItem(Symbol(1, Symbol::NonTerminal), character({ 'z' })),
}),
PrecedenceRange(3),
false
}
}
})));
});
it("handles sequences where the left hand side can be blank", [&]() {
LexItemSet item_set({
LexItem(Symbol(1, Symbol::NonTerminal), seq({
choice({
character({ 'x' }),
blank(),
}),
character({ 'y' }),
character({ 'z' }),
})),
});
// Because the leading choice can be blank, 'y' is also a valid first
// character, so two disjoint transitions are produced.
AssertThat(
item_set.transitions(),
Equals(LexItemSet::TransitionMap({
{
CharacterSet().include('x'),
Transition{
LexItemSet({
LexItem(Symbol(1, Symbol::NonTerminal), seq({
character({ 'y' }),
character({ 'z' }),
})),
}),
PrecedenceRange(),
false
}
},
{
CharacterSet().include('y'),
Transition{
LexItemSet({
LexItem(Symbol(1, Symbol::NonTerminal), character({ 'z' })),
}),
PrecedenceRange(),
false
}
}
})));
});
it("handles blanks", [&]() {
// A fully-completed item has nowhere to go.
LexItemSet item_set({
LexItem(Symbol(1, Symbol::NonTerminal), blank()),
});
AssertThat(item_set.transitions(), IsEmpty());
});
it("handles repeats", [&]() {
LexItemSet item_set({
LexItem(Symbol(1, Symbol::NonTerminal), repeat1(seq({
character({ 'a' }),
character({ 'b' }),
}))),
LexItem(Symbol(2, Symbol::NonTerminal), repeat1(character({ 'c' }))),
});
// Each repeat expands into two successor items: one that loops back into
// the repeat and one that finishes after the current iteration.
AssertThat(
item_set.transitions(),
Equals(LexItemSet::TransitionMap({
{
CharacterSet().include('a'),
Transition{
LexItemSet({
LexItem(Symbol(1, Symbol::NonTerminal), seq({
character({ 'b' }),
repeat1(seq({
character({ 'a' }),
character({ 'b' }),
}))
})),
LexItem(Symbol(1, Symbol::NonTerminal), character({ 'b' })),
}),
PrecedenceRange(),
false
}
},
{
CharacterSet().include('c'),
Transition{
LexItemSet({
LexItem(Symbol(2, Symbol::NonTerminal), repeat1(character({ 'c' }))),
LexItem(Symbol(2, Symbol::NonTerminal), blank()),
}),
PrecedenceRange(),
false
}
}
})));
});
it("handles repeats with precedence", [&]() {
LexItemSet item_set({
LexItem(Symbol(1, Symbol::NonTerminal), active_prec(-1, repeat1(character({ 'a' }))))
});
AssertThat(
item_set.transitions(),
Equals(LexItemSet::TransitionMap({
{
CharacterSet().include('a'),
Transition{
LexItemSet({
LexItem(Symbol(1, Symbol::NonTerminal), active_prec(-1, repeat1(character({ 'a' })))),
LexItem(Symbol(1, Symbol::NonTerminal), active_prec(-1, blank())),
}),
// The already-active (negative) precedence applies to the transition.
PrecedenceRange(-1),
false
}
}
})));
});
it("handles choices between overlapping character sets", [&]() {
LexItemSet item_set({
LexItem(Symbol(1, Symbol::NonTerminal), choice({
active_prec(2, seq({
character({ 'a', 'b', 'c', 'd' }),
character({ 'x' }),
})),
active_prec(3, seq({
character({ 'c', 'd', 'e', 'f' }),
character({ 'y' }),
})),
}))
});
// Overlapping sets are split into three disjoint ranges; the overlap
// ('c', 'd') leads to both branches and merges their precedences.
AssertThat(
item_set.transitions(),
Equals(LexItemSet::TransitionMap({
{
CharacterSet().include('a', 'b'),
Transition{
LexItemSet({
LexItem(Symbol(1, Symbol::NonTerminal), active_prec(2, character({ 'x' }))),
}),
PrecedenceRange(2),
false
}
},
{
CharacterSet().include('c', 'd'),
Transition{
LexItemSet({
LexItem(Symbol(1, Symbol::NonTerminal), active_prec(2, character({ 'x' }))),
LexItem(Symbol(1, Symbol::NonTerminal), active_prec(3, character({ 'y' }))),
}),
PrecedenceRange(2, 3),
false
}
},
{
CharacterSet().include('e', 'f'),
Transition{
LexItemSet({
LexItem(Symbol(1, Symbol::NonTerminal), active_prec(3, character({ 'y' }))),
}),
PrecedenceRange(3),
false
}
},
})));
});
it("handles choices between a subset and a superset of characters", [&]() {
LexItemSet item_set({
LexItem(Symbol(1, Symbol::NonTerminal), choice({
seq({
character({ 'b', 'c', 'd' }),
character({ 'x' }),
}),
seq({
character({ 'a', 'b', 'c', 'd', 'e', 'f' }),
character({ 'y' }),
}),
})),
});
AssertThat(
item_set.transitions(),
Equals(LexItemSet::TransitionMap({
{
// Characters only in the superset lead only to the second branch.
CharacterSet().include('a').include('e', 'f'),
Transition{
LexItemSet({
LexItem(Symbol(1, Symbol::NonTerminal), character({ 'y' })),
}),
PrecedenceRange(),
false
}
},
{
// The shared characters lead to both branches.
CharacterSet().include('b', 'd'),
Transition{
LexItemSet({
LexItem(Symbol(1, Symbol::NonTerminal), character({ 'x' })),
LexItem(Symbol(1, Symbol::NonTerminal), character({ 'y' })),
}),
PrecedenceRange(),
false
}
},
})));
});
it("handles choices between whitelisted and blacklisted character sets", [&]() {
LexItemSet item_set({
// Models a regex-literal-like rule: any non-'/' character (the second
// argument `false` makes the set an exclusion), or an escaped '/'.
LexItem(Symbol(1, Symbol::NonTerminal), seq({
choice({
character({ '/' }, false),
seq({
character({ '\\' }),
character({ '/' }),
}),
}),
character({ '/' }),
}))
});
AssertThat(
item_set.transitions(),
Equals(LexItemSet::TransitionMap({
{
// '\\' is carved out of the negated set because it also starts the
// escape branch.
CharacterSet().include_all().exclude('/').exclude('\\'),
Transition{
LexItemSet({
LexItem(Symbol(1, Symbol::NonTerminal), character({ '/' })),
}),
PrecedenceRange(),
false
}
},
{
CharacterSet().include('\\'),
Transition{
LexItemSet({
LexItem(Symbol(1, Symbol::NonTerminal), character({ '/' })),
LexItem(Symbol(1, Symbol::NonTerminal), seq({ character({ '/' }), character({ '/' }) })),
}),
PrecedenceRange(),
false
}
},
})));
});
it("handles different items with overlapping character sets", [&]() {
LexItemSet set1({
LexItem(Symbol(1, Symbol::NonTerminal), character({ 'a', 'b', 'c', 'd', 'e', 'f' })),
LexItem(Symbol(2, Symbol::NonTerminal), character({ 'e', 'f', 'g', 'h', 'i' }))
});
// As with overlapping choices, the sets are split so each transition's
// characters are disjoint; the overlap advances both items.
AssertThat(set1.transitions(), Equals(LexItemSet::TransitionMap({
{
CharacterSet().include('a', 'd'),
Transition{
LexItemSet({
LexItem(Symbol(1, Symbol::NonTerminal), blank()),
}),
PrecedenceRange(),
false
}
},
{
CharacterSet().include('e', 'f'),
Transition{
LexItemSet({
LexItem(Symbol(1, Symbol::NonTerminal), blank()),
LexItem(Symbol(2, Symbol::NonTerminal), blank()),
}),
PrecedenceRange(),
false
}
},
{
CharacterSet().include('g', 'i'),
Transition{
LexItemSet({
LexItem(Symbol(2, Symbol::NonTerminal), blank()),
}),
PrecedenceRange(),
false
}
},
})));
});
});
END_TEST

View file

@ -0,0 +1,134 @@
#include "test_helper.h"
#include "compiler/syntax_grammar.h"
#include "compiler/lexical_grammar.h"
#include "compiler/build_tables/parse_item_set_builder.h"
#include "compiler/build_tables/lookahead_set.h"
#include "compiler/rules/built_in_symbols.h"
#include "helpers/rule_helpers.h"
using namespace build_tables;
using namespace rules;
START_TEST
// Tests for ParseItemSetBuilder::apply_transitive_closure, which expands a
// kernel parse-item set by adding items at position 0 for every non-terminal
// that appears immediately after a dot, with the correct lookahead sets.
describe("ParseItemSetBuilder", []() {
// A pool of 20 trivial named tokens so the grammars below can reference
// terminal symbols 10..15 without constructing each one by hand.
vector<LexicalVariable> lexical_variables;
for (size_t i = 0; i < 20; i++) {
lexical_variables.push_back({
"token_" + to_string(i),
VariableTypeNamed,
blank(),
false
});
}
LexicalGrammar lexical_grammar{lexical_variables, {}};
it("adds items at the beginnings of referenced rules", [&]() {
SyntaxGrammar grammar{{
SyntaxVariable{"rule0", VariableTypeNamed, {
Production({
{Symbol(1, Symbol::NonTerminal), 0, AssociativityNone},
{Symbol(11, Symbol::Terminal), 0, AssociativityNone},
}),
}},
SyntaxVariable{"rule1", VariableTypeNamed, {
Production({
{Symbol(12, Symbol::Terminal), 0, AssociativityNone},
{Symbol(13, Symbol::Terminal), 0, AssociativityNone},
}),
Production({
{Symbol(2, Symbol::NonTerminal), 0, AssociativityNone},
})
}},
SyntaxVariable{"rule2", VariableTypeNamed, {
Production({
{Symbol(14, Symbol::Terminal), 0, AssociativityNone},
{Symbol(15, Symbol::Terminal), 0, AssociativityNone},
})
}},
}, {}, {}, {}};
// Shorthand accessor for the grammar's productions by index.
auto production = [&](int variable_index, int production_index) -> const Production & {
return grammar.variables[variable_index].productions[production_index];
};
// Kernel: rule0's production at position 0, lookahead token 10.
ParseItemSet item_set({
{
ParseItem(Symbol(0, Symbol::NonTerminal), production(0, 0), 0),
LookaheadSet({ Symbol(10, Symbol::Terminal) }),
}
});
ParseItemSetBuilder item_set_builder(grammar, lexical_grammar);
item_set_builder.apply_transitive_closure(&item_set);
// The closure adds rule1's productions (rule1 follows the dot in rule0,
// with token 11 following it), and in turn rule2's production (reached
// through rule1's second production, inheriting the same lookahead).
AssertThat(item_set, Equals(ParseItemSet({
{
ParseItem(Symbol(0, Symbol::NonTerminal), production(0, 0), 0),
LookaheadSet({ Symbol(10, Symbol::Terminal) })
},
{
ParseItem(Symbol(1, Symbol::NonTerminal), production(1, 0), 0),
LookaheadSet({ Symbol(11, Symbol::Terminal) })
},
{
ParseItem(Symbol(1, Symbol::NonTerminal), production(1, 1), 0),
LookaheadSet({ Symbol(11, Symbol::Terminal) })
},
{
ParseItem(Symbol(2, Symbol::NonTerminal), production(2, 0), 0),
LookaheadSet({ Symbol(11, Symbol::Terminal) })
},
})));
});
it("handles rules with empty productions", [&]() {
SyntaxGrammar grammar{{
SyntaxVariable{"rule0", VariableTypeNamed, {
Production({
{Symbol(1, Symbol::NonTerminal), 0, AssociativityNone},
{Symbol(11, Symbol::Terminal), 0, AssociativityNone},
}),
}},
SyntaxVariable{"rule1", VariableTypeNamed, {
Production({
{Symbol(12, Symbol::Terminal), 0, AssociativityNone},
{Symbol(13, Symbol::Terminal), 0, AssociativityNone},
}),
// An empty production: rule1 may match nothing at all.
Production({})
}},
}, {}, {}, {}};
auto production = [&](int variable_index, int production_index) -> const Production & {
return grammar.variables[variable_index].productions[production_index];
};
ParseItemSet item_set({
{
ParseItem(Symbol(0, Symbol::NonTerminal), production(0, 0), 0),
LookaheadSet({ Symbol(10, Symbol::Terminal) }),
}
});
ParseItemSetBuilder item_set_builder(grammar, lexical_grammar);
item_set_builder.apply_transitive_closure(&item_set);
// The empty production is included as an (already-complete) item with the
// inherited lookahead, and does not derail the closure.
AssertThat(item_set, Equals(ParseItemSet({
{
ParseItem(Symbol(0, Symbol::NonTerminal), production(0, 0), 0),
LookaheadSet({ Symbol(10, Symbol::Terminal) })
},
{
ParseItem(Symbol(1, Symbol::NonTerminal), production(1, 0), 0),
LookaheadSet({ Symbol(11, Symbol::Terminal) })
},
{
ParseItem(Symbol(1, Symbol::NonTerminal), production(1, 1), 0),
LookaheadSet({ Symbol(11, Symbol::Terminal) })
},
})));
});
});
END_TEST

View file

@ -0,0 +1,60 @@
#include "test_helper.h"
#include "compiler/build_tables/rule_can_be_blank.h"
#include "compiler/rules/metadata.h"
#include "compiler/rules.h"
#include "helpers/rule_helpers.h"
using namespace rules;
using build_tables::rule_can_be_blank;
START_TEST
// Tests for build_tables::rule_can_be_blank, a predicate that reports whether
// a rule can match the empty string.
describe("rule_can_be_blank", [&]() {
rule_ptr rule;
it("returns false for basic rules", [&]() {
// Symbols, strings, and patterns all require at least some input.
AssertThat(rule_can_be_blank(i_sym(3)), IsFalse());
AssertThat(rule_can_be_blank(str("x")), IsFalse());
AssertThat(rule_can_be_blank(pattern("x")), IsFalse());
});
it("returns true for blanks", [&]() {
AssertThat(rule_can_be_blank(blank()), IsTrue());
});
it("returns true for repeats", [&]() {
// `repeat` allows zero occurrences (unlike repeat1).
AssertThat(rule_can_be_blank(repeat(str("x"))), IsTrue());
});
it("returns true for choices iff one or more sides can be blank", [&]() {
rule = choice({ sym("x"), blank() });
AssertThat(rule_can_be_blank(rule), IsTrue());
rule = choice({ blank(), sym("x") });
AssertThat(rule_can_be_blank(rule), IsTrue());
rule = choice({ sym("x"), sym("y") });
AssertThat(rule_can_be_blank(rule), IsFalse());
});
it("returns true for sequences iff both sides can be blank", [&]() {
rule = seq({ blank(), str("x") });
AssertThat(rule_can_be_blank(rule), IsFalse());
rule = seq({ str("x"), blank() });
AssertThat(rule_can_be_blank(rule), IsFalse());
rule = seq({ blank(), choice({ sym("x"), blank() }) });
AssertThat(rule_can_be_blank(rule), IsTrue());
});
it("ignores metadata rules", [&]() {
// Metadata wrappers are transparent: the answer depends only on the
// wrapped rule.
rule = make_shared<rules::Metadata>(blank(), MetadataParams());
AssertThat(rule_can_be_blank(rule), IsTrue());
rule = make_shared<rules::Metadata>(sym("one"), MetadataParams());
AssertThat(rule_can_be_blank(rule), IsFalse());
});
});
END_TEST

View file

@ -0,0 +1,171 @@
#include "test_helper.h"
#include "compiler/prepare_grammar/initial_syntax_grammar.h"
#include "compiler/prepare_grammar/expand_repeats.h"
#include "helpers/rule_helpers.h"
#include "helpers/stream_methods.h"
START_TEST
using namespace rules;
using prepare_grammar::InitialSyntaxGrammar;
using prepare_grammar::expand_repeats;
// Tests for prepare_grammar::expand_repeats, which rewrites `repeat1` rules
// into auxiliary left-recursive variables (named "<rule>_repeatN") so that
// the resulting grammar contains no repeat nodes.
describe("expand_repeats", []() {
it("replaces repeat rules with pairs of recursive rules", [&]() {
InitialSyntaxGrammar grammar{
{
Variable{"rule0", VariableTypeNamed, repeat1(i_token(0))},
},
{}, {}, {}
};
auto result = expand_repeats(grammar);
// rule0 now refers to a new auxiliary variable (symbol 1) which matches
// either "itself followed by the token" or the token alone.
AssertThat(result.variables, Equals(vector<Variable>{
Variable{"rule0", VariableTypeNamed, i_sym(1)},
Variable{"rule0_repeat1", VariableTypeAuxiliary, choice({
seq({ i_sym(1), i_token(0) }),
i_token(0),
})},
}));
});
it("replaces repeats inside of sequences", [&]() {
InitialSyntaxGrammar grammar{
{
Variable{"rule0", VariableTypeNamed, seq({
i_token(10),
repeat1(i_token(11)),
})},
},
{}, {}, {}
};
auto result = expand_repeats(grammar);
AssertThat(result.variables, Equals(vector<Variable>{
Variable{"rule0", VariableTypeNamed, seq({
i_token(10),
i_sym(1),
})},
Variable{"rule0_repeat1", VariableTypeAuxiliary, choice({
seq({ i_sym(1), i_token(11) }),
i_token(11)
})},
}));
});
it("replaces repeats inside of choices", [&]() {
InitialSyntaxGrammar grammar{
{
Variable{"rule0", VariableTypeNamed, choice({
i_token(10),
repeat1(i_token(11))
})},
},
{}, {}, {}
};
auto result = expand_repeats(grammar);
AssertThat(result.variables, Equals(vector<Variable>{
Variable{"rule0", VariableTypeNamed, choice({
i_token(10),
i_sym(1),
})},
Variable{"rule0_repeat1", VariableTypeAuxiliary, choice({
seq({ i_sym(1), i_token(11) }),
i_token(11),
})},
}));
});
it("does not create redundant auxiliary rules", [&]() {
// Three occurrences of repeat1(i_token(4)) across two rules should share
// a single auxiliary variable.
InitialSyntaxGrammar grammar{
{
Variable{"rule0", VariableTypeNamed, choice({
seq({ i_token(1), repeat1(i_token(4)) }),
seq({ i_token(2), repeat1(i_token(4)) }),
})},
Variable{"rule1", VariableTypeNamed, seq({
i_token(3),
repeat1(i_token(4))
})},
},
{}, {}, {}
};
auto result = expand_repeats(grammar);
AssertThat(result.variables, Equals(vector<Variable>{
Variable{"rule0", VariableTypeNamed, choice({
seq({ i_token(1), i_sym(2) }),
seq({ i_token(2), i_sym(2) }),
})},
Variable{"rule1", VariableTypeNamed, seq({
i_token(3),
i_sym(2),
})},
Variable{"rule0_repeat1", VariableTypeAuxiliary, choice({
seq({ i_sym(2), i_token(4) }),
i_token(4),
})},
}));
});
it("can replace multiple repeats in the same rule", [&]() {
InitialSyntaxGrammar grammar{
{
Variable{"rule0", VariableTypeNamed, seq({
repeat1(i_token(10)),
repeat1(i_token(11)),
})},
},
{}, {}, {}
};
auto result = expand_repeats(grammar);
// Distinct repeats get distinct auxiliary variables, numbered in order.
AssertThat(result.variables, Equals(vector<Variable>{
Variable{"rule0", VariableTypeNamed, seq({
i_sym(1),
i_sym(2),
})},
Variable{"rule0_repeat1", VariableTypeAuxiliary, choice({
seq({ i_sym(1), i_token(10) }),
i_token(10),
})},
Variable{"rule0_repeat2", VariableTypeAuxiliary, choice({
seq({ i_sym(2), i_token(11) }),
i_token(11),
})},
}));
});
it("can replace repeats in multiple rules", [&]() {
InitialSyntaxGrammar grammar{
{
Variable{"rule0", VariableTypeNamed, repeat1(i_token(10))},
Variable{"rule1", VariableTypeNamed, repeat1(i_token(11))},
},
{}, {}, {}
};
auto result = expand_repeats(grammar);
// Auxiliary variables are appended after all original rules, so the new
// symbols are numbered 2 and 3.
AssertThat(result.variables, Equals(vector<Variable>{
Variable{"rule0", VariableTypeNamed, i_sym(2)},
Variable{"rule1", VariableTypeNamed, i_sym(3)},
Variable{"rule0_repeat1", VariableTypeAuxiliary, choice({
seq({ i_sym(2), i_token(10) }),
i_token(10),
})},
Variable{"rule1_repeat1", VariableTypeAuxiliary, choice({
seq({ i_sym(3), i_token(11) }),
i_token(11),
})},
}));
});
});
END_TEST

View file

@ -0,0 +1,169 @@
#include "test_helper.h"
#include "compiler/lexical_grammar.h"
#include "compiler/prepare_grammar/expand_tokens.h"
#include "helpers/rule_helpers.h"
START_TEST
using namespace rules;
using prepare_grammar::expand_tokens;
// Tests for prepare_grammar::expand_tokens, which lowers string and regexp
// rules in the lexical grammar into trees of character-set rules, returning
// the transformed grammar plus a CompileError (none on success).
describe("expand_tokens", []() {
// Strings are expanded into metadata-wrapped sequences carrying these flags.
MetadataParams string_token_params;
string_token_params.is_string = true;
string_token_params.is_token = true;
describe("string rules", [&]() {
it("replaces strings with sequences of character sets", [&]() {
LexicalGrammar grammar{
{
LexicalVariable{
"rule_A",
VariableTypeNamed,
seq({
i_sym(10),
str("xyz"),
i_sym(11),
}),
false
}
},
{}
};
auto result = expand_tokens(grammar);
AssertThat(result.second, Equals(CompileError::none()));
AssertThat(result.first.variables, Equals(vector<LexicalVariable>{
LexicalVariable{
"rule_A",
VariableTypeNamed,
seq({
i_sym(10),
// The string becomes one character rule per character, wrapped
// in metadata marking it as a string token.
metadata(seq({
character({ 'x' }),
character({ 'y' }),
character({ 'z' }),
}), string_token_params),
i_sym(11),
}),
false
}
}));
});
it("handles strings containing non-ASCII UTF8 characters", [&]() {
LexicalGrammar grammar{
{
LexicalVariable{
"rule_A",
VariableTypeNamed,
str("\u03B1 \u03B2"),
false
},
},
{}
};
auto result = expand_tokens(grammar);
// Each multi-byte UTF-8 character becomes a single character rule with
// its Unicode code point (945 = U+03B1, 946 = U+03B2).
AssertThat(result.first.variables, Equals(vector<LexicalVariable>{
LexicalVariable{
"rule_A",
VariableTypeNamed,
metadata(seq({
character({ 945 }),
character({ ' ' }),
character({ 946 }),
}), string_token_params),
false
}
}));
});
});
describe("regexp rules", [&]() {
it("replaces regexps with the equivalent rule tree", [&]() {
LexicalGrammar grammar{
{
LexicalVariable{
"rule_A",
VariableTypeNamed,
seq({
i_sym(10),
pattern("x*"),
i_sym(11),
}),
false
}
},
{}
};
auto result = expand_tokens(grammar);
AssertThat(result.second, Equals(CompileError::none()));
// Note: unlike strings, regexps are not wrapped in string metadata.
AssertThat(result.first.variables, Equals(vector<LexicalVariable>{
LexicalVariable{
"rule_A",
VariableTypeNamed,
seq({
i_sym(10),
repeat(character({ 'x' })),
i_sym(11),
}),
false
}
}));
});
it("handles regexps containing non-ASCII UTF8 characters", [&]() {
LexicalGrammar grammar{
{
LexicalVariable{
"rule_A",
VariableTypeNamed,
pattern("[^\u03B1-\u03B4]*"),
false
}
},
{}
};
auto result = expand_tokens(grammar);
// The negated class becomes an excluded character set (second argument
// `false`) over code points 945..948 (U+03B1..U+03B4).
AssertThat(result.first.variables, Equals(vector<LexicalVariable>{
LexicalVariable{
"rule_A",
VariableTypeNamed,
repeat(character({ 945, 946, 947, 948 }, false)),
false
}
}));
});
it("returns an error when the grammar contains an invalid regex", [&]() {
LexicalGrammar grammar{
{
LexicalVariable{
"rule_A",
VariableTypeNamed,
seq({
pattern("("),
str("xyz"),
pattern("["),
}),
false
},
},
{}
};
auto result = expand_tokens(grammar);
// Only the first invalid pattern is reported.
AssertThat(result.second, Equals(CompileError(TSCompileErrorTypeInvalidRegex, "unmatched open paren")));
});
});
});
END_TEST

View file

@ -0,0 +1,106 @@
#include "test_helper.h"
#include "compiler/prepare_grammar/extract_choices.h"
#include "helpers/rule_helpers.h"
START_TEST
using namespace rules;
using prepare_grammar::extract_choices;
// Helper vector of rule pointers whose equality compares the pointed-to
// rules by *value* (via Rule::operator==) rather than by pointer identity,
// so test expectations can be written with freshly-constructed rules.
class rule_vector : public vector<rule_ptr> {
 public:
  bool operator==(const vector<rule_ptr> &other) const {
    if (this->size() != other.size()) return false;
    for (size_t i = 0; i < this->size(); i++) {
      auto rule = this->operator[](i);
      auto other_rule = other[i];
      // BUG FIX: this previously compared `*rule` against itself
      // (`rule->operator==(*rule)`), which is vacuously true, so the whole
      // comparison degenerated to a size check. Compare against the
      // corresponding element of `other` instead.
      if (!rule->operator==(*other_rule))
        return false;
    }
    return true;
  }

  // Allow brace-list construction, e.g. rule_vector({ sym("a"), sym("b") }).
  rule_vector(const initializer_list<rule_ptr> &list) :
    vector<rule_ptr>(list) {}
};
// Tests for prepare_grammar::extract_choices, which rewrites one rule into
// the list of choice-free alternatives it denotes (distributing choices over
// the surrounding sequence/metadata structure).
describe("extract_choices", []() {
it("expands rules containing choices into multiple rules", [&]() {
auto rule = seq({
sym("a"),
choice({ sym("b"), sym("c"), sym("d") }),
sym("e")
});
// One output rule per alternative of the embedded choice.
AssertThat(extract_choices(rule), Equals(rule_vector({
seq({ sym("a"), sym("b"), sym("e") }),
seq({ sym("a"), sym("c"), sym("e") }),
seq({ sym("a"), sym("d"), sym("e") }),
})));
});
it("handles metadata rules", [&]() {
// Metadata (here, precedence) is duplicated onto each alternative.
auto rule = prec(5, choice({ sym("b"), sym("c"), sym("d") }));
AssertThat(extract_choices(rule), Equals(rule_vector({
prec(5, sym("b")),
prec(5, sym("c")),
prec(5, sym("d")),
})));
});
it("handles nested choices", [&]() {
auto rule = choice({
seq({ choice({ sym("a"), sym("b") }), sym("c") }),
sym("d")
});
AssertThat(extract_choices(rule), Equals(rule_vector({
seq({ sym("a"), sym("c") }),
seq({ sym("b"), sym("c") }),
sym("d"),
})));
});
it("handles blank rules", [&]() {
AssertThat(extract_choices(blank()), Equals(rule_vector({
blank(),
})));
});
it("does not move choices outside of repeats", [&]() {
// Choices inside a repeat are expanded only within the repeat body —
// hoisting them out would change how alternatives may interleave across
// iterations.
auto rule = seq({
choice({ sym("a"), sym("b") }),
repeat1(seq({
sym("c"),
choice({
sym("d"),
sym("e"),
}),
sym("f"),
})),
sym("g"),
});
AssertThat(extract_choices(rule), Equals(rule_vector({
seq({
sym("a"),
repeat1(choice({
seq({ sym("c"), sym("d"), sym("f") }),
seq({ sym("c"), sym("e"), sym("f") }),
})),
sym("g"),
}),
seq({
sym("b"),
repeat1(choice({
seq({ sym("c"), sym("d"), sym("f") }),
seq({ sym("c"), sym("e"), sym("f") }),
})),
sym("g"),
}),
})));
});
});
END_TEST

View file

@ -0,0 +1,276 @@
#include "test_helper.h"
#include "compiler/lexical_grammar.h"
#include "compiler/prepare_grammar/interned_grammar.h"
#include "compiler/prepare_grammar/initial_syntax_grammar.h"
#include "compiler/prepare_grammar/extract_tokens.h"
#include "helpers/rule_helpers.h"
#include "helpers/equals_pointer.h"
#include "helpers/stream_methods.h"
START_TEST
using namespace rules;
using prepare_grammar::extract_tokens;
using prepare_grammar::InternedGrammar;
using prepare_grammar::InitialSyntaxGrammar;
describe("extract_tokens", []() {
// extract_tokens splits an interned grammar into a syntactic part and a
// lexical part; this example covers the main renumbering behavior when
// strings, patterns, token()-marked rules, and whole token-only rules are
// moved into the lexical grammar.
it("moves strings, patterns, and sub-rules marked as tokens into the lexical grammar", [&]() {
auto result = extract_tokens(InternedGrammar{
{
Variable{"rule_A", VariableTypeNamed, repeat1(seq({
str("ab"),
pattern("cd*"),
choice({
i_sym(1),
i_sym(2),
token(repeat1(choice({ str("ef"), str("gh") }))),
}),
}))},
Variable{"rule_B", VariableTypeNamed, pattern("ij+")},
Variable{"rule_C", VariableTypeNamed, choice({ str("kl"), blank() })},
Variable{"rule_D", VariableTypeNamed, repeat1(i_sym(3))},
},
{},
{},
{}
});
// extract_tokens returns (syntax grammar, lexical grammar, error).
InitialSyntaxGrammar &syntax_grammar = get<0>(result);
LexicalGrammar &lexical_grammar = get<1>(result);
CompileError error = get<2>(result);
AssertThat(error, Equals(CompileError::none()));
AssertThat(syntax_grammar.variables, Equals(vector<Variable>{
Variable{"rule_A", VariableTypeNamed, repeat1(seq({
// This string is now the first token in the lexical grammar.
i_token(0),
// This pattern is now the second rule in the lexical grammar.
i_token(1),
choice({
// Rule 1, which this symbol pointed to, has been moved to the
// lexical grammar.
i_token(3),
// This symbol's index has been decremented, because a previous rule
// was moved to the lexical grammar.
i_sym(1),
// This token rule is now the third rule in the lexical grammar.
i_token(2),
}),
}))},
Variable{"rule_C", VariableTypeNamed, choice({ i_token(4), blank() })},
Variable{"rule_D", VariableTypeNamed, repeat1(i_sym(2))},
}));
AssertThat(lexical_grammar.variables, Equals(vector<LexicalVariable>({
// Strings become anonymous rules.
LexicalVariable{"ab", VariableTypeAnonymous, str("ab"), true},
// Patterns become hidden rules.
LexicalVariable{"/cd*/", VariableTypeAuxiliary, pattern("cd*"), false},
// Rules marked as tokens become hidden rules.
LexicalVariable{"/(ef|gh)*/", VariableTypeAuxiliary, repeat1(choice({
str("ef"),
str("gh")
})), false},
// This named rule was moved wholesale to the lexical grammar.
LexicalVariable{"rule_B", VariableTypeNamed, pattern("ij+"), false},
// Strings become anonymous rules.
LexicalVariable{"kl", VariableTypeAnonymous, str("kl"), true},
})));
});
// Identical string literals occurring multiple times must map to a single
// lexical-grammar entry, referenced by the same token symbol.
it("does not create duplicate tokens in the lexical grammar", [&]() {
auto result = extract_tokens(InternedGrammar{
{
Variable{"rule_A", VariableTypeNamed, seq({
str("ab"),
i_sym(0),
str("ab"),
})},
},
{},
{},
{}
});
InitialSyntaxGrammar &syntax_grammar = get<0>(result);
LexicalGrammar &lexical_grammar = get<1>(result);
// Both occurrences of "ab" are replaced by the same token symbol.
AssertThat(syntax_grammar.variables, Equals(vector<Variable> {
Variable {"rule_A", VariableTypeNamed, seq({ i_token(0), i_sym(0), i_token(0) })},
}));
// BUG FIX: this statement was missing its terminating semicolon, which
// made the file fail to compile.
AssertThat(lexical_grammar.variables, Equals(vector<LexicalVariable> {
LexicalVariable {"ab", VariableTypeAnonymous, str("ab"), true},
}));
});
// A named rule whose entire content is a token is normally moved wholesale,
// but not when the same content ("cd" here) is also used as an inline token
// elsewhere — then the rule stays syntactic and references the shared token.
it("does not move entire rules into the lexical grammar if their content is used elsewhere in the grammar", [&]() {
auto result = extract_tokens(InternedGrammar{{
Variable{"rule_A", VariableTypeNamed, seq({ i_sym(1), str("ab") })},
Variable{"rule_B", VariableTypeNamed, str("cd")},
Variable{"rule_C", VariableTypeNamed, seq({ str("ef"), str("cd") })},
}, {}, {}, {}});
InitialSyntaxGrammar &syntax_grammar = get<0>(result);
LexicalGrammar &lexical_grammar = get<1>(result);
// All three rules remain in the syntax grammar; rule_B just wraps the
// shared token symbol.
AssertThat(syntax_grammar.variables, Equals(vector<Variable>({
Variable{"rule_A", VariableTypeNamed, seq({ i_sym(1), i_token(0) })},
Variable{"rule_B", VariableTypeNamed, i_token(1)},
Variable{"rule_C", VariableTypeNamed, seq({ i_token(2), i_token(1) })},
})));
AssertThat(lexical_grammar.variables, Equals(vector<LexicalVariable> {
LexicalVariable {"ab", VariableTypeAnonymous, str("ab"), true},
LexicalVariable {"cd", VariableTypeAnonymous, str("cd"), true},
LexicalVariable {"ef", VariableTypeAnonymous, str("ef"), true},
}));
});
// When a rule is moved to the lexical grammar, the remaining non-terminals
// are renumbered, so symbol pairs in `expected_conflicts` must be updated
// to match.
it("renumbers the grammar's expected conflict symbols based on any moved rules", [&]() {
auto result = extract_tokens(InternedGrammar{
{
Variable{"rule_A", VariableTypeNamed, str("ok")},
Variable{"rule_B", VariableTypeNamed, repeat(i_sym(0))},
Variable{"rule_C", VariableTypeNamed, repeat(seq({ i_sym(0), i_sym(0) }))},
},
{
str(" ")
},
{
{ Symbol(1, Symbol::NonTerminal), Symbol(2, Symbol::NonTerminal) }
},
{}
});
InitialSyntaxGrammar &syntax_grammar = get<0>(result);
// rule_A was moved out, so rule_B/rule_C shift from symbols 1/2 to 0/1.
AssertThat(syntax_grammar.variables.size(), Equals<size_t>(2));
AssertThat(syntax_grammar.expected_conflicts, Equals(set<set<Symbol>>({
{ Symbol(0, Symbol::NonTerminal), Symbol(1, Symbol::NonTerminal) },
})));
});
// Tests for how extract_tokens treats the grammar's "extra" (skippable)
// tokens: inline rules become lexical separators, rules that match existing
// tokens become extra symbols, and non-token extras are compile errors.
describe("handling extra tokens", [&]() {
  it("adds inline extra tokens to the lexical grammar's separators", [&]() {
    auto result = extract_tokens(InternedGrammar{
      {
        Variable{"rule_A", VariableTypeNamed, str("x")},
      },
      {
        str("y"),
        pattern("\\s+"),
      },
      {},
      {}
    });

    AssertThat(get<2>(result), Equals(CompileError::none()));
    // Both inline extras become separators rather than named extra tokens.
    AssertThat(get<1>(result).separators.size(), Equals<size_t>(2));
    AssertThat(get<1>(result).separators[0], EqualsPointer(str("y")));
    AssertThat(get<1>(result).separators[1], EqualsPointer(pattern("\\s+")));
    AssertThat(get<0>(result).extra_tokens, IsEmpty());
  });

  it("handles inline extra tokens that match tokens in the grammar", [&]() {
    auto result = extract_tokens(InternedGrammar{
      {
        Variable{"rule_A", VariableTypeNamed, str("x")},
        Variable{"rule_B", VariableTypeNamed, str("y")},
      },
      {
        str("y"),
      },
      {},
      {}
    });

    AssertThat(get<2>(result), Equals(CompileError::none()));
    // "y" already exists as a token (from rule_B), so it is referenced as an
    // extra symbol instead of being duplicated as a separator.
    AssertThat(get<1>(result).separators.size(), Equals<size_t>(0));
    AssertThat(get<0>(result).extra_tokens, Equals(set<Symbol>({ Symbol(1, Symbol::Terminal) })));
  });

  it("updates extra symbols according to the new symbol numbers", [&]() {
    auto result = extract_tokens(InternedGrammar{
      {
        Variable{"rule_A", VariableTypeNamed, seq({ str("w"), str("x"), i_sym(1) })},
        Variable{"rule_B", VariableTypeNamed, str("y")},
        Variable{"rule_C", VariableTypeNamed, str("z")},
      },
      {
        // Refers to rule_C, which gets moved into the lexical grammar.
        i_sym(2),
      },
      {},
      {}
    });

    AssertThat(get<2>(result), Equals(CompileError::none()));
    // rule_C ends up as terminal 3 after "w", "x", and "y" are extracted.
    AssertThat(get<0>(result).extra_tokens, Equals(set<Symbol>({
      { Symbol(3, Symbol::Terminal) },
    })));
    AssertThat(get<1>(result).separators, IsEmpty());
  });

  it("returns an error if any extra tokens are non-token symbols", [&]() {
    auto result = extract_tokens(InternedGrammar{{
      Variable{"rule_A", VariableTypeNamed, seq({ str("x"), i_sym(1) })},
      Variable{"rule_B", VariableTypeNamed, seq({ str("y"), str("z") })},
    }, { i_sym(1) }, {}, {}});

    // rule_B is a sequence, not a token, so it cannot be an extra.
    AssertThat(get<2>(result), !Equals(CompileError::none()));
    AssertThat(get<2>(result), Equals(
      CompileError(TSCompileErrorTypeInvalidExtraToken,
        "Not a token: rule_B")));
  });

  it("returns an error if any extra tokens are non-token rules", [&]() {
    auto result = extract_tokens(InternedGrammar{{
      Variable{"rule_A", VariableTypeNamed, str("x")},
      Variable{"rule_B", VariableTypeNamed, str("y")},
    }, { choice({ i_sym(1), blank() }) }, {}, {}});

    // An inline extra containing a non-terminal reference is also rejected.
    AssertThat(get<2>(result), !Equals(CompileError::none()));
    AssertThat(get<2>(result), Equals(CompileError(
      TSCompileErrorTypeInvalidExtraToken,
      "Not a token: (choice (non-terminal 1) (blank))"
    )));
  });
});
// An external token may not share its name with a rule that remains a
// non-terminal after extraction; that ambiguity is reported as a compile
// error.
it("returns an error if an external token has the same name as a non-terminal rule", [&]() {
  auto result = extract_tokens(InternedGrammar{
    {
      Variable{"rule_A", VariableTypeNamed, seq({ str("x"), i_sym(1) })},
      Variable{"rule_B", VariableTypeNamed, seq({ str("y"), str("z") })},
    },
    {},
    {},
    {
      // External token deliberately named after the non-terminal rule_A.
      ExternalToken {"rule_A", VariableTypeNamed, Symbol(0, Symbol::NonTerminal)}
    }
  });

  AssertThat(get<2>(result), Equals(CompileError(
    TSCompileErrorTypeInvalidExternalToken,
    "Name 'rule_A' cannot be used for both an external token and a non-terminal rule"
  )));
});
});
END_TEST

View file

@ -0,0 +1,89 @@
#include "test_helper.h"
#include "compiler/prepare_grammar/flatten_grammar.h"
#include "compiler/prepare_grammar/initial_syntax_grammar.h"
#include "compiler/syntax_grammar.h"
#include "helpers/rule_helpers.h"
#include "helpers/stream_methods.h"
START_TEST
using namespace rules;
using prepare_grammar::flatten_rule;
// Tests for prepare_grammar::flatten_rule, which converts a nested rule tree
// into a flat list of productions. Each production entry records the
// precedence and associativity binding that symbol to its successor.
describe("flatten_grammar", []() {
  it("associates each symbol with the precedence and associativity binding it to its successor", [&]() {
    SyntaxVariable result = flatten_rule(Variable{
      "test",
      VariableTypeNamed,
      seq({
        i_sym(1),
        prec_left(101, seq({
          i_sym(2),
          choice({
            prec_right(102, seq({
              i_sym(3),
              i_sym(4)
            })),
            i_sym(5),
          }),
          i_sym(6),
        })),
        i_sym(7),
      })
    });

    AssertThat(result.name, Equals("test"));
    AssertThat(result.type, Equals(VariableTypeNamed));

    // The embedded choice yields two productions: one through symbols 3 and 4,
    // one through symbol 5.
    // Fix: this assertion (and the two below) was missing its trailing
    // semicolon.
    AssertThat(result.productions, Equals(vector<Production>({
      Production({
        {Symbol(1, Symbol::NonTerminal), 0, AssociativityNone},
        {Symbol(2, Symbol::NonTerminal), 101, AssociativityLeft},
        {Symbol(3, Symbol::NonTerminal), 102, AssociativityRight},
        {Symbol(4, Symbol::NonTerminal), 101, AssociativityLeft},
        {Symbol(6, Symbol::NonTerminal), 0, AssociativityNone},
        {Symbol(7, Symbol::NonTerminal), 0, AssociativityNone},
      }),
      Production({
        {Symbol(1, Symbol::NonTerminal), 0, AssociativityNone},
        {Symbol(2, Symbol::NonTerminal), 101, AssociativityLeft},
        {Symbol(5, Symbol::NonTerminal), 101, AssociativityLeft},
        {Symbol(6, Symbol::NonTerminal), 0, AssociativityNone},
        {Symbol(7, Symbol::NonTerminal), 0, AssociativityNone},
      })
    })));
  });

  it("uses the last assigned precedence", [&]() {
    SyntaxVariable result = flatten_rule(Variable{
      "test1",
      VariableTypeNamed,
      prec_left(101, seq({
        i_sym(1),
        i_sym(2),
      }))
    });

    // A precedence wrapping the whole sequence applies to every entry.
    AssertThat(result.productions, Equals(vector<Production>({
      Production({
        {Symbol(1, Symbol::NonTerminal), 101, AssociativityLeft},
        {Symbol(2, Symbol::NonTerminal), 101, AssociativityLeft},
      })
    })));

    result = flatten_rule(Variable{
      "test2",
      VariableTypeNamed,
      prec_left(101, seq({
        i_sym(1),
      }))
    });

    // Even a single-symbol sequence retains the assigned precedence.
    AssertThat(result.productions, Equals(vector<Production>({
      Production({
        {Symbol(1, Symbol::NonTerminal), 101, AssociativityLeft},
      })
    })));
  });
});
END_TEST

View file

@ -0,0 +1,103 @@
#include "test_helper.h"
#include "compiler/prepare_grammar/intern_symbols.h"
#include "compiler/grammar.h"
#include "compiler/rules/named_symbol.h"
#include "compiler/rules/symbol.h"
#include "compiler/rules/built_in_symbols.h"
#include "helpers/equals_pointer.h"
#include "helpers/rule_helpers.h"
#include "helpers/stream_methods.h"
START_TEST
using namespace rules;
using prepare_grammar::intern_symbols;
// Tests for prepare_grammar::intern_symbols, which replaces name-based symbol
// references in a Grammar with numerically-indexed Symbol values.
describe("intern_symbols", []() {
  it("replaces named symbols with numerically-indexed symbols", [&]() {
    Grammar grammar{
      {
        {"x", choice({ sym("y"), sym("_z") })},
        {"y", sym("_z")},
        {"_z", str("stuff")}
      }, {}, {}, {}
    };

    auto result = intern_symbols(grammar);

    AssertThat(result.second, Equals(CompileError::none()));
    // Rules whose names start with '_' become hidden variables.
    AssertThat(result.first.variables, Equals(vector<Variable>{
      Variable{"x", VariableTypeNamed, choice({ i_sym(1), i_sym(2) })},
      Variable{"y", VariableTypeNamed, i_sym(2)},
      Variable{"_z", VariableTypeHidden, str("stuff")},
    }));
  });

  describe("when there are symbols that reference undefined rules", [&]() {
    it("returns an error", []() {
      Grammar grammar{
        {
          {"x", sym("y")},
        },
        {}, {}, {}
      };

      auto result = intern_symbols(grammar);

      AssertThat(result.second.message, Equals("Undefined rule 'y'"));
    });
  });

  it("translates the grammar's optional 'extra_tokens' to numerical symbols", [&]() {
    Grammar grammar{
      {
        {"x", choice({ sym("y"), sym("z") })},
        {"y", sym("z")},
        {"z", str("stuff")}
      },
      {
        sym("z")
      },
      {}, {}
    };

    auto result = intern_symbols(grammar);

    AssertThat(result.second, Equals(CompileError::none()));
    AssertThat(result.first.extra_tokens.size(), Equals<size_t>(1));
    AssertThat(*result.first.extra_tokens.begin(), EqualsPointer(i_sym(2)));
  });

  it("records any rule names that match external token names", [&]() {
    Grammar grammar{
      {
        {"x", choice({ sym("y"), sym("z") })},
        {"y", sym("z")},
        {"z", str("stuff")},
      },
      {},
      {},
      {
        "w",
        "z"
      }
    };

    auto result = intern_symbols(grammar);

    // "w" matches no rule, so its corresponding symbol is NONE; "z" matches
    // the rule at index 2.
    // Fix: this assertion was missing its trailing semicolon.
    AssertThat(result.first.external_tokens, Equals(vector<ExternalToken>{
      ExternalToken{
        "w",
        VariableTypeNamed,
        rules::NONE()
      },
      ExternalToken{
        "z",
        VariableTypeNamed,
        Symbol(2, Symbol::NonTerminal)
      },
    }));
  });
});
END_TEST

View file

@ -0,0 +1,245 @@
#include "test_helper.h"
#include "compiler/prepare_grammar/parse_regex.h"
#include "helpers/equals_pointer.h"
#include "helpers/rule_helpers.h"
START_TEST
using namespace rules;
using prepare_grammar::parse_regex;
// Table-driven tests for prepare_grammar::parse_regex, which converts a regex
// pattern string into the compiler's internal rule tree. Valid patterns are
// checked against an expected rule; invalid patterns are checked for the
// expected error message.
describe("parse_regex", []() {
  // One row per valid pattern: a human-readable description, the pattern
  // itself, and the rule tree it should parse to.
  struct ValidInputRow {
    string description;
    string pattern;
    rule_ptr rule;
  };

  vector<ValidInputRow> valid_inputs = {
    {
      "character sets",
      "[aAeE]",
      character({ 'a', 'A', 'e', 'E' })
    },
    {
      // '.' matches everything except newline (second arg false = negated set).
      "'.' characters as wildcards",
      ".",
      character({ '\n' }, false)
    },
    {
      // \w, \d, \s and their negations expand to explicit character sets.
      "character classes",
      "\\w-\\d-\\s-\\W-\\D-\\S",
      seq({
        character({
          'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
          'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
          'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
          'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
          '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '_' }),
        character({ '-' }),
        character({ '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' }),
        character({ '-' }),
        character({ ' ', '\t', '\r', '\n' }),
        character({ '-' }),
        character({
          'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
          'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
          'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
          'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
          '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '_' }, false),
        character({ '-' }),
        character({ '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' }, false),
        character({ '-' }),
        character({ ' ', '\t', '\r', '\n' }, false),
      })
    },
    {
      "choices",
      "ab|cd|ef",
      choice({
        seq({
          character({ 'a' }),
          character({ 'b' }) }),
        seq({
          character({ 'c' }),
          character({ 'd' }) }),
        seq({
          character({ 'e' }),
          character({ 'f' }) }) })
    },
    {
      "simple sequences",
      "abc",
      seq({
        character({ 'a' }),
        character({ 'b' }),
        character({ 'c' }) })
    },
    {
      "character ranges",
      "[12a-dA-D3]",
      character({
        '1', '2', '3',
        'a', 'b', 'c', 'd',
        'A', 'B', 'C', 'D' })
    },
    {
      "negated characters",
      "[^a\\d]",
      character({ 'a', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' }, false)
    },
    {
      "backslashes",
      "\\\\",
      character({ '\\' })
    },
    {
      "character groups in sequences",
      "x([^x]|\\\\x)*x",
      seq({
        character({ 'x' }),
        repeat(choice({
          character({ 'x' }, false),
          seq({ character({ '\\' }), character({ 'x' }) }) })),
        character({ 'x' }) })
    },
    {
      "choices in sequences",
      "(a|b)cd",
      seq({
        choice({
          character({ 'a' }),
          character({ 'b' }) }),
        character({ 'c' }),
        character({ 'd' }) })
    },
    {
      "escaped parentheses",
      "a\\(b",
      seq({
        character({ 'a' }),
        character({ '(' }),
        character({ 'b' }) })
    },
    {
      "escaped periods",
      "a\\.",
      seq({
        character({ 'a' }),
        character({ '.' }) })
    },
    {
      "escaped characters",
      "\\t\\n\\r",
      seq({
        character({ '\t' }),
        character({ '\n' }),
        character({ '\r' }) })
    },
    {
      "plus repeats",
      "(ab)+(cd)+",
      seq({
        repeat1(seq({ character({ 'a' }), character({ 'b' }) })),
        repeat1(seq({ character({ 'c' }), character({ 'd' }) })) })
    },
    {
      "asterix repeats",
      "(ab)*(cd)*",
      seq({
        repeat(seq({ character({ 'a' }), character({ 'b' }) })),
        repeat(seq({ character({ 'c' }), character({ 'd' }) })) })
    },
    {
      // '?' becomes a choice between the group and blank().
      "optional rules",
      "a(bc)?",
      seq({
        character({ 'a' }),
        choice({
          seq({ character({ 'b' }), character({ 'c' }) }),
          blank() }) })
    },
    {
      "choices containing negated character classes",
      "/([^/]|(\\\\/))*/",
      seq({
        character({ '/' }),
        repeat(choice({
          character({ '/' }, false),
          seq({ character({ '\\' }), character({ '/' }) }) })),
        character({ '/' }), }),
    },
  };

  // One row per invalid pattern: description, pattern, and a substring that
  // must appear in the resulting error message.
  struct InvalidInputRow {
    string description;
    string pattern;
    const char *message;
  };

  vector<InvalidInputRow> invalid_inputs = {
    {
      "mismatched open parens",
      "(a",
      "unmatched open paren",
    },
    {
      "mismatched nested open parens",
      "((a) (b)",
      "unmatched open paren",
    },
    {
      "mismatched close parens",
      "a)",
      "unmatched close paren",
    },
    {
      "mismatched nested close parens",
      "((a) b))",
      "unmatched close paren",
    },
    {
      "mismatched brackets for character classes",
      "[a",
      "unmatched open square bracket",
    },
    {
      "mismatched brackets for character classes",
      "a]",
      "unmatched close square bracket",
    },
  };

  // Generate one test case per table row.
  for (auto &row : valid_inputs) {
    it(("parses " + row.description).c_str(), [&]() {
      auto result = parse_regex(row.pattern);
      AssertThat(result.first, EqualsPointer(row.rule));
    });
  }

  for (auto &row : invalid_inputs) {
    it(("handles invalid regexes with " + row.description).c_str(), [&]() {
      auto result = parse_regex(row.pattern);
      AssertThat(result.second.type, Equals(TSCompileErrorTypeInvalidRegex));
      AssertThat(result.second.message, Contains(row.message));
    });
  }
});
END_TEST

View file

@ -0,0 +1,337 @@
#include "test_helper.h"
#include "compiler/rules/character_set.h"
using namespace rules;
START_TEST
// Tests for rules::CharacterSet, a set of characters represented either as a
// whitelist (included characters) or, after include_all(), as a blacklist
// (excluded characters). Covers equality, hashing, emptiness, mutation, set
// subtraction, and range consolidation.
describe("CharacterSet", []() {
  describe("equality", [&]() {
    it("returns true for identical character sets", [&]() {
      CharacterSet set1 = CharacterSet()
        .include('a', 'd')
        .include('f', 'm');
      CharacterSet set2 = CharacterSet()
        .include('a', 'd')
        .include('f', 'm');
      AssertThat(set1, Equals(set2));
    });

    it("returns false for character sets that include different ranges", [&]() {
      CharacterSet set1 = CharacterSet()
        .include('a', 'd')
        .include('f', 'm');
      CharacterSet set2 = CharacterSet()
        .include('a', 'c')
        .include('f', 'm');
      // Check both directions to ensure operator== is symmetric.
      AssertThat(set1, !Equals(set2));
      AssertThat(set2, !Equals(set1));
    });

    it("returns false for character sets that exclude different ranges", [&]() {
      CharacterSet set1 = CharacterSet()
        .include_all()
        .exclude('a', 'd')
        .exclude('f', 'm');
      CharacterSet set2 = CharacterSet()
        .include_all()
        .exclude('a', 'c')
        .exclude('f', 'm');
      AssertThat(set1, !Equals(set2));
      AssertThat(set2, !Equals(set1));
    });

    it("returns false for character sets with different sign", [&]() {
      // The full set and the empty set differ only in sign.
      CharacterSet set1 = CharacterSet().include_all();
      CharacterSet set2 = CharacterSet();
      AssertThat(set1, !Equals(set2));
      AssertThat(set2, !Equals(set1));
    });
  });

  describe("hashing", [&]() {
    it("returns the same number for identical character sets", [&]() {
      CharacterSet set1 = CharacterSet()
        .include('a', 'd')
        .include('f', 'm');
      CharacterSet set2 = CharacterSet()
        .include('a', 'd')
        .include('f', 'm');
      AssertThat(set1.hash_code(), Equals(set2.hash_code()));
    });

    it("returns different numbers for character sets that include different ranges", [&]() {
      CharacterSet set1 = CharacterSet()
        .include('a', 'd')
        .include('f', 'm');
      CharacterSet set2 = CharacterSet()
        .include('a', 'c')
        .include('f', 'm');
      AssertThat(set1.hash_code(), !Equals(set2.hash_code()));
      AssertThat(set2.hash_code(), !Equals(set1.hash_code()));
    });

    it("returns different numbers for character sets that exclude different ranges", [&]() {
      CharacterSet set1 = CharacterSet()
        .include_all()
        .exclude('a', 'd')
        .exclude('f', 'm');
      CharacterSet set2 = CharacterSet()
        .include_all()
        .exclude('a', 'c')
        .exclude('f', 'm');
      AssertThat(set1.hash_code(), !Equals(set2.hash_code()));
      AssertThat(set2.hash_code(), !Equals(set1.hash_code()));
    });

    it("returns different numbers for character sets with different sign", [&]() {
      CharacterSet set1 = CharacterSet().include_all();
      CharacterSet set2 = CharacterSet();
      AssertThat(set1.hash_code(), !Equals(set2.hash_code()));
      AssertThat(set2.hash_code(), !Equals(set1.hash_code()));
    });
  });

  describe("::is_empty", [&]() {
    it("returns true for empty character sets", [&]() {
      AssertThat(CharacterSet().is_empty(), Equals(true));
    });

    it("returns false for full character sets", [&]() {
      AssertThat(CharacterSet().include_all().is_empty(), Equals(false));
    });

    it("returns false for character sets that include some characters", [&]() {
      AssertThat(CharacterSet().include('x').is_empty(), Equals(false));
    });
  });

  describe("::include", [&]() {
    describe("when the set has a whitelist of characters", [&]() {
      it("adds included characters", [&]() {
        // include(first, last) is equivalent to including each character in
        // the range individually.
        CharacterSet set1 = CharacterSet().include('a', 'd');
        AssertThat(set1, Equals(CharacterSet()
          .include('a')
          .include('b')
          .include('c')
          .include('d')));
      });
    });

    describe("when the set has a blacklist of characters", [&]() {
      it("removes excluded characters", [&]() {
        CharacterSet set1 = CharacterSet()
          .include_all()
          .exclude('a', 'g')
          .include('c', 'e');
        AssertThat(set1, Equals(CharacterSet()
          .include_all()
          .exclude('a')
          .exclude('b')
          .exclude('f')
          .exclude('g')));
      });

      // Fix: test name typo — "the character are" -> "the characters are".
      it("does nothing if the characters are already not excluded", [&]() {
        CharacterSet set1 = CharacterSet()
          .include_all()
          .include('a', 'c');
        AssertThat(set1, Equals(CharacterSet().include_all()));
      });
    });
  });

  describe("::exclude", [&]() {
    describe("when the set has a whitelist of characters", [&]() {
      it("removes included characters", [&]() {
        CharacterSet set1 = CharacterSet()
          .include('a', 'g')
          .exclude('c', 'e');
        AssertThat(set1, Equals(CharacterSet()
          .include('a')
          .include('b')
          .include('f')
          .include('g')));
      });

      // Fix: test name typo — "the character's are" -> "the characters are".
      it("does nothing if the characters are already not included", [&]() {
        CharacterSet set1 = CharacterSet().exclude('a', 'c');
        AssertThat(set1, Equals(CharacterSet()));
      });
    });

    describe("when the set has a blacklist of characters", [&]() {
      it("removes excluded characters", [&]() {
        CharacterSet set1 = CharacterSet()
          .include_all()
          .exclude('a', 'd');
        AssertThat(set1, Equals(CharacterSet()
          .include_all()
          .exclude('a')
          .exclude('b')
          .exclude('c')
          .exclude('d')));
      });
    });
  });

  describe("::remove_set", []() {
    // remove_set subtracts its argument in place and returns the
    // intersection (the characters that were actually removed).
    CharacterSet intersection;

    describe("for a set with whitelisted characters", [&]() {
      describe("when the subtractend has whitelisted characters", [&]() {
        it("removes the included characters that the other set also includes", [&]() {
          CharacterSet set1 = CharacterSet().include('a', 'z');
          set1.remove_set(CharacterSet().include('d', 's'));
          AssertThat(set1, Equals(CharacterSet()
            .include('a', 'c')
            .include('t', 'z')));
        });

        it("returns the characters that were removed", [&]() {
          CharacterSet set1 = CharacterSet().include('a', 'z');
          intersection = set1.remove_set(CharacterSet().include('d', 's'));
          AssertThat(intersection, Equals(CharacterSet()
            .include('d', 's')));
        });

        it("returns the empty set when the sets are disjoint", [&]() {
          CharacterSet set1 = CharacterSet().include('a', 'z');
          intersection = set1.remove_set(CharacterSet().include('A', 'Z'));
          AssertThat(set1, Equals(CharacterSet().include('a', 'z')));
          AssertThat(intersection, Equals(CharacterSet()));
        });
      });

      describe("when the subtractend has blacklisted characters", [&]() {
        it("removes the included characters that are not excluded by the other set", [&]() {
          CharacterSet set1 = CharacterSet().include('a', 'f');
          intersection = set1.remove_set(CharacterSet()
            .include_all()
            .exclude('d', 'z'));
          AssertThat(set1, Equals(CharacterSet()
            .include('d', 'f')));
          AssertThat(intersection, Equals(CharacterSet()
            .include('a', 'c')));
        });
      });
    });

    describe("for a set with blacklisted characters", [&]() {
      describe("when the subtractend has whitelisted characters", [&]() {
        it("adds the subtractend's inclusions to the receiver's exclusions", [&]() {
          CharacterSet set1 = CharacterSet()
            .include_all()
            .exclude('a', 'f');
          intersection = set1.remove_set(CharacterSet()
            .include('x', 'z'));
          AssertThat(set1, Equals(CharacterSet()
            .include_all()
            .exclude('a', 'f')
            .exclude('x', 'z')));
          AssertThat(intersection, Equals(CharacterSet().include('x', 'z')));
        });
      });

      describe("when the subtractend has blacklisted characters", [&]() {
        it("includes only the characters excluded by the subtractend but not by the receiver", [&]() {
          CharacterSet set1 = CharacterSet()
            .include_all()
            .exclude('a', 'm');
          set1.remove_set(CharacterSet()
            .include_all()
            .exclude('d', 'z'));
          AssertThat(set1, Equals(CharacterSet()
            .include('n', 'z')));
        });

        it("returns the characters excluded by neither set", [&]() {
          CharacterSet set1 = CharacterSet()
            .include_all()
            .exclude('a', 'm');
          intersection = set1.remove_set(CharacterSet()
            .include_all()
            .exclude('d', 'z'));
          AssertThat(intersection, Equals(CharacterSet()
            .include_all()
            .exclude('a', 'z')));
        });

        // NOTE(review): this case uses the same overlapping ranges as the two
        // tests above, so the "disjoint" description looks inaccurate —
        // confirm the intended fixture.
        it("works when the sets are disjoint", [&]() {
          CharacterSet set1 = CharacterSet()
            .include_all()
            .exclude('a', 'm');
          intersection = set1.remove_set(CharacterSet()
            .include_all()
            .exclude('d', 'z'));
          AssertThat(set1, Equals(CharacterSet()
            .include('n', 'z')));
          AssertThat(intersection, Equals(CharacterSet()
            .include_all()
            .exclude('a', 'z')));
        });
      });
    });
  });

  describe("::included_ranges", [&]() {
    it("consolidates sequences of 3 or more consecutive characters into ranges", [&]() {
      CharacterSet set1 = CharacterSet()
        .include('a', 'c')
        .include('g')
        .include('z');
      AssertThat(set1.included_ranges(), Equals(vector<CharacterRange>({
        CharacterRange('a', 'c'),
        CharacterRange('g'),
        CharacterRange('z'),
      })));
    });

    it("doesn't consolidate sequences of 2 consecutive characters", [&]() {
      CharacterSet set1 = CharacterSet()
        .include('a', 'b')
        .include('g')
        .include('z');
      AssertThat(set1.included_ranges(), Equals(vector<CharacterRange>({
        CharacterRange('a'),
        CharacterRange('b'),
        CharacterRange('g'),
        CharacterRange('z'),
      })));
    });
  });
});
END_TEST

View file

@ -0,0 +1,53 @@
#include "test_helper.h"
#include "compiler/rules/choice.h"
#include "helpers/rule_helpers.h"
#include "helpers/equals_pointer.h"
using namespace rules;
START_TEST
// Tests for Choice::build, the factory that constructs choice rules while
// normalizing them: deduplicating members, flattening nested choices, and
// collapsing single-member choices.
describe("Choice", []() {
  describe("constructing choices", [&]() {
    it("eliminates duplicate members", [&]() {
      auto rule = Choice::build({
        seq({ sym("one"), sym("two") }),
        sym("three"),
        seq({ sym("one"), sym("two") })
      });

      // The repeated seq appears only once; member order is preserved.
      AssertThat(rule, EqualsPointer(choice({
        seq({ sym("one"), sym("two") }),
        sym("three"),
      })));
    });

    it("eliminates duplicates within nested choices", [&]() {
      auto rule = Choice::build({
        seq({ sym("one"), sym("two") }),
        Choice::build({
          sym("three"),
          seq({ sym("one"), sym("two") })
        })
      });

      // The inner choice is flattened into the outer one before dedup.
      AssertThat(rule, EqualsPointer(choice({
        seq({ sym("one"), sym("two") }),
        sym("three"),
      })));
    });

    it("doesn't construct a choice if there's only one unique member", [&]() {
      auto rule = Choice::build({
        sym("one"),
        Choice::build({
          sym("one"),
        })
      });

      // A one-member choice collapses to the member itself.
      AssertThat(rule, EqualsPointer(sym("one")));
    });
  });
});
END_TEST

View file

@ -0,0 +1,22 @@
#include "test_helper.h"
#include "compiler/rules/repeat.h"
#include "compiler/rules/symbol.h"
using namespace rules;
START_TEST
// Tests for Repeat::build, which wraps a rule in a repetition but avoids
// nesting a repeat directly inside another repeat.
describe("Repeat", []() {
  describe("constructing repeats", [&]() {
    it("doesn't create redundant repeats", [&]() {
      auto sym = make_shared<Symbol>(1, Symbol::NonTerminal);
      auto repeat = Repeat::build(sym);
      // Building a repeat of a repeat returns the inner repeat unchanged.
      auto outer_repeat = Repeat::build(repeat);
      AssertThat(repeat, !Equals(sym));
      AssertThat(outer_repeat, Equals(repeat));
    });
  });
});
END_TEST

View file

@ -0,0 +1,26 @@
#include "test_helper.h"
#include "compiler/util/string_helpers.h"
using util::escape_char;
START_TEST
// Tests for util::escape_char, which renders a character code as a
// single-quoted C-style literal (escaping where needed), or as a plain
// number for values outside the ASCII range.
describe("escape_char", []() {
  it("returns ascii characters as strings", [&]() {
    AssertThat(escape_char('x'), Equals("'x'"));
  });

  it("escapes special characters with backslashes", [&]() {
    AssertThat(escape_char('\\'), Equals("'\\\\'"));
    AssertThat(escape_char('\n'), Equals("'\\n'"));
    AssertThat(escape_char('\t'), Equals("'\\t'"));
    AssertThat(escape_char('\r'), Equals("'\\r'"));
    AssertThat(escape_char('\''), Equals("'\\''"));
  });

  it("prints non-ascii characters as numbers", [&]() {
    AssertThat(escape_char(256), Equals("256"));
  });
});
END_TEST