Merge pull request #54 from tree-sitter/external-scanners

External scanners
Max Brunsfeld 2017-01-31 11:46:49 -08:00 committed by GitHub
commit 3edb5dbdd9
75 changed files with 2162 additions and 1123 deletions
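For orientation before the diff: this PR lets a grammar delegate certain tokens to a hand-written "external scanner". A grammar lists the token names under a new "externals" key, and supplies C functions that the generated parser calls through the new TSLanguage.external_scanner struct. A minimal sketch of those entry points, for a hypothetical language named "mylang" (the shape follows the fixture scanners added later in this diff):

#include <stddef.h>
#include <tree_sitter/parser.h>

// Token ids follow the order of the grammar's "externals" array.
enum { MY_TOKEN };

void *tree_sitter_mylang_external_scanner_create() {
  return NULL;  // no per-scanner state needed
}

void tree_sitter_mylang_external_scanner_destroy(void *payload) {}

void tree_sitter_mylang_external_scanner_reset(void *payload) {}

bool tree_sitter_mylang_external_scanner_serialize(void *payload, TSExternalTokenState state) {
  return true;  // nothing to persist across re-parses
}

void tree_sitter_mylang_external_scanner_deserialize(void *payload, TSExternalTokenState state) {}

// `whitelist` is indexed by external token id; an entry is true when that
// token is currently valid. Return true after setting lexer->result_symbol.
bool tree_sitter_mylang_external_scanner_scan(void *payload, TSLexer *lexer, const bool *whitelist) {
  return false;  // recognize nothing
}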


@ -176,11 +176,11 @@ tokens, like `(` and `+`. This is useful when analyzing the meaning of a document
#include "tree_sitter/runtime.h"
// Declare the language function that was generated from your grammar.
TSLanguage *ts_language_arithmetic();
TSLanguage *tree_sitter_arithmetic();
int main() {
TSDocument *document = ts_document_new();
ts_document_set_language(document, ts_language_arithmetic());
ts_document_set_language(document, tree_sitter_arithmetic());
ts_document_set_input_string(document, "a + b * 5");
ts_document_parse(document);
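The hunk above shows only the renamed declaration and call (language functions are now named tree_sitter_X rather than ts_language_X). For reference, the complete updated example would read roughly as follows, a sketch assembled from the API used elsewhere in this diff, with the tree-printing tail assumed:

#include <stdio.h>
#include "tree_sitter/runtime.h"

// Declare the language function that was generated from your grammar.
TSLanguage *tree_sitter_arithmetic();

int main() {
  TSDocument *document = ts_document_new();
  ts_document_set_language(document, tree_sitter_arithmetic());
  ts_document_set_input_string(document, "a + b * 5");
  ts_document_parse(document);

  TSNode root_node = ts_document_root_node(document);
  char *node_string = ts_node_string(root_node, document);
  printf("Syntax tree: %s\n", node_string);
  ts_free(node_string);
  ts_document_free(document);
  return 0;
}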


@ -40,6 +40,14 @@
"pattern": "^[a-zA-Z_]\\w*$"
}
}
},
"externals": {
"type": "array",
"items": {
"type": "string",
"pattern": "^[a-zA-Z_]\\w*$"
}
}
},
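This schema change adds an "externals" property alongside the existing token-name patterns: an array of names, each matching the same identifier pattern, which the external scanner is responsible for recognizing. By convention (see the fixture scanners later in this diff) the scanner mirrors that array with an enum, since both the whitelist passed to scan() and lexer->result_symbol follow the array's ordering:

// For a grammar containing: "externals": ["string", "line_break"]
enum {
  STRING,      // whitelist[0], result_symbol for external token 0
  LINE_BREAK   // whitelist[1], result_symbol for external token 1
};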


@ -10,7 +10,8 @@ typedef enum {
TSCompileErrorTypeInvalidGrammar,
TSCompileErrorTypeInvalidRegex,
TSCompileErrorTypeUndefinedSymbol,
TSCompileErrorTypeInvalidUbiquitousToken,
TSCompileErrorTypeInvalidExtraToken,
TSCompileErrorTypeInvalidExternalToken,
TSCompileErrorTypeLexConflict,
TSCompileErrorTypeParseConflict,
TSCompileErrorTypeEpsilonRule,


@ -12,6 +12,8 @@ extern "C" {
typedef unsigned short TSSymbol;
typedef unsigned short TSStateId;
typedef uint8_t TSExternalTokenState[16];
#define ts_builtin_sym_error ((TSSymbol)-1)
#define ts_builtin_sym_end 0
@ -23,7 +25,7 @@ typedef struct {
} TSSymbolMetadata;
typedef struct {
void (*advance)(void *, TSStateId, bool);
void (*advance)(void *, bool);
int32_t lookahead;
TSSymbol result_symbol;
} TSLexer;
@ -48,6 +50,11 @@ typedef struct {
bool fragile : 1;
} TSParseAction;
typedef struct {
uint16_t lex_state;
uint16_t external_lex_state;
} TSLexMode;
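/* A note on this new table type: each parse state now selects both an
   internal lex state and an external lex state; the latter appears to index
   a row of the external_scanner.states whitelist declared below (an
   inference from this header). A hypothetical generated entry, names and
   values assumed: */
static const TSLexMode ts_lex_modes_example[] = {
  [5] = {.lex_state = 12, .external_lex_state = 3},  // parse state 5
};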
typedef union {
TSParseAction action;
struct {
@ -58,14 +65,26 @@ typedef union {
} TSParseActionEntry;
typedef struct TSLanguage {
uint32_t version;
uint32_t symbol_count;
uint32_t token_count;
uint32_t external_token_count;
const char **symbol_names;
const TSSymbolMetadata *symbol_metadata;
const unsigned short *parse_table;
const TSParseActionEntry *parse_actions;
const TSStateId *lex_states;
const TSLexMode *lex_modes;
bool (*lex_fn)(TSLexer *, TSStateId);
struct {
const bool *states;
const TSSymbol *symbol_map;
void *(*create)();
void (*destroy)(void *);
void (*reset)(void *);
bool (*scan)(void *, TSLexer *, const bool *symbol_whitelist);
bool (*serialize)(void *, TSExternalTokenState);
void (*deserialize)(void *, const TSExternalTokenState);
} external_scanner;
} TSLanguage;
/*
@ -79,14 +98,14 @@ typedef struct TSLanguage {
#define ADVANCE(state_value) \
{ \
lexer->advance(lexer, state_value, false); \
lexer->advance(lexer, false); \
state = state_value; \
goto next_state; \
}
#define SKIP(state_value) \
{ \
lexer->advance(lexer, state_value, true); \
lexer->advance(lexer, true); \
state = state_value; \
goto next_state; \
}
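Correspondingly, TSLexer::advance loses its TSStateId parameter: the generated lex function now tracks its own state transitions through these macros, and the remaining boolean distinguishes skipping a character (separator text excluded from the token) from consuming it. External scanners drive the same callback directly, as in this fragment modeled on the fixture scanners below:

while (lexer->lookahead == ' ') {
  lexer->advance(lexer, true);   // skip: not part of the token
}
lexer->advance(lexer, false);    // consume: included in the token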
@ -146,21 +165,21 @@ typedef struct TSLanguage {
{ .type = TSParseActionTypeAccept } \
}
#define EXPORT_LANGUAGE(language_name) \
static TSLanguage language = { \
.symbol_count = SYMBOL_COUNT, \
.token_count = TOKEN_COUNT, \
.symbol_metadata = ts_symbol_metadata, \
.parse_table = (const unsigned short *)ts_parse_table, \
.parse_actions = ts_parse_actions, \
.lex_states = ts_lex_states, \
.symbol_names = ts_symbol_names, \
.lex_fn = ts_lex, \
}; \
\
const TSLanguage *language_name() { \
return &language; \
}
#define GET_LANGUAGE(...) \
static TSLanguage language = { \
.version = LANGUAGE_VERSION, \
.symbol_count = SYMBOL_COUNT, \
.token_count = TOKEN_COUNT, \
.symbol_metadata = ts_symbol_metadata, \
.parse_table = (const unsigned short *)ts_parse_table, \
.parse_actions = ts_parse_actions, \
.lex_modes = ts_lex_modes, \
.symbol_names = ts_symbol_names, \
.lex_fn = ts_lex, \
.external_token_count = EXTERNAL_TOKEN_COUNT, \
.external_scanner = {__VA_ARGS__} \
}; \
return &language
#ifdef __cplusplus
}
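EXPORT_LANGUAGE becomes GET_LANGUAGE, which is invoked inside the language function body and forwards __VA_ARGS__ into the new external_scanner struct in declaration order (states, symbol_map, create, destroy, reset, scan, serialize, deserialize). A hedged sketch of the call a generated parser might emit, assuming the generated tables and #defines (SYMBOL_COUNT, EXTERNAL_TOKEN_COUNT, ts_parse_table, ...) are in scope; the two table names here are hypothetical, and a grammar with no external tokens would simply call GET_LANGUAGE():

const TSLanguage *tree_sitter_mylang() {
  GET_LANGUAGE(
    ts_external_scanner_states,      // const bool *: valid-token rows
    ts_external_scanner_symbol_map,  // const TSSymbol *: external -> internal ids
    tree_sitter_mylang_external_scanner_create,
    tree_sitter_mylang_external_scanner_destroy,
    tree_sitter_mylang_external_scanner_reset,
    tree_sitter_mylang_external_scanner_scan,
    tree_sitter_mylang_external_scanner_serialize,
    tree_sitter_mylang_external_scanner_deserialize
  );
}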


@ -9,6 +9,8 @@ extern "C" {
#include <stdint.h>
#include <stdbool.h>
#define TREE_SITTER_LANGUAGE_VERSION 1
typedef unsigned short TSSymbol;
typedef struct TSLanguage TSLanguage;
typedef struct TSDocument TSDocument;
@ -114,6 +116,7 @@ uint32_t ts_document_parse_count(const TSDocument *);
uint32_t ts_language_symbol_count(const TSLanguage *);
const char *ts_language_symbol_name(const TSLanguage *, TSSymbol);
uint32_t ts_language_version(const TSLanguage *);
#ifdef __cplusplus
}


@ -7,6 +7,7 @@ GRAMMARS=(
json
c
cpp
python
)
for grammar in ${GRAMMARS[@]}; do
@ -21,7 +22,7 @@ for grammar in ${GRAMMARS[@]}; do
(
cd $grammar_dir;
git reset --hard;
git pull origin master;
git fetch origin
git reset --hard origin/master;
)
done


@ -27,7 +27,7 @@ describe("recovery_tokens(rule)", []() {
})),
};
AssertThat(recovery_tokens(grammar), Equals<set<Symbol::Index>>({ 1 }));
AssertThat(recovery_tokens(grammar), Equals<set<Symbol>>({ Symbol(1, Symbol::Terminal) }));
});
});


@ -14,10 +14,10 @@ START_TEST
describe("LexConflictManager::resolve(new_action, old_action)", []() {
LexConflictManager conflict_manager;
bool update;
Symbol sym1(0, true);
Symbol sym2(1, true);
Symbol sym3(2, true);
Symbol sym4(3, true);
Symbol sym1(0, Symbol::Terminal);
Symbol sym2(1, Symbol::Terminal);
Symbol sym3(2, Symbol::Terminal);
Symbol sym4(3, Symbol::Terminal);
LexItemSet item_set({ LexItem(sym4, blank() )});
it("favors advance actions over empty accept token actions", [&]() {


@ -14,7 +14,7 @@ START_TEST
describe("LexItem", []() {
describe("completion_status()", [&]() {
it("indicates whether the item is done, its precedence, and whether it is a string", [&]() {
LexItem item1(Symbol(0, true), character({ 'a', 'b', 'c' }));
LexItem item1(Symbol(0, Symbol::Terminal), character({ 'a', 'b', 'c' }));
AssertThat(item1.completion_status().is_done, IsFalse());
AssertThat(item1.completion_status().precedence, Equals(PrecedenceRange()));
AssertThat(item1.completion_status().is_string, IsFalse());
@ -23,7 +23,7 @@ describe("LexItem", []() {
params.precedence = 3;
params.has_precedence = true;
params.is_string = 1;
LexItem item2(Symbol(0, true), choice({
LexItem item2(Symbol(0, Symbol::Terminal), choice({
metadata(blank(), params),
character({ 'a', 'b', 'c' })
}));
@ -32,7 +32,7 @@ describe("LexItem", []() {
AssertThat(item2.completion_status().precedence, Equals(PrecedenceRange(3)));
AssertThat(item2.completion_status().is_string, IsTrue());
LexItem item3(Symbol(0, true), repeat(character({ ' ', '\t' })));
LexItem item3(Symbol(0, Symbol::Terminal), repeat(character({ ' ', '\t' })));
AssertThat(item3.completion_status().is_done, IsTrue());
AssertThat(item3.completion_status().precedence, Equals(PrecedenceRange()));
AssertThat(item3.completion_status().is_string, IsFalse());
@ -43,7 +43,7 @@ describe("LexItem", []() {
describe("LexItemSet::transitions()", [&]() {
it("handles single characters", [&]() {
LexItemSet item_set({
LexItem(Symbol(1), character({ 'x' })),
LexItem(Symbol(1, Symbol::NonTerminal), character({ 'x' })),
});
AssertThat(
@ -53,7 +53,7 @@ describe("LexItemSet::transitions()", [&]() {
CharacterSet().include('x'),
Transition{
LexItemSet({
LexItem(Symbol(1), blank()),
LexItem(Symbol(1, Symbol::NonTerminal), blank()),
}),
PrecedenceRange(),
false
@ -67,7 +67,7 @@ describe("LexItemSet::transitions()", [&]() {
params.is_main_token = true;
LexItemSet item_set({
LexItem(Symbol(1), metadata(character({ 'x' }), params)),
LexItem(Symbol(1, Symbol::NonTerminal), metadata(character({ 'x' }), params)),
});
AssertThat(
@ -77,7 +77,7 @@ describe("LexItemSet::transitions()", [&]() {
CharacterSet().include('x'),
Transition{
LexItemSet({
LexItem(Symbol(1), metadata(blank(), params)),
LexItem(Symbol(1, Symbol::NonTerminal), metadata(blank(), params)),
}),
PrecedenceRange(),
true
@ -88,7 +88,7 @@ describe("LexItemSet::transitions()", [&]() {
it("handles sequences", [&]() {
LexItemSet item_set({
LexItem(Symbol(1), seq({
LexItem(Symbol(1, Symbol::NonTerminal), seq({
character({ 'w' }),
character({ 'x' }),
character({ 'y' }),
@ -103,7 +103,7 @@ describe("LexItemSet::transitions()", [&]() {
CharacterSet().include('w'),
Transition{
LexItemSet({
LexItem(Symbol(1), seq({
LexItem(Symbol(1, Symbol::NonTerminal), seq({
character({ 'x' }),
character({ 'y' }),
character({ 'z' }),
@ -118,7 +118,7 @@ describe("LexItemSet::transitions()", [&]() {
it("handles sequences with nested precedence", [&]() {
LexItemSet item_set({
LexItem(Symbol(1), seq({
LexItem(Symbol(1, Symbol::NonTerminal), seq({
prec(3, seq({
character({ 'v' }),
prec(4, seq({
@ -140,7 +140,7 @@ describe("LexItemSet::transitions()", [&]() {
// The outer precedence is now 'active', because we are within its
// contained rule.
LexItemSet({
LexItem(Symbol(1), seq({
LexItem(Symbol(1, Symbol::NonTerminal), seq({
active_prec(3, seq({
prec(4, seq({
character({ 'w' }),
@ -168,7 +168,7 @@ describe("LexItemSet::transitions()", [&]() {
Transition{
// The inner precedence is now 'active'
LexItemSet({
LexItem(Symbol(1), seq({
LexItem(Symbol(1, Symbol::NonTerminal), seq({
active_prec(3, seq({
active_prec(4, character({ 'x' })),
character({ 'y' }) })),
@ -193,7 +193,7 @@ describe("LexItemSet::transitions()", [&]() {
CharacterSet().include('x'),
Transition{
LexItemSet({
LexItem(Symbol(1), seq({
LexItem(Symbol(1, Symbol::NonTerminal), seq({
active_prec(3, character({ 'y' })),
character({ 'z' }),
})),
@ -216,7 +216,7 @@ describe("LexItemSet::transitions()", [&]() {
CharacterSet().include('y'),
Transition{
LexItemSet({
LexItem(Symbol(1), character({ 'z' })),
LexItem(Symbol(1, Symbol::NonTerminal), character({ 'z' })),
}),
PrecedenceRange(3),
false
@ -227,7 +227,7 @@ describe("LexItemSet::transitions()", [&]() {
it("handles sequences where the left hand side can be blank", [&]() {
LexItemSet item_set({
LexItem(Symbol(1), seq({
LexItem(Symbol(1, Symbol::NonTerminal), seq({
choice({
character({ 'x' }),
blank(),
@ -244,7 +244,7 @@ describe("LexItemSet::transitions()", [&]() {
CharacterSet().include('x'),
Transition{
LexItemSet({
LexItem(Symbol(1), seq({
LexItem(Symbol(1, Symbol::NonTerminal), seq({
character({ 'y' }),
character({ 'z' }),
})),
@ -257,7 +257,7 @@ describe("LexItemSet::transitions()", [&]() {
CharacterSet().include('y'),
Transition{
LexItemSet({
LexItem(Symbol(1), character({ 'z' })),
LexItem(Symbol(1, Symbol::NonTerminal), character({ 'z' })),
}),
PrecedenceRange(),
false
@ -268,7 +268,7 @@ describe("LexItemSet::transitions()", [&]() {
it("handles blanks", [&]() {
LexItemSet item_set({
LexItem(Symbol(1), blank()),
LexItem(Symbol(1, Symbol::NonTerminal), blank()),
});
AssertThat(item_set.transitions(), IsEmpty());
@ -276,11 +276,11 @@ describe("LexItemSet::transitions()", [&]() {
it("handles repeats", [&]() {
LexItemSet item_set({
LexItem(Symbol(1), repeat1(seq({
LexItem(Symbol(1, Symbol::NonTerminal), repeat1(seq({
character({ 'a' }),
character({ 'b' }),
}))),
LexItem(Symbol(2), repeat1(character({ 'c' }))),
LexItem(Symbol(2, Symbol::NonTerminal), repeat1(character({ 'c' }))),
});
AssertThat(
@ -290,14 +290,14 @@ describe("LexItemSet::transitions()", [&]() {
CharacterSet().include('a'),
Transition{
LexItemSet({
LexItem(Symbol(1), seq({
LexItem(Symbol(1, Symbol::NonTerminal), seq({
character({ 'b' }),
repeat1(seq({
character({ 'a' }),
character({ 'b' }),
}))
})),
LexItem(Symbol(1), character({ 'b' })),
LexItem(Symbol(1, Symbol::NonTerminal), character({ 'b' })),
}),
PrecedenceRange(),
false
@ -307,8 +307,8 @@ describe("LexItemSet::transitions()", [&]() {
CharacterSet().include('c'),
Transition{
LexItemSet({
LexItem(Symbol(2), repeat1(character({ 'c' }))),
LexItem(Symbol(2), blank()),
LexItem(Symbol(2, Symbol::NonTerminal), repeat1(character({ 'c' }))),
LexItem(Symbol(2, Symbol::NonTerminal), blank()),
}),
PrecedenceRange(),
false
@ -319,7 +319,7 @@ describe("LexItemSet::transitions()", [&]() {
it("handles repeats with precedence", [&]() {
LexItemSet item_set({
LexItem(Symbol(1), active_prec(-1, repeat1(character({ 'a' }))))
LexItem(Symbol(1, Symbol::NonTerminal), active_prec(-1, repeat1(character({ 'a' }))))
});
AssertThat(
@ -329,8 +329,8 @@ describe("LexItemSet::transitions()", [&]() {
CharacterSet().include('a'),
Transition{
LexItemSet({
LexItem(Symbol(1), active_prec(-1, repeat1(character({ 'a' })))),
LexItem(Symbol(1), active_prec(-1, blank())),
LexItem(Symbol(1, Symbol::NonTerminal), active_prec(-1, repeat1(character({ 'a' })))),
LexItem(Symbol(1, Symbol::NonTerminal), active_prec(-1, blank())),
}),
PrecedenceRange(-1),
false
@ -341,7 +341,7 @@ describe("LexItemSet::transitions()", [&]() {
it("handles choices between overlapping character sets", [&]() {
LexItemSet item_set({
LexItem(Symbol(1), choice({
LexItem(Symbol(1, Symbol::NonTerminal), choice({
active_prec(2, seq({
character({ 'a', 'b', 'c', 'd' }),
character({ 'x' }),
@ -360,7 +360,7 @@ describe("LexItemSet::transitions()", [&]() {
CharacterSet().include('a', 'b'),
Transition{
LexItemSet({
LexItem(Symbol(1), active_prec(2, character({ 'x' }))),
LexItem(Symbol(1, Symbol::NonTerminal), active_prec(2, character({ 'x' }))),
}),
PrecedenceRange(2),
false
@ -370,8 +370,8 @@ describe("LexItemSet::transitions()", [&]() {
CharacterSet().include('c', 'd'),
Transition{
LexItemSet({
LexItem(Symbol(1), active_prec(2, character({ 'x' }))),
LexItem(Symbol(1), active_prec(3, character({ 'y' }))),
LexItem(Symbol(1, Symbol::NonTerminal), active_prec(2, character({ 'x' }))),
LexItem(Symbol(1, Symbol::NonTerminal), active_prec(3, character({ 'y' }))),
}),
PrecedenceRange(2, 3),
false
@ -381,7 +381,7 @@ describe("LexItemSet::transitions()", [&]() {
CharacterSet().include('e', 'f'),
Transition{
LexItemSet({
LexItem(Symbol(1), active_prec(3, character({ 'y' }))),
LexItem(Symbol(1, Symbol::NonTerminal), active_prec(3, character({ 'y' }))),
}),
PrecedenceRange(3),
false
@ -392,7 +392,7 @@ describe("LexItemSet::transitions()", [&]() {
it("handles choices between a subset and a superset of characters", [&]() {
LexItemSet item_set({
LexItem(Symbol(1), choice({
LexItem(Symbol(1, Symbol::NonTerminal), choice({
seq({
character({ 'b', 'c', 'd' }),
character({ 'x' }),
@ -411,7 +411,7 @@ describe("LexItemSet::transitions()", [&]() {
CharacterSet().include('a').include('e', 'f'),
Transition{
LexItemSet({
LexItem(Symbol(1), character({ 'y' })),
LexItem(Symbol(1, Symbol::NonTerminal), character({ 'y' })),
}),
PrecedenceRange(),
false
@ -421,8 +421,8 @@ describe("LexItemSet::transitions()", [&]() {
CharacterSet().include('b', 'd'),
Transition{
LexItemSet({
LexItem(Symbol(1), character({ 'x' })),
LexItem(Symbol(1), character({ 'y' })),
LexItem(Symbol(1, Symbol::NonTerminal), character({ 'x' })),
LexItem(Symbol(1, Symbol::NonTerminal), character({ 'y' })),
}),
PrecedenceRange(),
false
@ -433,7 +433,7 @@ describe("LexItemSet::transitions()", [&]() {
it("handles choices between whitelisted and blacklisted character sets", [&]() {
LexItemSet item_set({
LexItem(Symbol(1), seq({
LexItem(Symbol(1, Symbol::NonTerminal), seq({
choice({
character({ '/' }, false),
seq({
@ -452,7 +452,7 @@ describe("LexItemSet::transitions()", [&]() {
CharacterSet().include_all().exclude('/').exclude('\\'),
Transition{
LexItemSet({
LexItem(Symbol(1), character({ '/' })),
LexItem(Symbol(1, Symbol::NonTerminal), character({ '/' })),
}),
PrecedenceRange(),
false
@ -462,8 +462,8 @@ describe("LexItemSet::transitions()", [&]() {
CharacterSet().include('\\'),
Transition{
LexItemSet({
LexItem(Symbol(1), character({ '/' })),
LexItem(Symbol(1), seq({ character({ '/' }), character({ '/' }) })),
LexItem(Symbol(1, Symbol::NonTerminal), character({ '/' })),
LexItem(Symbol(1, Symbol::NonTerminal), seq({ character({ '/' }), character({ '/' }) })),
}),
PrecedenceRange(),
false
@ -474,8 +474,8 @@ describe("LexItemSet::transitions()", [&]() {
it("handles different items with overlapping character sets", [&]() {
LexItemSet set1({
LexItem(Symbol(1), character({ 'a', 'b', 'c', 'd', 'e', 'f' })),
LexItem(Symbol(2), character({ 'e', 'f', 'g', 'h', 'i' }))
LexItem(Symbol(1, Symbol::NonTerminal), character({ 'a', 'b', 'c', 'd', 'e', 'f' })),
LexItem(Symbol(2, Symbol::NonTerminal), character({ 'e', 'f', 'g', 'h', 'i' }))
});
AssertThat(set1.transitions(), Equals(LexItemSet::TransitionMap({
@ -483,7 +483,7 @@ describe("LexItemSet::transitions()", [&]() {
CharacterSet().include('a', 'd'),
Transition{
LexItemSet({
LexItem(Symbol(1), blank()),
LexItem(Symbol(1, Symbol::NonTerminal), blank()),
}),
PrecedenceRange(),
false
@ -493,8 +493,8 @@ describe("LexItemSet::transitions()", [&]() {
CharacterSet().include('e', 'f'),
Transition{
LexItemSet({
LexItem(Symbol(1), blank()),
LexItem(Symbol(2), blank()),
LexItem(Symbol(1, Symbol::NonTerminal), blank()),
LexItem(Symbol(2, Symbol::NonTerminal), blank()),
}),
PrecedenceRange(),
false
@ -504,7 +504,7 @@ describe("LexItemSet::transitions()", [&]() {
CharacterSet().include('g', 'i'),
Transition{
LexItemSet({
LexItem(Symbol(2), blank()),
LexItem(Symbol(2, Symbol::NonTerminal), blank()),
}),
PrecedenceRange(),
false


@ -27,26 +27,26 @@ describe("ParseItemSetBuilder", []() {
SyntaxGrammar grammar{{
SyntaxVariable("rule0", VariableTypeNamed, {
Production({
{Symbol(1), 0, AssociativityNone},
{Symbol(11, true), 0, AssociativityNone},
{Symbol(1, Symbol::NonTerminal), 0, AssociativityNone},
{Symbol(11, Symbol::Terminal), 0, AssociativityNone},
}),
}),
SyntaxVariable("rule1", VariableTypeNamed, {
Production({
{Symbol(12, true), 0, AssociativityNone},
{Symbol(13, true), 0, AssociativityNone},
{Symbol(12, Symbol::Terminal), 0, AssociativityNone},
{Symbol(13, Symbol::Terminal), 0, AssociativityNone},
}),
Production({
{Symbol(2), 0, AssociativityNone},
{Symbol(2, Symbol::NonTerminal), 0, AssociativityNone},
})
}),
SyntaxVariable("rule2", VariableTypeNamed, {
Production({
{Symbol(14, true), 0, AssociativityNone},
{Symbol(15, true), 0, AssociativityNone},
{Symbol(14, Symbol::Terminal), 0, AssociativityNone},
{Symbol(15, Symbol::Terminal), 0, AssociativityNone},
})
}),
}, {}, {}};
}, {}, {}, {}};
auto production = [&](int variable_index, int production_index) -> const Production & {
return grammar.variables[variable_index].productions[production_index];
@ -54,8 +54,8 @@ describe("ParseItemSetBuilder", []() {
ParseItemSet item_set({
{
ParseItem(Symbol(0), production(0, 0), 0),
LookaheadSet({ 10 }),
ParseItem(Symbol(0, Symbol::NonTerminal), production(0, 0), 0),
LookaheadSet({ Symbol(10, Symbol::Terminal) }),
}
});
@ -64,20 +64,20 @@ describe("ParseItemSetBuilder", []() {
AssertThat(item_set, Equals(ParseItemSet({
{
ParseItem(Symbol(0), production(0, 0), 0),
LookaheadSet({ 10 })
ParseItem(Symbol(0, Symbol::NonTerminal), production(0, 0), 0),
LookaheadSet({ Symbol(10, Symbol::Terminal) })
},
{
ParseItem(Symbol(1, Symbol::NonTerminal), production(1, 0), 0),
LookaheadSet({ Symbol(11, Symbol::Terminal) })
},
{
ParseItem(Symbol(1), production(1, 0), 0),
LookaheadSet({ 11 })
ParseItem(Symbol(1, Symbol::NonTerminal), production(1, 1), 0),
LookaheadSet({ Symbol(11, Symbol::Terminal) })
},
{
ParseItem(Symbol(1), production(1, 1), 0),
LookaheadSet({ 11 })
},
{
ParseItem(Symbol(2), production(2, 0), 0),
LookaheadSet({ 11 })
ParseItem(Symbol(2, Symbol::NonTerminal), production(2, 0), 0),
LookaheadSet({ Symbol(11, Symbol::Terminal) })
},
})));
});
@ -86,18 +86,18 @@ describe("ParseItemSetBuilder", []() {
SyntaxGrammar grammar{{
SyntaxVariable("rule0", VariableTypeNamed, {
Production({
{Symbol(1), 0, AssociativityNone},
{Symbol(11, true), 0, AssociativityNone},
{Symbol(1, Symbol::NonTerminal), 0, AssociativityNone},
{Symbol(11, Symbol::Terminal), 0, AssociativityNone},
}),
}),
SyntaxVariable("rule1", VariableTypeNamed, {
Production({
{Symbol(12, true), 0, AssociativityNone},
{Symbol(13, true), 0, AssociativityNone},
{Symbol(12, Symbol::Terminal), 0, AssociativityNone},
{Symbol(13, Symbol::Terminal), 0, AssociativityNone},
}),
Production({})
}),
}, {}, {}};
}, {}, {}, {}};
auto production = [&](int variable_index, int production_index) -> const Production & {
return grammar.variables[variable_index].productions[production_index];
@ -105,8 +105,8 @@ describe("ParseItemSetBuilder", []() {
ParseItemSet item_set({
{
ParseItem(Symbol(0), production(0, 0), 0),
LookaheadSet({ 10 }),
ParseItem(Symbol(0, Symbol::NonTerminal), production(0, 0), 0),
LookaheadSet({ Symbol(10, Symbol::Terminal) }),
}
});
@ -115,16 +115,16 @@ describe("ParseItemSetBuilder", []() {
AssertThat(item_set, Equals(ParseItemSet({
{
ParseItem(Symbol(0), production(0, 0), 0),
LookaheadSet({ 10 })
ParseItem(Symbol(0, Symbol::NonTerminal), production(0, 0), 0),
LookaheadSet({ Symbol(10, Symbol::Terminal) })
},
{
ParseItem(Symbol(1), production(1, 0), 0),
LookaheadSet({ 11 })
ParseItem(Symbol(1, Symbol::NonTerminal), production(1, 0), 0),
LookaheadSet({ Symbol(11, Symbol::Terminal) })
},
{
ParseItem(Symbol(1), production(1, 1), 0),
LookaheadSet({ 11 })
ParseItem(Symbol(1, Symbol::NonTerminal), production(1, 1), 0),
LookaheadSet({ Symbol(11, Symbol::Terminal) })
},
})));
});


@ -13,7 +13,7 @@ describe("expand_repeats", []() {
it("replaces repeat rules with pairs of recursive rules", [&]() {
InitialSyntaxGrammar grammar{{
Variable("rule0", VariableTypeNamed, repeat1(i_token(0))),
}, {}, {}};
}, {}, {}, {}};
auto result = expand_repeats(grammar);
@ -32,7 +32,7 @@ describe("expand_repeats", []() {
i_token(10),
repeat1(i_token(11)),
})),
}, {}, {}};
}, {}, {}, {}};
auto result = expand_repeats(grammar);
@ -54,7 +54,7 @@ describe("expand_repeats", []() {
i_token(10),
repeat1(i_token(11))
})),
}, {}, {}};
}, {}, {}, {}};
auto result = expand_repeats(grammar);
@ -80,7 +80,7 @@ describe("expand_repeats", []() {
i_token(3),
repeat1(i_token(4))
})),
}, {}, {}};
}, {}, {}, {}};
auto result = expand_repeats(grammar);
@ -106,7 +106,7 @@ describe("expand_repeats", []() {
repeat1(i_token(10)),
repeat1(i_token(11)),
})),
}, {}, {}};
}, {}, {}, {}};
auto result = expand_repeats(grammar);
@ -130,7 +130,7 @@ describe("expand_repeats", []() {
InitialSyntaxGrammar grammar{{
Variable("rule0", VariableTypeNamed, repeat1(i_token(10))),
Variable("rule1", VariableTypeNamed, repeat1(i_token(11))),
}, {}, {}};
}, {}, {}, {}};
auto result = expand_repeats(grammar);


@ -5,6 +5,7 @@
#include "compiler/prepare_grammar/extract_tokens.h"
#include "helpers/rule_helpers.h"
#include "helpers/equals_pointer.h"
#include "helpers/stream_methods.h"
START_TEST
@ -28,7 +29,7 @@ describe("extract_tokens", []() {
Variable("rule_B", VariableTypeNamed, pattern("ij+")),
Variable("rule_C", VariableTypeNamed, choice({ str("kl"), blank() })),
Variable("rule_D", VariableTypeNamed, repeat1(i_sym(3)))
}, {}, {}});
}, {}, {}, {}});
InitialSyntaxGrammar &syntax_grammar = get<0>(result);
LexicalGrammar &lexical_grammar = get<1>(result);
@ -91,7 +92,7 @@ describe("extract_tokens", []() {
i_sym(0),
str("ab"),
})),
}, {}, {}});
}, {}, {}, {}});
InitialSyntaxGrammar &syntax_grammar = get<0>(result);
LexicalGrammar &lexical_grammar = get<1>(result);
@ -110,7 +111,7 @@ describe("extract_tokens", []() {
Variable("rule_A", VariableTypeNamed, seq({ i_sym(1), str("ab") })),
Variable("rule_B", VariableTypeNamed, str("cd")),
Variable("rule_C", VariableTypeNamed, seq({ str("ef"), str("cd") })),
}, {}, {}});
}, {}, {}, {}});
InitialSyntaxGrammar &syntax_grammar = get<0>(result);
LexicalGrammar &lexical_grammar = get<1>(result);
@ -129,17 +130,26 @@ describe("extract_tokens", []() {
});
it("renumbers the grammar's expected conflict symbols based on any moved rules", [&]() {
auto result = extract_tokens(InternedGrammar{{
Variable("rule_A", VariableTypeNamed, str("ok")),
Variable("rule_B", VariableTypeNamed, repeat(i_sym(0))),
Variable("rule_C", VariableTypeNamed, repeat(seq({ i_sym(0), i_sym(0) }))),
}, { str(" ") }, { { Symbol(1), Symbol(2) } }});
auto result = extract_tokens(InternedGrammar{
{
Variable("rule_A", VariableTypeNamed, str("ok")),
Variable("rule_B", VariableTypeNamed, repeat(i_sym(0))),
Variable("rule_C", VariableTypeNamed, repeat(seq({ i_sym(0), i_sym(0) }))),
},
{
str(" ")
},
{
{ Symbol(1, Symbol::NonTerminal), Symbol(2, Symbol::NonTerminal) }
},
{}
});
InitialSyntaxGrammar &syntax_grammar = get<0>(result);
AssertThat(syntax_grammar.variables.size(), Equals<size_t>(2));
AssertThat(syntax_grammar.expected_conflicts, Equals(set<set<Symbol>>({
{ Symbol(0), Symbol(1) },
{ Symbol(0, Symbol::NonTerminal), Symbol(1, Symbol::NonTerminal) },
})));
});
@ -150,7 +160,7 @@ describe("extract_tokens", []() {
}, {
str("y"),
pattern("\\s+"),
}, {}});
}, {}, {}});
AssertThat(get<2>(result), Equals(CompileError::none()));
@ -167,11 +177,11 @@ describe("extract_tokens", []() {
Variable("rule_B", VariableTypeNamed, str("y")),
}, {
str("y"),
}, {}});
}, {}, {}});
AssertThat(get<2>(result), Equals(CompileError::none()));
AssertThat(get<1>(result).separators.size(), Equals<size_t>(0));
AssertThat(get<0>(result).extra_tokens, Equals(set<Symbol>({ Symbol(1, true) })));
AssertThat(get<0>(result).extra_tokens, Equals(set<Symbol>({ Symbol(1, Symbol::Terminal) })));
});
it("updates extra symbols according to the new symbol numbers", [&]() {
@ -181,12 +191,12 @@ describe("extract_tokens", []() {
Variable("rule_C", VariableTypeNamed, str("z")),
}, {
i_sym(2),
}, {}});
}, {}, {}});
AssertThat(get<2>(result), Equals(CompileError::none()));
AssertThat(get<0>(result).extra_tokens, Equals(set<Symbol>({
{ Symbol(3, true) },
{ Symbol(3, Symbol::Terminal) },
})));
AssertThat(get<1>(result).separators, IsEmpty());
@ -196,11 +206,11 @@ describe("extract_tokens", []() {
auto result = extract_tokens(InternedGrammar{{
Variable("rule_A", VariableTypeNamed, seq({ str("x"), i_sym(1) })),
Variable("rule_B", VariableTypeNamed, seq({ str("y"), str("z") })),
}, { i_sym(1) }, {}});
}, { i_sym(1) }, {}, {}});
AssertThat(get<2>(result), !Equals(CompileError::none()));
AssertThat(get<2>(result), Equals(
CompileError(TSCompileErrorTypeInvalidUbiquitousToken,
CompileError(TSCompileErrorTypeInvalidExtraToken,
"Not a token: rule_B")));
});
@ -208,14 +218,34 @@ describe("extract_tokens", []() {
auto result = extract_tokens(InternedGrammar{{
Variable("rule_A", VariableTypeNamed, str("x")),
Variable("rule_B", VariableTypeNamed, str("y")),
}, { choice({ i_sym(1), blank() }) }, {}});
}, { choice({ i_sym(1), blank() }) }, {}, {}});
AssertThat(get<2>(result), !Equals(CompileError::none()));
AssertThat(get<2>(result), Equals(
CompileError(TSCompileErrorTypeInvalidUbiquitousToken,
"Not a token: (choice (sym 1) (blank))")));
AssertThat(get<2>(result), Equals(CompileError(
TSCompileErrorTypeInvalidExtraToken,
"Not a token: (choice (non-terminal 1) (blank))"
)));
});
});
it("returns an error if an external token has the same name as a non-terminal rule", [&]() {
auto result = extract_tokens(InternedGrammar{
{
Variable("rule_A", VariableTypeNamed, seq({ str("x"), i_sym(1) })),
Variable("rule_B", VariableTypeNamed, seq({ str("y"), str("z") })),
},
{},
{},
{
ExternalToken {"rule_A", VariableTypeNamed, Symbol(0, Symbol::NonTerminal)}
}
});
AssertThat(get<2>(result), Equals(CompileError(
TSCompileErrorTypeInvalidExternalToken,
"Name 'rule_A' cannot be used for both an external token and a non-terminal rule"
)));
});
});
END_TEST


@ -36,19 +36,19 @@ describe("flatten_grammar", []() {
AssertThat(result.type, Equals(VariableTypeNamed));
AssertThat(result.productions, Equals(vector<Production>({
Production({
{Symbol(1), 0, AssociativityNone},
{Symbol(2), 101, AssociativityLeft},
{Symbol(3), 102, AssociativityRight},
{Symbol(4), 101, AssociativityLeft},
{Symbol(6), 0, AssociativityNone},
{Symbol(7), 0, AssociativityNone},
{Symbol(1, Symbol::NonTerminal), 0, AssociativityNone},
{Symbol(2, Symbol::NonTerminal), 101, AssociativityLeft},
{Symbol(3, Symbol::NonTerminal), 102, AssociativityRight},
{Symbol(4, Symbol::NonTerminal), 101, AssociativityLeft},
{Symbol(6, Symbol::NonTerminal), 0, AssociativityNone},
{Symbol(7, Symbol::NonTerminal), 0, AssociativityNone},
}),
Production({
{Symbol(1), 0, AssociativityNone},
{Symbol(2), 101, AssociativityLeft},
{Symbol(5), 101, AssociativityLeft},
{Symbol(6), 0, AssociativityNone},
{Symbol(7), 0, AssociativityNone},
{Symbol(1, Symbol::NonTerminal), 0, AssociativityNone},
{Symbol(2, Symbol::NonTerminal), 101, AssociativityLeft},
{Symbol(5, Symbol::NonTerminal), 101, AssociativityLeft},
{Symbol(6, Symbol::NonTerminal), 0, AssociativityNone},
{Symbol(7, Symbol::NonTerminal), 0, AssociativityNone},
})
})));
});
@ -65,8 +65,8 @@ describe("flatten_grammar", []() {
AssertThat(result.productions, Equals(vector<Production>({
Production({
{Symbol(1), 101, AssociativityLeft},
{Symbol(2), 101, AssociativityLeft},
{Symbol(1, Symbol::NonTerminal), 101, AssociativityLeft},
{Symbol(2, Symbol::NonTerminal), 101, AssociativityLeft},
})
})));
@ -80,7 +80,7 @@ describe("flatten_grammar", []() {
AssertThat(result.productions, Equals(vector<Production>({
Production({
{Symbol(1), 101, AssociativityLeft},
{Symbol(1, Symbol::NonTerminal), 101, AssociativityLeft},
})
})));
});


@ -3,8 +3,10 @@
#include "compiler/grammar.h"
#include "compiler/rules/named_symbol.h"
#include "compiler/rules/symbol.h"
#include "compiler/rules/built_in_symbols.h"
#include "helpers/equals_pointer.h"
#include "helpers/rule_helpers.h"
#include "helpers/stream_methods.h"
START_TEST
@ -17,7 +19,7 @@ describe("intern_symbols", []() {
{ "x", choice({ sym("y"), sym("_z") }) },
{ "y", sym("_z") },
{ "_z", str("stuff") }
}, {}, {}};
}, {}, {}, {}};
auto result = intern_symbols(grammar);
@ -33,7 +35,7 @@ describe("intern_symbols", []() {
it("returns an error", []() {
Grammar grammar{{
{ "x", sym("y") },
}, {}, {}};
}, {}, {}, {}};
auto result = intern_symbols(grammar);
@ -48,7 +50,7 @@ describe("intern_symbols", []() {
{ "z", str("stuff") }
}, {
sym("z")
}, {}};
}, {}, {}};
auto result = intern_symbols(grammar);
@ -56,6 +58,32 @@ describe("intern_symbols", []() {
AssertThat(result.first.extra_tokens.size(), Equals<size_t>(1));
AssertThat(*result.first.extra_tokens.begin(), EqualsPointer(i_sym(2)));
});
it("records any rule names that match external token names", [&]() {
Grammar grammar{{
{ "x", choice({ sym("y"), sym("z") }) },
{ "y", sym("z") },
{ "z", str("stuff") }
}, {}, {}, {
"w",
"z"
}};
auto result = intern_symbols(grammar);
AssertThat(result.first.external_tokens, Equals(vector<ExternalToken>({
{
"w",
VariableTypeNamed,
rules::NONE()
},
{
"z",
VariableTypeNamed,
Symbol(2, Symbol::NonTerminal)
}
})));
});
});
END_TEST


@ -9,7 +9,7 @@ START_TEST
describe("Repeat", []() {
describe("constructing repeats", [&]() {
it("doesn't create redundant repeats", [&]() {
auto sym = make_shared<Symbol>(1);
auto sym = make_shared<Symbol>(1, Symbol::NonTerminal);
auto repeat = Repeat::build(sym);
auto outer_repeat = Repeat::build(repeat);


@ -0,0 +1,29 @@
==========================================
errors in if statements
==========================================
if a is:
print b
print c
---
(module
(if_statement (identifier) (ERROR)
(print_statement (identifier))
(print_statement (identifier))))
==========================================
errors in function definitions
==========================================
def a()::
b
c
---
(module
(function_definition (identifier) (parameters) (ERROR)
(expression_statement (identifier))
(expression_statement (identifier))))


@ -0,0 +1,42 @@
#include <tree_sitter/parser.h>
enum {
COMMENT,
};
void *tree_sitter_extra_external_tokens_external_scanner_create() {
return NULL;
}
void tree_sitter_extra_external_tokens_external_scanner_reset(void *payload) {
}
bool tree_sitter_extra_external_tokens_external_scanner_serialize(void *payload, TSExternalTokenState state) {
return true;
}
void tree_sitter_extra_external_tokens_external_scanner_deserialize(void *payload, TSExternalTokenState state) {
}
bool tree_sitter_extra_external_tokens_external_scanner_scan(
void *payload, TSLexer *lexer, const bool *whitelist) {
while (lexer->lookahead == ' ') {
lexer->advance(lexer, true);
}
if (lexer->lookahead == '#') {
lexer->advance(lexer, false);
while (lexer->lookahead != '\n') {
lexer->advance(lexer, false);
}
lexer->result_symbol = COMMENT;
return true;
}
return false;
}
void tree_sitter_extra_external_tokens_external_scanner_destroy(void *payload) {
}


@ -0,0 +1,118 @@
#include <stdbool.h>
#include <stdlib.h>
#include <tree_sitter/parser.h>
enum {
percent_string,
percent_string_start,
percent_string_end
};
typedef struct {
int32_t open_delimiter;
int32_t close_delimiter;
uint32_t depth;
} Scanner;
void *tree_sitter_external_scanner_example_external_scanner_create() {
Scanner *scanner = malloc(sizeof(Scanner));
*scanner = (Scanner){
.open_delimiter = 0,
.close_delimiter = 0,
.depth = 0
};
return scanner;
}
bool tree_sitter_external_scanner_example_external_scanner_scan(
void *payload, TSLexer *lexer, const bool *whitelist) {
Scanner *scanner = payload;
if (whitelist[percent_string]) {
while (lexer->lookahead == ' ' ||
lexer->lookahead == '\t' ||
lexer->lookahead == '\n') {
lexer->advance(lexer, true);
}
if (lexer->lookahead != '%') return false;
lexer->advance(lexer, false);
switch (lexer->lookahead) {
case '(':
scanner->open_delimiter = '(';
scanner->close_delimiter = ')';
scanner->depth = 1;
break;
case '[':
scanner->open_delimiter = '[';
scanner->close_delimiter = ']';
scanner->depth = 1;
break;
case '{':
scanner->open_delimiter = '{';
scanner->close_delimiter = '}';
scanner->depth = 1;
break;
default:
return false;
}
lexer->advance(lexer, false);
for (;;) {
if (scanner->depth == 0) {
lexer->result_symbol = percent_string;
return true;
}
if (lexer->lookahead == scanner->open_delimiter) {
scanner->depth++;
} else if (lexer->lookahead == scanner->close_delimiter) {
scanner->depth--;
} else if (lexer->lookahead == '#') {
lexer->advance(lexer, false);
if (lexer->lookahead == '{') {
lexer->advance(lexer, false);
lexer->result_symbol = percent_string_start;
return true;
}
}
lexer->advance(lexer, false);
}
} else if (whitelist[percent_string_end]) {
if (lexer->lookahead != '}') return false;
lexer->advance(lexer, false);
for (;;) {
if (scanner->depth == 0) {
lexer->result_symbol = percent_string_end;
return true;
}
if (lexer->lookahead == scanner->open_delimiter) {
scanner->depth++;
} else if (lexer->lookahead == scanner->close_delimiter) {
scanner->depth--;
}
lexer->advance(lexer, false);
}
}
return false;
}
void tree_sitter_external_scanner_example_external_scanner_reset(void *payload) {
}
bool tree_sitter_external_scanner_example_external_scanner_serialize(void *payload, TSExternalTokenState state) {
return true;
}
void tree_sitter_external_scanner_example_external_scanner_deserialize(void *payload, TSExternalTokenState state) {
}
void tree_sitter_external_scanner_example_external_scanner_destroy(void *payload) {
free(payload);
}
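Note that this scanner keeps real state (the delimiters and nesting depth) but leaves serialize/deserialize as no-ops, so that state would not survive an incremental re-parse that restores the scanner. Since Scanner occupies 12 bytes and TSExternalTokenState is 16 (per the parser.h change above), a stateful variant could replace the two no-ops with something like this sketch (an illustration, not part of this commit):

#include <string.h>

bool tree_sitter_external_scanner_example_external_scanner_serialize(void *payload, TSExternalTokenState state) {
  memcpy(state, payload, sizeof(Scanner));  // Scanner fits within the 16 bytes
  return true;
}

void tree_sitter_external_scanner_example_external_scanner_deserialize(void *payload, TSExternalTokenState state) {
  memcpy(payload, state, sizeof(Scanner));
}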


@ -0,0 +1,63 @@
#include <stdbool.h>
#include <tree_sitter/parser.h>
enum {
STRING,
LINE_BREAK
};
void *tree_sitter_shared_external_tokens_external_scanner_create() {
return NULL;
}
void tree_sitter_shared_external_tokens_external_scanner_reset(void *payload) {
}
bool tree_sitter_shared_external_tokens_external_scanner_serialize(void *payload, TSExternalTokenState state) {
return true;
}
void tree_sitter_shared_external_tokens_external_scanner_deserialize(void *payload, TSExternalTokenState state) {
}
bool tree_sitter_shared_external_tokens_external_scanner_scan(
void *payload, TSLexer *lexer, const bool *whitelist) {
// If a line-break is a valid lookahead token, only skip spaces.
if (whitelist[LINE_BREAK]) {
while (lexer->lookahead == ' ') {
lexer->advance(lexer, true);
}
if (lexer->lookahead == '\n') {
lexer->advance(lexer, false);
lexer->result_symbol = LINE_BREAK;
return true;
}
}
// If a line-break is not a valid lookahead token, skip line breaks as well
// as spaces.
if (whitelist[STRING]) {
while (lexer->lookahead == ' ' || lexer->lookahead == '\n') {
lexer->advance(lexer, true);
}
if (lexer->lookahead == '\'') {
lexer->advance(lexer, false);
while (lexer->lookahead != '\'') {
lexer->advance(lexer, false);
}
lexer->advance(lexer, false);
lexer->result_symbol = STRING;
return true;
}
}
return false;
}
void tree_sitter_shared_external_tokens_external_scanner_destroy(void *payload) {
}

spec/helpers/dedent.h

@ -0,0 +1,12 @@
#include "compiler/util/string_helpers.h"
#include <string>
static std::string dedent(std::string input) {
size_t indent_level = input.find_first_not_of("\n ") - input.find_first_not_of("\n");
std::string whitespace = "\n" + std::string(indent_level, ' ');
tree_sitter::util::str_replace(&input, whitespace, "\n");
size_t first = input.find_first_not_of("\n ");
// substr takes (position, length), so subtract the starting offset.
return input.substr(first, input.find_last_not_of("\n ") + 1 - first);
}


@ -28,10 +28,11 @@ const char *libcompiler_path =
"out/Test/libcompiler.a";
#endif
static std::string run_cmd(const char *cmd, const char *args[]) {
static std::string run_command(const char *cmd, const char *args[]) {
int child_pid = fork();
if (child_pid < 0)
if (child_pid < 0) {
return "fork failed";
}
if (child_pid == 0) {
close(0);
@ -39,7 +40,6 @@ static std::string run_cmd(const char *cmd, const char *args[]) {
dup2(2, 1);
dup2(1, 2);
execvp(cmd, (char * const * )args);
return "";
}
int status;
@ -47,12 +47,16 @@ static std::string run_cmd(const char *cmd, const char *args[]) {
waitpid(child_pid, &status, 0);
} while (!WIFEXITED(status));
if (WEXITSTATUS(status) == 0)
if (WEXITSTATUS(status) == 0) {
return "";
else
} else {
return "command failed";
}
}
return "";
static bool file_exists(const string &path) {
struct stat file_stat;
return stat(path.c_str(), &file_stat) == 0;
}
static int get_modified_time(const string &path) {
@ -67,46 +71,46 @@ static int get_modified_time(const string &path) {
const TSLanguage *load_language(const string &source_filename,
const string &lib_filename,
const string &language_name) {
string language_function_name = "ts_language_" + language_name;
const string &language_name,
string external_scanner_filename = "") {
string language_function_name = "tree_sitter_" + language_name;
string header_dir = getenv("PWD") + string("/include");
int source_mtime = get_modified_time(source_filename);
int header_mtime = get_modified_time(header_dir + "/tree_sitter/parser.h");
int lib_mtime = get_modified_time(lib_filename);
int external_scanner_mtime = get_modified_time(external_scanner_filename);
if (!header_mtime || lib_mtime < header_mtime || lib_mtime < source_mtime) {
string obj_filename = lib_filename + ".o";
const char *compiler_name = getenv("CC");
if (!compiler_name) {
compiler_name = "gcc";
}
if (!header_mtime || lib_mtime < header_mtime || lib_mtime < source_mtime ||
lib_mtime < external_scanner_mtime) {
const char *compiler_name = getenv("CXX");
if (!compiler_name) compiler_name = "c++";
const char *compile_argv[] = {
compiler_name,
"-x", "c",
"-fPIC",
"-g",
"-I", header_dir.c_str(),
"-c", source_filename.c_str(),
"-o", obj_filename.c_str(),
NULL
};
string compile_error = run_cmd("gcc", compile_argv);
if (!compile_error.empty()) {
AssertThat(string(compile_error), IsEmpty());
return nullptr;
}
const char *link_argv[] = {
vector<const char *> compile_args = {
compiler_name,
"-shared",
"-Wl", obj_filename.c_str(),
"-fPIC",
"-I", header_dir.c_str(),
"-o", lib_filename.c_str(),
NULL
"-x", "c",
source_filename.c_str()
};
string link_error = run_cmd("gcc", link_argv);
if (!link_error.empty()) {
AssertThat(link_error, IsEmpty());
if (!external_scanner_filename.empty()) {
compile_args.push_back("-g");
string extension = external_scanner_filename.substr(external_scanner_filename.rfind("."));
if (extension == ".c") {
compile_args.push_back("-xc");
} else {
compile_args.push_back("-xc++");
}
compile_args.push_back(external_scanner_filename.c_str());
}
compile_args.push_back(nullptr);
string compile_error = run_command(compiler_name, compile_args.data());
if (!compile_error.empty()) {
AssertThat(string(compile_error), IsEmpty());
return nullptr;
}
}
@ -118,19 +122,19 @@ const TSLanguage *load_language(const string &source_filename,
return nullptr;
}
void *symbol_value = dlsym(parser_lib, language_function_name.c_str());
if (!symbol_value) {
void *language_function = dlsym(parser_lib, language_function_name.c_str());
if (!language_function) {
std::string message(dlerror());
AssertThat(message, IsEmpty());
return nullptr;
}
typedef TSLanguage * (* LanguageFunction)();
LanguageFunction language_fn = reinterpret_cast<LanguageFunction>(symbol_value);
return language_fn();
return reinterpret_cast<TSLanguage *(*)()>(language_function)();
}
const TSLanguage *load_compile_result(const string &name, const TSCompileResult &compile_result) {
const TSLanguage *load_compile_result(const string &name,
const TSCompileResult &compile_result,
string external_scanner_path) {
if (compile_result.error_type != TSCompileErrorTypeNone) {
Assert::Failure(string("Compilation failed ") + compile_result.error_message);
return nullptr;
@ -146,7 +150,7 @@ const TSLanguage *load_compile_result(const string &name, const TSCompileResult
source_file << compile_result.code;
source_file.close();
const TSLanguage *language = load_language(source_filename, lib_filename, name);
auto language = load_language(source_filename, lib_filename, name, external_scanner_path);
free(compile_result.code);
return language;
}
@ -158,6 +162,10 @@ const TSLanguage *get_test_language(const string &language_name) {
string language_dir = string("spec/fixtures/grammars/") + language_name;
string grammar_filename = language_dir + "/src/grammar.json";
string parser_filename = language_dir + "/src/parser.c";
string external_scanner_filename = language_dir + "/src/scanner.cc";
if (!file_exists(external_scanner_filename)) {
external_scanner_filename = "";
}
int grammar_mtime = get_modified_time(grammar_filename);
if (!grammar_mtime)
@ -192,7 +200,7 @@ const TSLanguage *get_test_language(const string &language_name) {
mkdir("out/tmp", 0777);
string lib_filename = "out/tmp/" + language_name + ".so";
const TSLanguage *language = load_language(parser_filename, lib_filename, language_name);
const TSLanguage *language = load_language(parser_filename, lib_filename, language_name, external_scanner_filename);
loaded_languages[language_name] = language;
return language;
};


@ -5,7 +5,8 @@
#include "tree_sitter/runtime.h"
#include <string>
const TSLanguage *load_compile_result(const std::string &, const TSCompileResult &);
const TSLanguage *load_compile_result(const std::string &, const TSCompileResult &,
std::string external_scanner_path = "");
const TSLanguage *get_test_language(const std::string &language_name);
#endif // HELPERS_LOAD_LANGUAGE_H_


@ -15,7 +15,9 @@ bool operator==(const TSRange &left, const TSRange &right) {
}
bool operator==(const Length &left, const Length &right) {
return length_eq(left, right);
return left.bytes == right.bytes &&
left.chars == right.chars &&
left.extent == right.extent;
}
bool operator<(const TSPoint &left, const TSPoint &right) {


@ -9,6 +9,7 @@ namespace tree_sitter {
using std::ostream;
using std::string;
using std::to_string;
using rules::Symbol;
rule_ptr character(const set<uint32_t> &ranges) {
return character(ranges, true);
@ -28,11 +29,11 @@ namespace tree_sitter {
}
rule_ptr i_sym(size_t index) {
return make_shared<rules::Symbol>(index);
return make_shared<Symbol>(index, Symbol::NonTerminal);
}
rule_ptr i_token(size_t index) {
return make_shared<rules::Symbol>(index, true);
return make_shared<Symbol>(index, Symbol::Terminal);
}
rule_ptr metadata(rule_ptr rule, rules::MetadataParams params) {


@ -23,20 +23,21 @@ static void append_to_scope_sequence(ScopeSequence *sequence,
ScopeStack *current_scopes,
TSNode node, TSDocument *document,
const std::string &text) {
append_text_to_scope_sequence(sequence, current_scopes, text, ts_node_start_byte(node) - sequence->size());
append_text_to_scope_sequence(
sequence, current_scopes, text, ts_node_start_byte(node) - sequence->size()
);
string scope = ts_node_type(node, document);
current_scopes->push_back(scope);
size_t child_count = ts_node_child_count(node);
if (child_count > 0) {
for (size_t i = 0; i < child_count; i++) {
TSNode child = ts_node_child(node, i);
append_to_scope_sequence(sequence, current_scopes, child, document, text);
}
} else {
size_t length = ts_node_end_byte(node) - ts_node_start_byte(node);
append_text_to_scope_sequence(sequence, current_scopes, text, length);
current_scopes->push_back(ts_node_type(node, document));
for (size_t i = 0, n = ts_node_child_count(node); i < n; i++) {
TSNode child = ts_node_child(node, i);
append_to_scope_sequence(sequence, current_scopes, child, document, text);
}
append_text_to_scope_sequence(
sequence, current_scopes, text, ts_node_end_byte(node) - sequence->size()
);
current_scopes->pop_back();
}


@ -10,16 +10,7 @@ namespace tree_sitter {
ostream &operator<<(ostream &stream, const Grammar &grammar) {
stream << string("#<grammar");
stream << string(" rules: {");
bool started = false;
for (auto pair : grammar.rules) {
if (started)
stream << string(", ");
stream << pair.first;
stream << string(" => ");
stream << pair.second;
started = true;
}
stream << " rules: " << grammar.rules;
return stream << string("}>");
}
@ -85,6 +76,11 @@ ostream &operator<<(ostream &stream, const ParseState &state) {
return stream << string(">");
}
ostream &operator<<(ostream &stream, const ExternalToken &external_token) {
return stream << "{" << external_token.name << ", " << external_token.type <<
"," << external_token.corresponding_internal_token << "}";
}
ostream &operator<<(ostream &stream, const ProductionStep &step) {
stream << "(symbol: " << step.symbol << ", precedence:" << to_string(step.precedence);
stream << ", associativity: ";


@ -97,6 +97,7 @@ struct AdvanceAction;
struct AcceptTokenAction;
class ParseAction;
class ParseState;
struct ExternalToken;
struct ProductionStep;
struct PrecedenceRange;
@ -110,6 +111,7 @@ ostream &operator<<(ostream &, const AdvanceAction &);
ostream &operator<<(ostream &, const AcceptTokenAction &);
ostream &operator<<(ostream &, const ParseAction &);
ostream &operator<<(ostream &, const ParseState &);
ostream &operator<<(ostream &, const ExternalToken &);
ostream &operator<<(ostream &, const ProductionStep &);
ostream &operator<<(ostream &, const PrecedenceRange &);


@ -1,19 +1,11 @@
#include "spec_helper.h"
#include "runtime/alloc.h"
#include "helpers/load_language.h"
#include "helpers/stderr_logger.h"
#include "helpers/dedent.h"
#include "compiler/util/string_helpers.h"
#include <map>
static string dedent(string input) {
size_t indent_level = input.find_first_not_of("\n ") - input.find_first_not_of("\n");
string whitespace = "\n" + string(indent_level, ' ');
util::str_replace(&input, whitespace, "\n");
return input.substr(
input.find_first_not_of("\n "),
input.find_last_not_of("\n ") + 1
);
}
static string fill_template(string input, map<string, string> parameters) {
string result = input;
for (const auto &pair : parameters) {
@ -507,6 +499,190 @@ describe("compile_grammar", []() {
});
});
describe("external scanners", [&]() {
it("can tokenize using arbitrary user-defined scanner functions", [&]() {
string grammar = R"JSON({
"name": "external_scanner_example",
"externals": [
"_percent_string",
"_percent_string_start",
"_percent_string_end"
],
"extras": [
{"type": "PATTERN", "value": "\\s"}
],
"rules": {
"expression": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "string"},
{"type": "SYMBOL", "name": "sum"},
{"type": "SYMBOL", "name": "identifier"}
]
},
"sum": {
"type": "PREC_LEFT",
"value": 0,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "+"},
{"type": "SYMBOL", "name": "expression"}
]
}
},
"string": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "_percent_string"},
{
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "_percent_string_start"},
{"type": "SYMBOL", "name": "expression"},
{"type": "SYMBOL", "name": "_percent_string_end"}
]
}
]
},
"identifier": {
"type": "PATTERN",
"value": "\\a+"
}
}
})JSON";
TSCompileResult result = ts_compile_grammar(grammar.c_str());
AssertThat(result.error_message, IsNull());
ts_document_set_language(document, load_compile_result(
"external_scanner_example",
result,
"spec/fixtures/external_scanners/percent_strings.c"
));
ts_document_set_input_string(document, "x + %(sup (external) scanner?)");
ts_document_parse(document);
assert_root_node("(expression (sum (expression (identifier)) (expression (string))))");
ts_document_set_input_string(document, "%{sup {} #{x + y} {} scanner?}");
ts_document_parse(document);
assert_root_node("(expression (string (expression (sum (expression (identifier)) (expression (identifier))))))");
});
it("allows external scanners to refer to tokens that are defined internally", [&]() {
string grammar = R"JSON({
"name": "shared_external_tokens",
"externals": [
"string",
"line_break"
],
"extras": [
{"type": "PATTERN", "value": "\\s"}
],
"rules": {
"statement": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "_expression"},
{"type": "SYMBOL", "name": "_expression"},
{"type": "SYMBOL", "name": "line_break"}
]
},
"_expression": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "string"},
{"type": "SYMBOL", "name": "variable"},
{"type": "SYMBOL", "name": "number"}
]
},
"variable": {"type": "PATTERN", "value": "\\a+"},
"number": {"type": "PATTERN", "value": "\\d+"},
"line_break": {"type": "STRING", "value": "\n"}
}
})JSON";
TSCompileResult result = ts_compile_grammar(grammar.c_str());
AssertThat(result.error_message, IsNull());
ts_document_set_language(document, load_compile_result(
"shared_external_tokens",
result,
"spec/fixtures/external_scanners/shared_external_tokens.c"
));
ts_document_set_input_string(document, "a b\n");
ts_document_parse(document);
assert_root_node("(statement (variable) (variable) (line_break))");
ts_document_set_input_string(document, "a \nb\n");
ts_document_parse(document);
assert_root_node("(statement (variable) (variable) (line_break))");
ts_document_set_input_string(document, "'hello' 'world'\n");
ts_document_parse(document);
assert_root_node("(statement (string) (string) (line_break))");
ts_document_set_input_string(document, "'hello' \n'world'\n");
ts_document_parse(document);
assert_root_node("(statement (string) (string) (line_break))");
});
it("allows external tokens to be used as extras", [&]() {
string grammar = R"JSON({
"name": "extra_external_tokens",
"externals": [
"comment"
],
"extras": [
{"type": "PATTERN", "value": "\\s"},
{"type": "SYMBOL", "name": "comment"}
],
"rules": {
"assignment": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "variable"},
{"type": "STRING", "value": "="},
{"type": "SYMBOL", "name": "variable"}
]
},
"variable": {"type": "PATTERN", "value": "\\a+"}
}
})JSON";
TSCompileResult result = ts_compile_grammar(grammar.c_str());
AssertThat(result.error_message, IsNull());
ts_document_set_language(document, load_compile_result(
"extra_external_tokens",
result,
"spec/fixtures/external_scanners/extra_external_tokens.c"
));
ts_document_set_input_string(document, "x = # a comment\n y");
ts_document_parse(document);
assert_root_node("(assignment (variable) (comment) (variable))");
});
});
describe("when the grammar's start symbol is a token", [&]() {
it("parses the token", [&]() {
TSCompileResult result = ts_compile_grammar(R"JSON(


@ -84,6 +84,7 @@ describe("The Corpus", []() {
"json",
"c",
"cpp",
"python",
});
for (auto &language_name : test_languages) {


@ -5,6 +5,7 @@
#include "helpers/tree_helpers.h"
#include "helpers/point_helpers.h"
#include "helpers/spy_logger.h"
#include "helpers/stderr_logger.h"
#include "helpers/spy_input.h"
#include "helpers/load_language.h"
@ -15,22 +16,22 @@ TSPoint point(size_t row, size_t column) {
START_TEST
describe("Document", [&]() {
TSDocument *doc;
TSDocument *document;
TSNode root;
before_each([&]() {
record_alloc::start();
doc = ts_document_new();
document = ts_document_new();
});
after_each([&]() {
ts_document_free(doc);
ts_document_free(document);
record_alloc::stop();
AssertThat(record_alloc::outstanding_allocation_indices(), IsEmpty());
});
auto assert_node_string_equals = [&](TSNode node, const string &expected) {
char *str = ts_node_string(node, doc);
char *str = ts_node_string(node, document);
string actual(str);
ts_free(str);
AssertThat(actual, Equals(expected));
@ -42,11 +43,11 @@ describe("Document", [&]() {
before_each([&]() {
spy_input = new SpyInput("{\"key\": [null, 2]}", 3);
ts_document_set_language(doc, get_test_language("json"));
ts_document_set_input_string(doc, "{\"key\": [1, 2]}");
ts_document_parse(doc);
ts_document_set_language(document, get_test_language("json"));
ts_document_set_input_string(document, "{\"key\": [1, 2]}");
ts_document_parse(document);
root = ts_document_root_node(doc);
root = ts_document_root_node(document);
assert_node_string_equals(
root,
"(object (pair (string) (array (number) (number))))");
@ -61,11 +62,11 @@ describe("Document", [&]() {
spy_input->content = string((const char *)content, sizeof(content));
spy_input->encoding = TSInputEncodingUTF16;
ts_document_set_input(doc, spy_input->input());
ts_document_invalidate(doc);
ts_document_parse(doc);
ts_document_set_input(document, spy_input->input());
ts_document_invalidate(document);
ts_document_parse(document);
root = ts_document_root_node(doc);
root = ts_document_root_node(document);
assert_node_string_equals(
root,
"(array (true) (false))");
@ -77,27 +78,27 @@ describe("Document", [&]() {
spy_input->encoding = TSInputEncodingUTF16;
// spy_input->measure_columns_in_bytes
ts_document_set_input(doc, spy_input->input());
ts_document_invalidate(doc);
ts_document_parse(doc);
ts_document_set_input(document, spy_input->input());
ts_document_invalidate(document);
ts_document_parse(document);
});
it("allows the input to be retrieved later", [&]() {
ts_document_set_input(doc, spy_input->input());
AssertThat(ts_document_input(doc).payload, Equals<void *>(spy_input));
AssertThat(ts_document_input(doc).read, Equals(spy_input->input().read));
AssertThat(ts_document_input(doc).seek, Equals(spy_input->input().seek));
ts_document_set_input(document, spy_input->input());
AssertThat(ts_document_input(document).payload, Equals<void *>(spy_input));
AssertThat(ts_document_input(document).read, Equals(spy_input->input().read));
AssertThat(ts_document_input(document).seek, Equals(spy_input->input().seek));
});
it("does not assume that the document's text has changed", [&]() {
ts_document_set_input(doc, spy_input->input());
AssertThat(ts_document_root_node(doc), Equals<TSNode>(root));
ts_document_set_input(document, spy_input->input());
AssertThat(ts_document_root_node(document), Equals<TSNode>(root));
AssertThat(ts_node_has_changes(root), IsFalse());
AssertThat(spy_input->strings_read, Equals(vector<string>({ "" })));
});
it("reads text from the new input for future parses", [&]() {
ts_document_set_input(doc, spy_input->input());
ts_document_set_input(document, spy_input->input());
// Insert 'null', delete '1'.
TSInputEdit edit = {};
@ -105,28 +106,28 @@ describe("Document", [&]() {
edit.extent_added.column = edit.bytes_added = 4;
edit.extent_removed.column = edit.bytes_removed = 1;
ts_document_edit(doc, edit);
ts_document_parse(doc);
ts_document_edit(document, edit);
ts_document_parse(document);
TSNode new_root = ts_document_root_node(doc);
TSNode new_root = ts_document_root_node(document);
assert_node_string_equals(
new_root,
"(object (pair (string) (array (null) (number))))");
AssertThat(spy_input->strings_read, Equals(vector<string>({" [null, 2"})));
AssertThat(spy_input->strings_read, Equals(vector<string>({" [null, 2" })));
});
it("reads from the new input correctly when the old input was blank", [&]() {
ts_document_set_input_string(doc, "");
ts_document_parse(doc);
TSNode new_root = ts_document_root_node(doc);
ts_document_set_input_string(document, "");
ts_document_parse(document);
TSNode new_root = ts_document_root_node(document);
AssertThat(ts_node_end_char(new_root), Equals<size_t>(0));
assert_node_string_equals(
new_root,
"(ERROR)");
ts_document_set_input_string(doc, "1");
ts_document_parse(doc);
new_root = ts_document_root_node(doc);
ts_document_set_input_string(document, "1");
ts_document_parse(document);
new_root = ts_document_root_node(document);
AssertThat(ts_node_end_char(new_root), Equals<size_t>(1));
assert_node_string_equals(
new_root,
@ -136,33 +137,44 @@ describe("Document", [&]() {
describe("set_language(language)", [&]() {
before_each([&]() {
ts_document_set_input_string(doc, "{\"key\": [1, 2]}\n");
ts_document_set_input_string(document, "{\"key\": [1, 2]}\n");
});
it("uses the given language for future parses", [&]() {
ts_document_set_language(doc, get_test_language("json"));
ts_document_parse(doc);
ts_document_set_language(document, get_test_language("json"));
ts_document_parse(document);
root = ts_document_root_node(doc);
root = ts_document_root_node(document);
assert_node_string_equals(
root,
"(object (pair (string) (array (number) (number))))");
});
it("clears out any previous tree", [&]() {
ts_document_set_language(doc, get_test_language("json"));
ts_document_parse(doc);
ts_document_set_language(document, get_test_language("json"));
ts_document_parse(document);
ts_document_set_language(doc, get_test_language("javascript"));
AssertThat(ts_document_root_node(doc).data, Equals<void *>(nullptr));
ts_document_set_language(document, get_test_language("javascript"));
AssertThat(ts_document_root_node(document).data, Equals<void *>(nullptr));
ts_document_parse(doc);
root = ts_document_root_node(doc);
ts_document_parse(document);
root = ts_document_root_node(document);
assert_node_string_equals(
root,
"(program (expression_statement "
"(object (pair (string) (array (number) (number))))))");
});
it("does not allow setting a language with a different version number", [&]() {
TSLanguage language = *get_test_language("json");
AssertThat(ts_language_version(&language), Equals<uint32_t>(TREE_SITTER_LANGUAGE_VERSION));
language.version++;
AssertThat(ts_language_version(&language), !Equals<uint32_t>(TREE_SITTER_LANGUAGE_VERSION));
ts_document_set_language(document, &language);
AssertThat(ts_document_language(document), IsNull());
});
});
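An illustrative sketch, not code from this commit: a caller-side guard matching the behavior the version test above exercises, using only functions the test itself calls (`ts_language_version`, `ts_document_set_language`, `ts_document_language`).

// Hedged sketch: reject a language whose version field does not match the
// runtime's TREE_SITTER_LANGUAGE_VERSION, as the test above expects.
bool try_set_language(TSDocument *document, const TSLanguage *language) {
  if (ts_language_version(language) != TREE_SITTER_LANGUAGE_VERSION)
    return false;  // mismatched version; the runtime would ignore it anyway
  ts_document_set_language(document, language);
  return ts_document_language(document) != NULL;
}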
describe("set_logger(TSLogger)", [&]() {
@ -170,45 +182,39 @@ describe("Document", [&]() {
before_each([&]() {
logger = new SpyLogger();
ts_document_set_language(doc, get_test_language("json"));
ts_document_set_input_string(doc, "[1, 2]");
ts_document_set_language(document, get_test_language("json"));
ts_document_set_input_string(document, "[1, 2]");
});
after_each([&]() {
delete logger;
});
it("calls the debugger with a message for each lex action", [&]() {
ts_document_set_logger(doc, logger->logger());
ts_document_parse(doc);
AssertThat(logger->messages, Contains("lookahead char:'1'"));
AssertThat(logger->messages, Contains("lookahead char:'['"));
});
it("calls the debugger with a message for each parse action", [&]() {
ts_document_set_logger(doc, logger->logger());
ts_document_parse(doc);
ts_document_set_logger(document, logger->logger());
ts_document_parse(document);
AssertThat(logger->messages, Contains("new_parse"));
AssertThat(logger->messages, Contains("lookahead char:'['"));
AssertThat(logger->messages, Contains("skip character:' '"));
AssertThat(logger->messages, Contains("consume character:'['"));
AssertThat(logger->messages, Contains("consume character:'1'"));
AssertThat(logger->messages, Contains("reduce sym:array, child_count:4"));
AssertThat(logger->messages, Contains("accept"));
});
it("allows the debugger to be retrieved later", [&]() {
ts_document_set_logger(doc, logger->logger());
AssertThat(ts_document_logger(doc).payload, Equals(logger));
ts_document_set_logger(document, logger->logger());
AssertThat(ts_document_logger(document).payload, Equals(logger));
});
describe("disabling debugging", [&]() {
before_each([&]() {
ts_document_set_logger(doc, logger->logger());
ts_document_set_logger(doc, {NULL, NULL});
ts_document_set_logger(document, logger->logger());
ts_document_set_logger(document, {NULL, NULL});
});
it("does not call the debugger any more", [&]() {
ts_document_parse(doc);
ts_document_parse(document);
AssertThat(logger->messages, IsEmpty());
});
});
@ -218,12 +224,12 @@ describe("Document", [&]() {
SpyInput *input;
before_each([&]() {
ts_document_set_language(doc, get_test_language("javascript"));
ts_document_set_language(document, get_test_language("javascript"));
input = new SpyInput("{a: null};", 3);
ts_document_set_input(doc, input->input());
ts_document_parse(doc);
ts_document_set_input(document, input->input());
ts_document_parse(document);
assert_node_string_equals(
ts_document_root_node(doc),
ts_document_root_node(document),
"(program (expression_statement (object (pair (identifier) (null)))))");
});
@ -231,26 +237,25 @@ describe("Document", [&]() {
delete input;
});
auto get_ranges = [&](std::function<TSInputEdit()> callback) -> vector<TSRange> {
auto get_invalidated_ranges_for_edit = [&](std::function<TSInputEdit()> callback) -> vector<TSRange> {
TSInputEdit edit = callback();
ts_document_edit(doc, edit);
ts_document_edit(document, edit);
TSRange *ranges;
uint32_t range_count = 0;
ts_document_parse_and_get_changed_ranges(doc, &ranges, &range_count);
ts_document_parse_and_get_changed_ranges(document, &ranges, &range_count);
vector<TSRange> result;
for (size_t i = 0; i < range_count; i++)
for (size_t i = 0; i < range_count; i++) {
result.push_back(ranges[i]);
}
ts_free(ranges);
return result;
};
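The lambda above wraps the whole edit, reparse, and diff cycle. Written out long-hand, the flow looks roughly like this (an illustrative sketch using the same API calls; the edit values are hypothetical):

// Hedged sketch of the edit -> reparse -> changed-ranges cycle.
TSInputEdit edit = {};
edit.extent_added.column = edit.bytes_added = 4;      // e.g. text grew by 4 bytes
edit.extent_removed.column = edit.bytes_removed = 1;  // and lost 1 byte
// (the edit's start-position fields, elided by the hunk above, are set the same way)
ts_document_edit(document, edit);

TSRange *ranges;
uint32_t range_count = 0;
ts_document_parse_and_get_changed_ranges(document, &ranges, &range_count);
for (uint32_t i = 0; i < range_count; i++) {
  // ranges[i] spans a region of the document whose syntax changed
}
ts_free(ranges);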
it("reports changes when one token has been updated", [&]() {
// Replace `null` with `nothing`
auto ranges = get_ranges([&]() {
auto ranges = get_invalidated_ranges_for_edit([&]() {
return input->replace(input->content.find("ull"), 1, "othing");
});
@ -262,7 +267,7 @@ describe("Document", [&]() {
})));
// Replace `nothing` with `null` again
ranges = get_ranges([&]() {
ranges = get_invalidated_ranges_for_edit([&]() {
return input->undo();
});
@ -276,7 +281,7 @@ describe("Document", [&]() {
it("reports changes when tokens have been appended", [&]() {
// Add a second key-value pair
auto ranges = get_ranges([&]() {
auto ranges = get_invalidated_ranges_for_edit([&]() {
return input->replace(input->content.find("}"), 0, ", b: false");
});
@ -288,12 +293,12 @@ describe("Document", [&]() {
})));
// Add a third key-value pair in between the first two
ranges = get_ranges([&]() {
ranges = get_invalidated_ranges_for_edit([&]() {
return input->replace(input->content.find(", b"), 0, ", c: 1");
});
assert_node_string_equals(
ts_document_root_node(doc),
ts_document_root_node(document),
"(program (expression_statement (object "
"(pair (identifier) (null)) "
"(pair (identifier) (number)) "
@ -307,41 +312,39 @@ describe("Document", [&]() {
})));
// Delete the middle pair.
ranges = get_ranges([&]() {
ranges = get_invalidated_ranges_for_edit([&]() {
return input->undo();
});
assert_node_string_equals(
ts_document_root_node(doc),
ts_document_root_node(document),
"(program (expression_statement (object "
"(pair (identifier) (null)) "
"(pair (identifier) (false)))))");
AssertThat(ranges, Equals(vector<TSRange>({
})));
AssertThat(ranges, IsEmpty());
// Delete the second pair.
ranges = get_ranges([&]() {
ranges = get_invalidated_ranges_for_edit([&]() {
return input->undo();
});
assert_node_string_equals(
ts_document_root_node(doc),
ts_document_root_node(document),
"(program (expression_statement (object "
"(pair (identifier) (null)))))");
AssertThat(ranges, Equals(vector<TSRange>({
})));
AssertThat(ranges, IsEmpty());
});
it("reports changes when trees have been wrapped", [&]() {
// Wrap the object in an assignment expression.
auto ranges = get_ranges([&]() {
auto ranges = get_invalidated_ranges_for_edit([&]() {
return input->replace(input->content.find("null"), 0, "b === ");
});
assert_node_string_equals(
ts_document_root_node(doc),
ts_document_root_node(document),
"(program (expression_statement (object "
"(pair (identifier) (rel_op (identifier) (null))))))");

View file

@ -4,11 +4,13 @@
#include "helpers/spy_input.h"
#include "helpers/load_language.h"
#include "helpers/record_alloc.h"
#include "helpers/stderr_logger.h"
#include "helpers/dedent.h"
START_TEST
describe("Parser", [&]() {
TSDocument *doc;
TSDocument *document;
SpyInput *input;
TSNode root;
size_t chunk_size;
@ -18,90 +20,76 @@ describe("Parser", [&]() {
chunk_size = 3;
input = nullptr;
doc = ts_document_new();
document = ts_document_new();
});
after_each([&]() {
if (doc)
ts_document_free(doc);
if (input)
delete input;
if (document) ts_document_free(document);
if (input) delete input;
record_alloc::stop();
AssertThat(record_alloc::outstanding_allocation_indices(), IsEmpty());
});
auto set_text = [&](const char *text) {
auto set_text = [&](string text) {
input = new SpyInput(text, chunk_size);
ts_document_set_input(doc, input->input());
ts_document_parse(doc);
ts_document_set_input(document, input->input());
ts_document_parse(document);
root = ts_document_root_node(doc);
AssertThat(ts_node_end_byte(root), Equals(strlen(text)));
root = ts_document_root_node(document);
AssertThat(ts_node_end_byte(root), Equals(text.size()));
input->clear();
};
auto insert_text = [&](size_t position, string text) {
size_t prev_size = ts_node_end_byte(root);
ts_document_edit(doc, input->replace(position, 0, text));
ts_document_parse(doc);
root = ts_document_root_node(doc);
size_t new_size = ts_node_end_byte(root);
AssertThat(new_size, Equals(prev_size + text.size()));
};
auto delete_text = [&](size_t position, size_t length) {
size_t prev_size = ts_node_end_byte(root);
ts_document_edit(doc, input->replace(position, length, ""));
ts_document_parse(doc);
root = ts_document_root_node(doc);
size_t new_size = ts_node_end_byte(root);
AssertThat(new_size, Equals(prev_size - length));
};
auto replace_text = [&](size_t position, size_t length, string new_text) {
size_t prev_size = ts_node_end_byte(root);
ts_document_edit(doc, input->replace(position, length, new_text));
ts_document_parse(doc);
ts_document_edit(document, input->replace(position, length, new_text));
ts_document_parse(document);
root = ts_document_root_node(doc);
root = ts_document_root_node(document);
size_t new_size = ts_node_end_byte(root);
AssertThat(new_size, Equals(prev_size - length + new_text.size()));
};
auto insert_text = [&](size_t position, string text) {
replace_text(position, 0, text);
};
auto delete_text = [&](size_t position, size_t length) {
replace_text(position, length, "");
};
auto undo = [&]() {
ts_document_edit(document, input->undo());
ts_document_parse(document);
};
auto assert_root_node = [&](const string &expected) {
TSNode node = ts_document_root_node(doc);
char *str = ts_node_string(node, doc);
string actual(str);
ts_free(str);
TSNode node = ts_document_root_node(document);
char *node_string = ts_node_string(node, document);
string actual(node_string);
ts_free(node_string);
AssertThat(actual, Equals(expected));
};
auto get_node_text = [&](TSNode node) {
size_t start = ts_node_start_byte(node);
size_t end = ts_node_end_byte(node);
return input->content.substr(start, end - start);
};
describe("handling errors", [&]() {
before_each([&]() {
ts_document_set_language(doc, get_test_language("json"));
});
auto get_node_text = [&](TSNode node) {
size_t start = ts_node_start_byte(node);
size_t end = ts_node_end_byte(node);
return input->content.substr(start, end - start);
};
describe("when there is an invalid substring right before a valid token", [&]() {
it("computes the error node's size and position correctly", [&]() {
ts_document_set_language(document, get_test_language("json"));
set_text(" [123, @@@@@, true]");
assert_root_node(
"(array (number) (ERROR (UNEXPECTED '@')) (true))");
TSNode error = ts_node_named_child(root, 1);
AssertThat(ts_node_type(error, doc), Equals("ERROR"));
AssertThat(ts_node_type(error, document), Equals("ERROR"));
AssertThat(get_node_text(error), Equals(", @@@@@"));
AssertThat(ts_node_child_count(error), Equals<size_t>(2));
@ -112,56 +100,59 @@ describe("Parser", [&]() {
AssertThat(get_node_text(garbage), Equals("@@@@@"));
TSNode node_after_error = ts_node_named_child(root, 2);
AssertThat(ts_node_type(node_after_error, doc), Equals("true"));
AssertThat(ts_node_type(node_after_error, document), Equals("true"));
AssertThat(get_node_text(node_after_error), Equals("true"));
});
});
describe("when there is an unexpected string in the middle of a token", [&]() {
it("computes the error node's size and position correctly", [&]() {
ts_document_set_language(document, get_test_language("json"));
set_text(" [123, faaaaalse, true]");
assert_root_node(
"(array (number) (ERROR (UNEXPECTED 'a')) (true))");
TSNode error = ts_node_named_child(root, 1);
AssertThat(ts_node_type(error, doc), Equals("ERROR"));
AssertThat(ts_node_type(error, document), Equals("ERROR"));
AssertThat(ts_node_child_count(error), Equals<size_t>(2));
TSNode comma = ts_node_child(error, 0);
AssertThat(ts_node_type(comma, doc), Equals(","));
AssertThat(ts_node_type(comma, document), Equals(","));
AssertThat(get_node_text(comma), Equals(","));
TSNode garbage = ts_node_child(error, 1);
AssertThat(ts_node_type(garbage, doc), Equals("ERROR"));
AssertThat(ts_node_type(garbage, document), Equals("ERROR"));
AssertThat(get_node_text(garbage), Equals("faaaaalse"));
TSNode last = ts_node_named_child(root, 2);
AssertThat(ts_node_type(last, doc), Equals("true"));
AssertThat(ts_node_type(last, document), Equals("true"));
AssertThat(ts_node_start_byte(last), Equals(strlen(" [123, faaaaalse, ")));
});
});
describe("when there is one unexpected token between two valid tokens", [&]() {
it("computes the error node's size and position correctly", [&]() {
ts_document_set_language(document, get_test_language("json"));
set_text(" [123, true false, true]");
assert_root_node(
"(array (number) (true) (ERROR (false)) (true))");
TSNode error = ts_node_named_child(root, 2);
AssertThat(ts_node_type(error, doc), Equals("ERROR"));
AssertThat(ts_node_type(error, document), Equals("ERROR"));
AssertThat(get_node_text(error), Equals("false"));
AssertThat(ts_node_child_count(error), Equals<size_t>(1));
TSNode last = ts_node_named_child(root, 1);
AssertThat(ts_node_type(last, doc), Equals("true"));
AssertThat(ts_node_type(last, document), Equals("true"));
AssertThat(get_node_text(last), Equals("true"));
});
});
describe("when there is an unexpected string at the end of a token", [&]() {
it("computes the error's size and position correctly", [&]() {
ts_document_set_language(document, get_test_language("json"));
set_text(" [123, \"hi\n, true]");
assert_root_node(
@ -171,7 +162,7 @@ describe("Parser", [&]() {
describe("when there is an unterminated error", [&]() {
it("maintains a consistent tree", [&]() {
ts_document_set_language(doc, get_test_language("javascript"));
ts_document_set_language(document, get_test_language("javascript"));
set_text("a; /* b");
assert_root_node(
"(ERROR (program (expression_statement (identifier))) (UNEXPECTED EOF))");
@ -180,14 +171,9 @@ describe("Parser", [&]() {
});
describe("handling extra tokens", [&]() {
// In the javascript example grammar, ASI works by using newlines as
// terminators in statements, but also as extra tokens.
before_each([&]() {
ts_document_set_language(doc, get_test_language("javascript"));
});
describe("when the token appears as part of a grammar rule", [&]() {
it("is incorporated into the tree", [&]() {
it("incorporates it into the tree", [&]() {
ts_document_set_language(document, get_test_language("javascript"));
set_text("fn()\n");
assert_root_node(
@ -196,7 +182,8 @@ describe("Parser", [&]() {
});
describe("when the token appears somewhere else", [&]() {
it("is incorporated into the tree", [&]() {
it("incorporates it into the tree", [&]() {
ts_document_set_language(document, get_test_language("javascript"));
set_text(
"fn()\n"
" .otherFn();");
@ -211,7 +198,8 @@ describe("Parser", [&]() {
});
describe("when several extra tokens appear in a row", [&]() {
it("is incorporated into the tree", [&]() {
it("incorporates them into the tree", [&]() {
ts_document_set_language(document, get_test_language("javascript"));
set_text(
"fn()\n\n"
"// This is a comment"
@ -230,199 +218,219 @@ describe("Parser", [&]() {
});
describe("editing", [&]() {
before_each([&]() {
ts_document_set_language(doc, get_test_language("javascript"));
describe("creating new tokens near the end of the input", [&]() {
it("updates the parse tree and re-reads only the changed portion of the text", [&]() {
ts_document_set_language(document, get_test_language("javascript"));
set_text("x * (100 + abc);");
assert_root_node(
"(program (expression_statement (math_op "
"(identifier) "
"(math_op (number) (identifier)))))");
insert_text(strlen("x * (100 + abc"), ".d");
assert_root_node(
"(program (expression_statement (math_op "
"(identifier) "
"(math_op (number) (member_access (identifier) (identifier))))))");
AssertThat(input->strings_read, Equals(vector<string>({ " + abc.d)" })));
});
});
describe("inserting text", [&]() {
describe("creating new tokens near the end of the input", [&]() {
it("updates the parse tree and re-reads only the changed portion of the text", [&]() {
set_text("x * (100 + abc);");
describe("creating new tokens near the beginning of the input", [&]() {
it("updates the parse tree and re-reads only the changed portion of the input", [&]() {
chunk_size = 2;
assert_root_node(
"(program (expression_statement (math_op "
"(identifier) "
"(math_op (number) (identifier)))))");
ts_document_set_language(document, get_test_language("javascript"));
set_text("123 + 456 * (10 + x);");
insert_text(strlen("x * (100 + abc"), ".d");
assert_root_node(
"(program (expression_statement (math_op "
"(number) "
"(math_op (number) (math_op (number) (identifier))))))");
assert_root_node(
"(program (expression_statement (math_op "
"(identifier) "
"(math_op (number) (member_access (identifier) (identifier))))))");
insert_text(strlen("123"), " || 5");
AssertThat(input->strings_read, Equals(vector<string>({ " + abc.d)" })));
});
});
describe("creating new tokens near the beginning of the input", [&]() {
it("updates the parse tree and re-reads only the changed portion of the input", [&]() {
chunk_size = 2;
set_text("123 + 456 * (10 + x);");
assert_root_node(
"(program (expression_statement (math_op "
assert_root_node(
"(program (expression_statement (bool_op "
"(number) "
"(math_op "
"(number) "
"(math_op (number) (math_op (number) (identifier))))))");
"(math_op (number) (math_op (number) (identifier)))))))");
insert_text(strlen("123"), " || 5");
assert_root_node(
"(program (expression_statement (bool_op "
"(number) "
"(math_op "
"(number) "
"(math_op (number) (math_op (number) (identifier)))))))");
AssertThat(input->strings_read, Equals(vector<string>({ "123 || 5 +" })));
});
AssertThat(input->strings_read, Equals(vector<string>({ "123 || 5 +" })));
});
});
describe("introducing an error", [&]() {
it("gives the error the right size", [&]() {
ts_document_set_language(doc, get_test_language("javascript"));
describe("introducing an error", [&]() {
it("gives the error the right size", [&]() {
ts_document_set_language(document, get_test_language("javascript"));
set_text("var x = y;");
set_text("var x = y;");
assert_root_node(
"(program (var_declaration (var_assignment "
"(identifier) (identifier))))");
assert_root_node(
"(program (var_declaration (var_assignment "
"(identifier) (identifier))))");
insert_text(strlen("var x = y"), " *");
insert_text(strlen("var x = y"), " *");
assert_root_node(
"(program (var_declaration (var_assignment "
"(identifier) (identifier)) (ERROR)))");
assert_root_node(
"(program (var_declaration (var_assignment "
"(identifier) (identifier)) (ERROR)))");
insert_text(strlen("var x = y *"), " z");
insert_text(strlen("var x = y *"), " z");
assert_root_node(
"(program (var_declaration (var_assignment "
"(identifier) (math_op (identifier) (identifier)))))");
});
assert_root_node(
"(program (var_declaration (var_assignment "
"(identifier) (math_op (identifier) (identifier)))))");
});
});
describe("into the middle of an existing token", [&]() {
it("updates the parse tree", [&]() {
set_text("abc * 123;");
describe("into the middle of an existing token", [&]() {
it("updates the parse tree", [&]() {
ts_document_set_language(document, get_test_language("javascript"));
set_text("abc * 123;");
assert_root_node(
"(program (expression_statement (math_op (identifier) (number))))");
assert_root_node(
"(program (expression_statement (math_op (identifier) (number))))");
insert_text(strlen("ab"), "XYZ");
insert_text(strlen("ab"), "XYZ");
assert_root_node(
"(program (expression_statement (math_op (identifier) (number))))");
assert_root_node(
"(program (expression_statement (math_op (identifier) (number))))");
TSNode node = ts_node_named_descendant_for_char_range(root, 1, 1);
AssertThat(ts_node_type(node, doc), Equals("identifier"));
AssertThat(ts_node_end_byte(node), Equals(strlen("abXYZc")));
});
TSNode node = ts_node_named_descendant_for_char_range(root, 1, 1);
AssertThat(ts_node_type(node, document), Equals("identifier"));
AssertThat(ts_node_end_byte(node), Equals(strlen("abXYZc")));
});
});
describe("at the end of an existing token", [&]() {
it("updates the parse tree", [&]() {
set_text("abc * 123;");
describe("at the end of an existing token", [&]() {
it("updates the parse tree", [&]() {
ts_document_set_language(document, get_test_language("javascript"));
set_text("abc * 123;");
assert_root_node(
"(program (expression_statement (math_op (identifier) (number))))");
assert_root_node(
"(program (expression_statement (math_op (identifier) (number))))");
insert_text(strlen("abc"), "XYZ");
insert_text(strlen("abc"), "XYZ");
assert_root_node(
"(program (expression_statement (math_op (identifier) (number))))");
assert_root_node(
"(program (expression_statement (math_op (identifier) (number))))");
TSNode node = ts_node_named_descendant_for_char_range(root, 1, 1);
AssertThat(ts_node_type(node, doc), Equals("identifier"));
AssertThat(ts_node_end_byte(node), Equals(strlen("abcXYZ")));
});
TSNode node = ts_node_named_descendant_for_char_range(root, 1, 1);
AssertThat(ts_node_type(node, document), Equals("identifier"));
AssertThat(ts_node_end_byte(node), Equals(strlen("abcXYZ")));
});
});
describe("into a node containing a extra token", [&]() {
it("updates the parse tree", [&]() {
set_text("123 *\n"
describe("inserting text into a node containing a extra token", [&]() {
it("updates the parse tree", [&]() {
ts_document_set_language(document, get_test_language("javascript"));
set_text("123 *\n"
"// a-comment\n"
"abc;");
assert_root_node(
"(program (expression_statement (math_op "
"(number) "
"(comment) "
"(identifier))))");
insert_text(
strlen("123 *\n"
"// a-comment\n"
"abc;");
"abc"),
"XYZ");
assert_root_node(
"(program (expression_statement (math_op "
"(number) "
"(comment) "
"(identifier))))");
insert_text(
strlen("123 *\n"
"// a-comment\n"
"abc"),
"XYZ");
assert_root_node(
"(program (expression_statement (math_op "
"(number) "
"(comment) "
"(identifier))))");
});
assert_root_node(
"(program (expression_statement (math_op "
"(number) "
"(comment) "
"(identifier))))");
});
});
describe("deleting text", [&]() {
describe("when a critical token is removed", [&]() {
it("updates the parse tree, creating an error", [&]() {
set_text("123 * 456; 789 * 123;");
describe("when a critical token is removed", [&]() {
it("updates the parse tree, creating an error", [&]() {
ts_document_set_language(document, get_test_language("javascript"));
set_text("123 * 456; 789 * 123;");
assert_root_node(
"(program "
"(expression_statement (math_op (number) (number))) "
"(expression_statement (math_op (number) (number))))");
assert_root_node(
"(program "
"(expression_statement (math_op (number) (number))) "
"(expression_statement (math_op (number) (number))))");
delete_text(strlen("123 "), 2);
delete_text(strlen("123 "), 2);
assert_root_node(
"(program "
"(expression_statement (number) (ERROR (number))) "
"(expression_statement (math_op (number) (number))))");
});
assert_root_node(
"(program "
"(expression_statement (number) (ERROR (number))) "
"(expression_statement (math_op (number) (number))))");
});
});
describe("replacing text", [&]() {
it("does not try to re-use nodes that are within the edited region", [&]() {
ts_document_set_language(doc, get_test_language("javascript"));
describe("with external tokens", [&]() {
it("maintains the external scanner's state during incremental parsing", [&]() {
ts_document_set_language(document, get_test_language("python"));
string text = dedent(R"PYTHON(
if a:
print b
return c
)PYTHON");
set_text("{ x: (b.c) };");
set_text(text);
assert_root_node("(module "
"(if_statement (identifier) "
"(print_statement (identifier))) "
"(return_statement (expression_list (identifier))))");
assert_root_node(
"(program (expression_statement (object (pair "
"(identifier) (member_access (identifier) (identifier))))))");
replace_text(text.find("return"), 0, " ");
assert_root_node("(module "
"(if_statement (identifier) "
"(print_statement (identifier)) "
"(return_statement (expression_list (identifier)))))");
replace_text(strlen("{ x: "), strlen("(b.c)"), "b.c");
assert_root_node(
"(program (expression_statement (object (pair "
"(identifier) (member_access (identifier) (identifier))))))");
undo();
assert_root_node("(module "
"(if_statement (identifier) "
"(print_statement (identifier))) "
"(return_statement (expression_list (identifier))))");
});
});
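The Python test above depends on the external scanner's state surviving incremental reparses: the runtime snapshots the scanner into a fixed-size `TSExternalTokenState` and restores it when resuming. A minimal sketch of such a round-trip, assuming a hypothetical indentation-tracking scanner (the `IndentScanner` type and function names are illustrative, not from this commit):

#include <stdint.h>
#include <string.h>

// Hypothetical scanner state: a stack of indentation widths.
typedef struct {
  uint8_t indent_stack[15];
  uint8_t depth;
} IndentScanner;

// Snapshot the stack into the fixed-size 16-byte state buffer.
static bool indent_scanner_serialize(void *payload, TSExternalTokenState state) {
  IndentScanner *self = (IndentScanner *)payload;
  memcpy(state, self->indent_stack, self->depth);  // assumes depth <= 15
  state[15] = self->depth;
  return true;
}

// Restore the stack from a previously serialized state.
static void indent_scanner_deserialize(void *payload, const TSExternalTokenState state) {
  IndentScanner *self = (IndentScanner *)payload;
  self->depth = state[15];
  memcpy(self->indent_stack, state, self->depth);
}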
it("does not try to re-use nodes that are within the edited region", [&]() {
ts_document_set_language(document, get_test_language("javascript"));
set_text("{ x: (b.c) };");
assert_root_node(
"(program (expression_statement (object (pair "
"(identifier) (member_access (identifier) (identifier))))))");
replace_text(strlen("{ x: "), strlen("(b.c)"), "b.c");
assert_root_node(
"(program (expression_statement (object (pair "
"(identifier) (member_access (identifier) (identifier))))))");
});
it("updates the document's parse count", [&]() {
ts_document_set_language(doc, get_test_language("javascript"));
AssertThat(ts_document_parse_count(doc), Equals<size_t>(0));
ts_document_set_language(document, get_test_language("javascript"));
AssertThat(ts_document_parse_count(document), Equals<size_t>(0));
set_text("{ x: (b.c) };");
AssertThat(ts_document_parse_count(doc), Equals<size_t>(1));
AssertThat(ts_document_parse_count(document), Equals<size_t>(1));
insert_text(strlen("{ x"), "yz");
AssertThat(ts_document_parse_count(doc), Equals<size_t>(2));
AssertThat(ts_document_parse_count(document), Equals<size_t>(2));
});
});
describe("lexing", [&]() {
before_each([&]() {
ts_document_set_language(doc, get_test_language("javascript"));
});
describe("handling tokens containing wildcard patterns (e.g. comments)", [&]() {
it("terminates them at the end of the document", [&]() {
ts_document_set_language(document, get_test_language("javascript"));
set_text("x; // this is a comment");
assert_root_node(
@ -437,6 +445,7 @@ describe("Parser", [&]() {
it("recognizes UTF8 characters as single characters", [&]() {
// 'ΩΩΩ — ΔΔ';
ts_document_set_language(document, get_test_language("javascript"));
set_text("'\u03A9\u03A9\u03A9 \u2014 \u0394\u0394';");
assert_root_node(

View file

@ -521,6 +521,31 @@ describe("Stack", [&]() {
free_slice_array(&pop.slices);
});
});
describe("setting external token state", [&]() {
TSExternalTokenState external_token_state1, external_token_state2;
it("allows the state to be retrieved", [&]() {
AssertThat(ts_stack_external_token_state(stack, 0), Equals(nullptr));
ts_stack_set_external_token_state(stack, 0, &external_token_state1);
AssertThat(ts_stack_external_token_state(stack, 0), Equals(&external_token_state1));
ts_stack_copy_version(stack, 0);
AssertThat(ts_stack_external_token_state(stack, 0), Equals(&external_token_state1));
});
it("does not merge stack versions with different external token states", [&]() {
ts_stack_copy_version(stack, 0);
ts_stack_push(stack, 0, trees[0], false, 5);
ts_stack_push(stack, 1, trees[0], false, 5);
ts_stack_set_external_token_state(stack, 0, &external_token_state1);
ts_stack_set_external_token_state(stack, 1, &external_token_state2);
AssertThat(ts_stack_merge(stack, 0, 1), IsFalse());
});
});
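An illustrative sketch, not code from this commit, of one plausible merge guard these tests imply: two stack versions may merge only when their external token states agree.

#include <string.h>

// Hedged sketch: versions merge only if their external states match, by
// pointer identity or byte-wise equality of the 16-byte state buffer.
static bool external_states_equal(const TSExternalTokenState *a,
                                  const TSExternalTokenState *b) {
  if (a == b)
    return true;   // same snapshot, or both NULL
  if (!a || !b)
    return false;  // only one version carries external scanner state
  return memcmp(*a, *b, sizeof(TSExternalTokenState)) == 0;
}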
});
END_TEST

View file

@ -22,47 +22,32 @@ void assert_consistent(const Tree *tree) {
START_TEST
enum {
cat = 1,
dog,
eel,
fox,
goat,
hog,
};
describe("Tree", []() {
Tree *tree1, *tree2, *parent1;
enum {
symbol1 = 1,
symbol2,
symbol3,
symbol4,
symbol5,
symbol6,
symbol7,
symbol8,
symbol9,
};
TSSymbolMetadata visible = {true, true, false, true};
TSSymbolMetadata invisible = {false, false, false, true};
before_each([&]() {
tree1 = ts_tree_make_leaf(cat, {2, 1, {0, 1}}, {5, 4, {0, 4}}, visible);
tree2 = ts_tree_make_leaf(cat, {1, 1, {0, 1}}, {3, 3, {0, 3}}, visible);
ts_tree_retain(tree1);
ts_tree_retain(tree2);
parent1 = ts_tree_make_node(dog, 2, tree_array({
tree1,
tree2,
}), visible);
});
after_each([&]() {
ts_tree_release(tree1);
ts_tree_release(tree2);
ts_tree_release(parent1);
});
describe("make_leaf(sym, size, padding, is_hidden)", [&]() {
it("does not record that it is fragile", [&]() {
AssertThat(tree1->fragile_left, IsFalse());
AssertThat(tree1->fragile_right, IsFalse());
describe("make_leaf", [&]() {
it("does not mark the tree as fragile", [&]() {
Tree *tree = ts_tree_make_leaf(symbol1, {2, 1, {0, 1}}, {5, 4, {0, 4}}, visible);
AssertThat(tree->fragile_left, IsFalse());
AssertThat(tree->fragile_right, IsFalse());
});
});
describe("make_error(size, padding, lookahead_char)", [&]() {
it("records that it is fragile", [&]() {
describe("make_error", [&]() {
it("marks the tree as fragile", [&]() {
Tree *error_tree = ts_tree_make_error(
length_zero(),
length_zero(),
@ -75,15 +60,33 @@ describe("Tree", []() {
});
});
describe("make_node(symbol, child_count, children, is_hidden)", [&]() {
it("computes its size based on its child nodes", [&]() {
AssertThat(parent1->size.bytes, Equals<size_t>(
tree1->size.bytes + tree2->padding.bytes + tree2->size.bytes));
AssertThat(parent1->size.chars, Equals<size_t>(
tree1->size.chars + tree2->padding.chars + tree2->size.chars));
describe("make_node", [&]() {
Tree *tree1, *tree2, *parent1;
before_each([&]() {
tree1 = ts_tree_make_leaf(symbol1, {2, 1, {0, 1}}, {5, 4, {0, 4}}, visible);
tree2 = ts_tree_make_leaf(symbol2, {1, 1, {0, 1}}, {3, 3, {0, 3}}, visible);
ts_tree_retain(tree1);
ts_tree_retain(tree2);
parent1 = ts_tree_make_node(symbol3, 2, tree_array({
tree1,
tree2,
}), visible);
});
it("computes its padding based on its first child", [&]() {
after_each([&]() {
ts_tree_release(tree1);
ts_tree_release(tree2);
ts_tree_release(parent1);
});
it("computes its size and padding based on its child nodes", [&]() {
AssertThat(parent1->size.bytes, Equals<size_t>(
tree1->size.bytes + tree2->padding.bytes + tree2->size.bytes));
AssertThat(parent1->size.chars, Equals<size_t>(
tree1->size.chars + tree2->padding.chars + tree2->size.chars));
AssertThat(parent1->padding.bytes, Equals<size_t>(tree1->padding.bytes));
AssertThat(parent1->padding.chars, Equals<size_t>(tree1->padding.chars));
});
@ -97,7 +100,7 @@ describe("Tree", []() {
ts_tree_retain(tree1);
ts_tree_retain(tree2);
parent = ts_tree_make_node(eel, 2, tree_array({
parent = ts_tree_make_node(symbol3, 2, tree_array({
tree1,
tree2,
}), visible);
@ -121,7 +124,7 @@ describe("Tree", []() {
ts_tree_retain(tree1);
ts_tree_retain(tree2);
parent = ts_tree_make_node(eel, 2, tree_array({
parent = ts_tree_make_node(symbol3, 2, tree_array({
tree1,
tree2,
}), visible);
@ -145,7 +148,7 @@ describe("Tree", []() {
ts_tree_retain(tree1);
ts_tree_retain(tree2);
parent = ts_tree_make_node(eel, 2, tree_array({
parent = ts_tree_make_node(symbol3, 2, tree_array({
tree1,
tree2,
}), visible);
@ -162,14 +165,14 @@ describe("Tree", []() {
});
});
describe("edit(InputEdit)", [&]() {
describe("edit", [&]() {
Tree *tree = nullptr;
before_each([&]() {
tree = ts_tree_make_node(cat, 3, tree_array({
ts_tree_make_leaf(dog, {2, 2, {0, 2}}, {3, 3, {0, 3}}, visible),
ts_tree_make_leaf(eel, {2, 2, {0, 2}}, {3, 3, {0, 3}}, visible),
ts_tree_make_leaf(fox, {2, 2, {0, 2}}, {3, 3, {0, 3}}, visible),
tree = ts_tree_make_node(symbol1, 3, tree_array({
ts_tree_make_leaf(symbol2, {2, 2, {0, 2}}, {3, 3, {0, 3}}, visible),
ts_tree_make_leaf(symbol3, {2, 2, {0, 2}}, {3, 3, {0, 3}}, visible),
ts_tree_make_leaf(symbol4, {2, 2, {0, 2}}, {3, 3, {0, 3}}, visible),
}), visible);
AssertThat(tree->padding, Equals<Length>({2, 2, {0, 2}}));
@ -180,7 +183,6 @@ describe("Tree", []() {
ts_tree_release(tree);
});
describe("edits within a tree's padding", [&]() {
it("resizes the padding of the tree and its leftmost descendants", [&]() {
TSInputEdit edit;
@ -312,69 +314,124 @@ describe("Tree", []() {
});
});
describe("equality", [&]() {
describe("eq", [&]() {
Tree *leaf;
before_each([&]() {
leaf = ts_tree_make_leaf(symbol1, {2, 1, {0, 1}}, {5, 4, {0, 4}}, visible);
});
after_each([&]() {
ts_tree_release(leaf);
});
it("returns true for identical trees", [&]() {
Tree *tree1_copy = ts_tree_make_leaf(cat, {2, 1, {1, 1}}, {5, 4, {1, 4}}, visible);
AssertThat(ts_tree_eq(tree1, tree1_copy), IsTrue());
Tree *leaf_copy = ts_tree_make_leaf(symbol1, {2, 1, {1, 1}}, {5, 4, {1, 4}}, visible);
AssertThat(ts_tree_eq(leaf, leaf_copy), IsTrue());
Tree *tree2_copy = ts_tree_make_leaf(cat, {1, 1, {0, 1}}, {3, 3, {0, 3}}, visible);
AssertThat(ts_tree_eq(tree2, tree2_copy), IsTrue());
Tree *parent2 = ts_tree_make_node(dog, 2, tree_array({
tree1_copy,
tree2_copy,
Tree *parent = ts_tree_make_node(symbol2, 2, tree_array({
leaf,
leaf_copy,
}), visible);
ts_tree_retain(leaf);
ts_tree_retain(leaf_copy);
AssertThat(ts_tree_eq(parent1, parent2), IsTrue());
Tree *parent_copy = ts_tree_make_node(symbol2, 2, tree_array({
leaf,
leaf_copy,
}), visible);
ts_tree_retain(leaf);
ts_tree_retain(leaf_copy);
ts_tree_release(parent2);
AssertThat(ts_tree_eq(parent, parent_copy), IsTrue());
ts_tree_release(leaf_copy);
ts_tree_release(parent);
ts_tree_release(parent_copy);
});
it("returns false for trees with different symbols", [&]() {
Tree *different_tree = ts_tree_make_leaf(
tree1->symbol + 1,
tree1->padding,
tree1->size,
Tree *different_leaf = ts_tree_make_leaf(
leaf->symbol + 1,
leaf->padding,
leaf->size,
visible);
AssertThat(ts_tree_eq(tree1, different_tree), IsFalse());
ts_tree_release(different_tree);
AssertThat(ts_tree_eq(leaf, different_leaf), IsFalse());
ts_tree_release(different_leaf);
});
it("returns false for trees with different options", [&]() {
Tree *tree1_copy = ts_tree_make_leaf(cat, tree1->padding, tree1->size, invisible);
AssertThat(ts_tree_eq(tree1, tree1_copy), IsFalse());
ts_tree_release(tree1_copy);
Tree *different_leaf = ts_tree_make_leaf(symbol1, leaf->padding, leaf->size, invisible);
AssertThat(ts_tree_eq(leaf, different_leaf), IsFalse());
ts_tree_release(different_leaf);
});
it("returns false for trees with different sizes", [&]() {
Tree *tree1_copy = ts_tree_make_leaf(cat, {2, 1, {0, 1}}, tree1->size, invisible);
AssertThat(ts_tree_eq(tree1, tree1_copy), IsFalse());
ts_tree_release(tree1_copy);
Tree *different_leaf = ts_tree_make_leaf(symbol1, {2, 1, {0, 1}}, leaf->size, invisible);
AssertThat(ts_tree_eq(leaf, different_leaf), IsFalse());
ts_tree_release(different_leaf);
tree1_copy = ts_tree_make_leaf(cat, tree1->padding, {5, 4, {1, 10}}, invisible);
AssertThat(ts_tree_eq(tree1, tree1_copy), IsFalse());
ts_tree_release(tree1_copy);
different_leaf = ts_tree_make_leaf(symbol1, leaf->padding, {5, 4, {1, 10}}, invisible);
AssertThat(ts_tree_eq(leaf, different_leaf), IsFalse());
ts_tree_release(different_leaf);
});
it("returns false for trees with different children", [&]() {
Tree *different_tree = ts_tree_make_leaf(
tree1->symbol + 1,
tree1->padding,
tree1->size,
visible);
Tree *leaf2 = ts_tree_make_leaf(symbol2, {1, 1, {0, 1}}, {3, 3, {0, 3}}, visible);
ts_tree_retain(different_tree);
ts_tree_retain(tree2);
Tree *different_parent = ts_tree_make_node(dog, 2, tree_array({
different_tree, tree2,
Tree *parent = ts_tree_make_node(symbol2, 2, tree_array({
leaf,
leaf2,
}), visible);
ts_tree_retain(leaf);
ts_tree_retain(leaf2);
Tree *different_parent = ts_tree_make_node(symbol2, 2, tree_array({
leaf2,
leaf,
}), visible);
ts_tree_retain(leaf2);
ts_tree_retain(leaf);
AssertThat(ts_tree_eq(different_parent, parent), IsFalse());
AssertThat(ts_tree_eq(parent, different_parent), IsFalse());
ts_tree_release(leaf2);
ts_tree_release(parent);
ts_tree_release(different_parent);
});
});
describe("last_external_token_state", [&]() {
Length padding = {1, 1, {0, 1}};
Length size = {2, 2, {0, 2}};
auto make_external = [](Tree *tree) {
tree->has_external_tokens = true;
tree->has_external_token_state = true;
return tree;
};
it("returns the last serialized external token state in the given tree", [&]() {
Tree *tree1, *tree2, *tree3, *tree4, *tree5, *tree6, *tree7, *tree8, *tree9;
tree1 = ts_tree_make_node(symbol1, 2, tree_array({
(tree2 = ts_tree_make_node(symbol2, 3, tree_array({
(tree3 = make_external(ts_tree_make_leaf(symbol3, padding, size, visible))),
(tree4 = ts_tree_make_leaf(symbol4, padding, size, visible)),
(tree5 = ts_tree_make_leaf(symbol5, padding, size, visible)),
}), visible)),
(tree6 = ts_tree_make_node(symbol6, 2, tree_array({
(tree7 = ts_tree_make_node(symbol7, 1, tree_array({
(tree8 = ts_tree_make_leaf(symbol8, padding, size, visible)),
}), visible)),
(tree9 = ts_tree_make_leaf(symbol9, padding, size, visible)),
}), visible)),
}), visible);
AssertThat(ts_tree_eq(different_parent, parent1), IsFalse());
AssertThat(ts_tree_eq(parent1, different_parent), IsFalse());
ts_tree_release(different_tree);
ts_tree_release(different_parent);
auto state = ts_tree_last_external_token_state(tree1);
AssertThat(state, Equals(&tree3->external_token_state));
});
});
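An illustrative sketch, not code from this commit, of the traversal the test above implies: scan children right-to-left so the last serialized state in document order wins. The `child_count` and `children` field names are assumptions about the Tree struct; `has_external_tokens` and `has_external_token_state` appear in the test.

// Hedged sketch of ts_tree_last_external_token_state's search.
const TSExternalTokenState *last_external_state(const Tree *tree) {
  if (!tree->has_external_tokens)
    return NULL;  // prune subtrees that contain no external tokens at all
  for (size_t i = tree->child_count; i > 0; i--) {
    const TSExternalTokenState *state = last_external_state(tree->children[i - 1]);
    if (state)
      return state;
  }
  return tree->has_external_token_state ? &tree->external_token_state : NULL;
}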
});

View file

@ -64,7 +64,7 @@ class LexTableBuilder {
private:
void add_lex_state_for_parse_state(ParseState *parse_state) {
parse_state->lex_state_id =
add_lex_state(item_set_for_tokens(parse_state->expected_inputs()));
add_lex_state(item_set_for_terminals(parse_state->terminal_entries));
}
LexStateId add_lex_state(const LexItemSet &item_set) {
@ -112,24 +112,27 @@ class LexTableBuilder {
void mark_fragile_tokens() {
for (ParseState &state : parse_table->states) {
for (auto &entry : state.terminal_entries) {
auto homonyms = conflict_manager.possible_homonyms.find(entry.first);
if (homonyms != conflict_manager.possible_homonyms.end())
for (Symbol::Index homonym : homonyms->second)
if (state.terminal_entries.count(homonym)) {
entry.second.reusable = false;
break;
}
Symbol symbol = entry.first;
if (symbol.is_token()) {
auto homonyms = conflict_manager.possible_homonyms.find(symbol.index);
if (homonyms != conflict_manager.possible_homonyms.end())
for (Symbol::Index homonym : homonyms->second)
if (state.terminal_entries.count(Symbol(homonym, Symbol::Terminal))) {
entry.second.reusable = false;
break;
}
if (!entry.second.reusable)
continue;
if (!entry.second.reusable)
continue;
auto extensions = conflict_manager.possible_extensions.find(entry.first);
if (extensions != conflict_manager.possible_extensions.end())
for (Symbol::Index extension : extensions->second)
if (state.terminal_entries.count(extension)) {
entry.second.depends_on_lookahead = true;
break;
}
auto extensions = conflict_manager.possible_extensions.find(symbol.index);
if (extensions != conflict_manager.possible_extensions.end())
for (Symbol::Index extension : extensions->second)
if (state.terminal_entries.count(Symbol(extension, Symbol::Terminal))) {
entry.second.depends_on_lookahead = true;
break;
}
}
}
}
}
@ -150,24 +153,27 @@ class LexTableBuilder {
}
}
LexItemSet item_set_for_tokens(const set<Symbol> &symbols) {
LexItemSet item_set_for_terminals(const map<Symbol, ParseTableEntry> &terminals) {
LexItemSet result;
for (const Symbol &symbol : symbols)
for (const rule_ptr &rule : rules_for_symbol(symbol))
for (const rule_ptr &separator_rule : separator_rules)
result.entries.insert(LexItem(
symbol,
Metadata::separator(
Seq::build({
separator_rule,
Metadata::main_token(rule) }))));
for (const auto &pair : terminals) {
Symbol symbol = pair.first;
if (symbol.is_token()) {
for (const rule_ptr &rule : rules_for_symbol(symbol)) {
for (const rule_ptr &separator_rule : separator_rules) {
result.entries.insert(LexItem(
symbol,
Metadata::separator(
Seq::build({
separator_rule,
Metadata::main_token(rule) }))));
}
}
}
}
return result;
}
vector<rule_ptr> rules_for_symbol(const rules::Symbol &symbol) {
if (!symbol.is_token)
return {};
if (symbol == rules::END_OF_INPUT())
return { CharacterSet().include(0).copy() };

View file

@ -52,7 +52,10 @@ class ParseTableBuilder {
allow_any_conflict(false) {}
pair<ParseTable, CompileError> build() {
Symbol start_symbol = Symbol(0, grammar.variables.empty());
Symbol start_symbol = grammar.variables.empty() ?
Symbol(0, Symbol::Terminal) :
Symbol(0, Symbol::NonTerminal);
Production start_production({
ProductionStep(start_symbol, 0, rules::AssociativityNone),
});
@ -63,7 +66,7 @@ class ParseTableBuilder {
add_parse_state(ParseItemSet({
{
ParseItem(rules::START(), start_production, 0),
LookaheadSet({ END_OF_INPUT().index }),
LookaheadSet({ END_OF_INPUT() }),
},
}));
@ -107,21 +110,25 @@ class ParseTableBuilder {
void build_error_parse_state() {
ParseState error_state;
for (const Symbol::Index index : parse_table.mergeable_symbols) {
add_out_of_context_parse_state(&error_state, Symbol(index, true));
for (const Symbol symbol : parse_table.mergeable_symbols) {
add_out_of_context_parse_state(&error_state, symbol);
}
for (const Symbol &symbol : grammar.extra_tokens) {
if (!error_state.terminal_entries.count(symbol.index)) {
error_state.terminal_entries[symbol.index].actions.push_back(ParseAction::ShiftExtra());
if (!error_state.terminal_entries.count(symbol)) {
error_state.terminal_entries[symbol].actions.push_back(ParseAction::ShiftExtra());
}
}
for (size_t i = 0; i < grammar.variables.size(); i++) {
add_out_of_context_parse_state(&error_state, Symbol(i, false));
for (size_t i = 0; i < grammar.external_tokens.size(); i++) {
add_out_of_context_parse_state(&error_state, Symbol(i, Symbol::External));
}
error_state.terminal_entries[END_OF_INPUT().index].actions.push_back(ParseAction::Recover(0));
for (size_t i = 0; i < grammar.variables.size(); i++) {
add_out_of_context_parse_state(&error_state, Symbol(i, Symbol::NonTerminal));
}
error_state.terminal_entries[END_OF_INPUT()].actions.push_back(ParseAction::Recover(0));
parse_table.states[0] = error_state;
}
@ -130,10 +137,10 @@ class ParseTableBuilder {
const ParseItemSet &item_set = recovery_states[symbol];
if (!item_set.entries.empty()) {
ParseStateId state = add_parse_state(item_set);
if (symbol.is_token) {
error_state->terminal_entries[symbol.index].actions.assign({ ParseAction::Recover(state) });
} else {
if (symbol.is_non_terminal()) {
error_state->nonterminal_entries[symbol.index] = state;
} else {
error_state->terminal_entries[symbol].actions.assign({ ParseAction::Recover(state) });
}
}
}
@ -152,9 +159,9 @@ class ParseTableBuilder {
}
string add_actions(const ParseItemSet &item_set, ParseStateId state_id) {
map<Symbol::Index, ParseItemSet> terminal_successors;
map<Symbol, ParseItemSet> terminal_successors;
map<Symbol::Index, ParseItemSet> nonterminal_successors;
set<Symbol::Index> lookaheads_with_conflicts;
set<Symbol> lookaheads_with_conflicts;
for (const auto &pair : item_set.entries) {
const ParseItem &item = pair.first;
@ -168,7 +175,7 @@ class ParseTableBuilder {
ParseAction::Reduce(item.lhs(), item.step_index, *item.production);
int precedence = item.precedence();
for (const Symbol::Index lookahead : *lookahead_symbols.entries) {
for (Symbol lookahead : *lookahead_symbols.entries) {
ParseTableEntry &entry = parse_table.states[state_id].terminal_entries[lookahead];
// Only add the highest-precedence Reduce actions to the parse table.
@ -203,10 +210,10 @@ class ParseTableBuilder {
Symbol symbol = item.production->at(item.step_index).symbol;
ParseItem new_item(item.lhs(), *item.production, item.step_index + 1);
if (symbol.is_token) {
terminal_successors[symbol.index].entries[new_item] = lookahead_symbols;
} else {
if (symbol.is_non_terminal()) {
nonterminal_successors[symbol.index].entries[new_item] = lookahead_symbols;
} else {
terminal_successors[symbol].entries[new_item] = lookahead_symbols;
}
}
}
@ -214,7 +221,7 @@ class ParseTableBuilder {
// Add a Shift action for each possible successor state. Shift actions for
// terminal lookaheads can conflict with Reduce actions added previously.
for (auto &pair : terminal_successors) {
Symbol::Index lookahead = pair.first;
Symbol lookahead = pair.first;
ParseItemSet &next_item_set = pair.second;
ParseStateId next_state_id = add_parse_state(next_item_set);
ParseState &state = parse_table.states[state_id];
@ -223,7 +230,7 @@ class ParseTableBuilder {
if (!allow_any_conflict) {
if (had_existing_action)
lookaheads_with_conflicts.insert(lookahead);
recovery_states[Symbol(lookahead, true)].add(next_item_set);
recovery_states[lookahead].add(next_item_set);
}
}
@ -234,10 +241,10 @@ class ParseTableBuilder {
ParseStateId next_state = add_parse_state(next_item_set);
parse_table.set_nonterminal_action(state_id, lookahead, next_state);
if (!allow_any_conflict)
recovery_states[Symbol(lookahead, false)].add(next_item_set);
recovery_states[Symbol(lookahead, Symbol::NonTerminal)].add(next_item_set);
}
for (Symbol::Index lookahead : lookaheads_with_conflicts) {
for (Symbol lookahead : lookaheads_with_conflicts) {
string conflict = handle_conflict(item_set, state_id, lookahead);
if (!conflict.empty()) return conflict;
}
@ -245,9 +252,9 @@ class ParseTableBuilder {
ParseAction shift_extra = ParseAction::ShiftExtra();
ParseState &state = parse_table.states[state_id];
for (const Symbol &extra_symbol : grammar.extra_tokens) {
if (!state.terminal_entries.count(extra_symbol.index) ||
if (!state.terminal_entries.count(extra_symbol) ||
state.has_shift_action() || allow_any_conflict) {
parse_table.add_terminal_action(state_id, extra_symbol.index, shift_extra);
parse_table.add_terminal_action(state_id, extra_symbol, shift_extra);
}
}
@ -257,7 +264,6 @@ class ParseTableBuilder {
void mark_fragile_actions() {
for (ParseState &state : parse_table.states) {
for (auto &entry : state.terminal_entries) {
const Symbol symbol(entry.first, true);
auto &actions = entry.second.actions;
for (ParseAction &action : actions) {
@ -359,7 +365,7 @@ class ParseTableBuilder {
}
string handle_conflict(const ParseItemSet &item_set, ParseStateId state_id,
Symbol::Index lookahead) {
Symbol lookahead) {
ParseTableEntry &entry = parse_table.states[state_id].terminal_entries[lookahead];
int reduction_precedence = entry.actions.front().precedence();
set<ParseItem> shift_items;
@ -468,7 +474,7 @@ class ParseTableBuilder {
description += " " + symbol_name(earliest_starting_item.production->at(i).symbol);
}
description += " \u2022 " + symbol_name(Symbol(lookahead, true)) + " \u2026";
description += " \u2022 " + symbol_name(lookahead) + " \u2026";
description += "\n\n";
description += "Possible interpretations:\n\n";
@ -487,7 +493,7 @@ class ParseTableBuilder {
description += " " + symbol_name(step.symbol);
}
description += ")";
description += " \u2022 " + symbol_name(Symbol(lookahead, true)) + " \u2026";
description += " \u2022 " + symbol_name(lookahead) + " \u2026";
description += "\n";
}
}
@ -564,14 +570,23 @@ class ParseTableBuilder {
return "END_OF_INPUT";
else
return "";
} else if (symbol.is_token) {
const Variable &variable = lexical_grammar.variables[symbol.index];
if (variable.type == VariableTypeNamed)
return variable.name;
else
return "'" + variable.name + "'";
} else {
return grammar.variables[symbol.index].name;
}
switch (symbol.type) {
case Symbol::Terminal: {
const Variable &variable = lexical_grammar.variables[symbol.index];
if (variable.type == VariableTypeNamed)
return variable.name;
else
return "'" + variable.name + "'";
}
case Symbol::NonTerminal: {
return grammar.variables[symbol.index].name;
}
case Symbol::External:
default: {
return grammar.external_tokens[symbol.index].name;
}
}
}

View file

@ -12,8 +12,8 @@ using rules::Symbol;
LookaheadSet::LookaheadSet() : entries(nullptr) {}
LookaheadSet::LookaheadSet(const set<Symbol::Index> &symbols)
: entries(make_shared<set<Symbol::Index>>(symbols)) {}
LookaheadSet::LookaheadSet(const set<Symbol> &symbols)
: entries(make_shared<set<Symbol>>(symbols)) {}
bool LookaheadSet::empty() const {
return !entries.get() || entries->empty();
@ -23,7 +23,7 @@ bool LookaheadSet::operator==(const LookaheadSet &other) const {
return *entries == *other.entries;
}
bool LookaheadSet::contains(const Symbol::Index &symbol) const {
bool LookaheadSet::contains(const Symbol &symbol) const {
return entries->find(symbol) != entries->end();
}
@ -31,15 +31,15 @@ bool LookaheadSet::insert_all(const LookaheadSet &other) {
if (!other.entries.get())
return false;
if (!entries.get())
entries = make_shared<set<Symbol::Index>>();
entries = make_shared<set<Symbol>>();
size_t previous_size = entries->size();
entries->insert(other.entries->begin(), other.entries->end());
return entries->size() > previous_size;
}
bool LookaheadSet::insert(const Symbol::Index &symbol) {
bool LookaheadSet::insert(const Symbol &symbol) {
if (!entries.get())
entries = make_shared<set<Symbol::Index>>();
entries = make_shared<set<Symbol>>();
return entries->insert(symbol).second;
}
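LookaheadSet now stores full Symbol values rather than bare indices, so a terminal and an external token that happen to share an index no longer collide. A small usage sketch (illustrative, not from this commit), using the constructor and methods shown above:

#include <cassert>

// Hedged sketch: index 0 of the terminal space and index 0 of the
// external-token space are now distinct set members.
LookaheadSet lookaheads;
lookaheads.insert(Symbol(0, Symbol::Terminal));
lookaheads.insert(Symbol(0, Symbol::External));
assert(lookaheads.contains(Symbol(0, Symbol::Terminal)));
assert(lookaheads.contains(Symbol(0, Symbol::External)));
// Under the old Symbol::Index representation both would have been plain 0.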

View file

@ -11,15 +11,15 @@ namespace build_tables {
class LookaheadSet {
public:
LookaheadSet();
explicit LookaheadSet(const std::set<rules::Symbol::Index> &);
explicit LookaheadSet(const std::set<rules::Symbol> &);
bool empty() const;
bool operator==(const LookaheadSet &) const;
bool contains(const rules::Symbol::Index &) const;
bool contains(const rules::Symbol &) const;
bool insert_all(const LookaheadSet &);
bool insert(const rules::Symbol::Index &);
bool insert(const rules::Symbol &);
std::shared_ptr<std::set<rules::Symbol::Index>> entries;
std::shared_ptr<std::set<rules::Symbol>> entries;
};
} // namespace build_tables

View file

@ -41,7 +41,7 @@ bool ParseItem::operator<(const ParseItem &other) const {
}
Symbol ParseItem::lhs() const {
return Symbol(variable_index);
return Symbol(variable_index, Symbol::NonTerminal);
}
bool ParseItem::is_done() const {
@ -105,38 +105,6 @@ size_t ParseItemSet::unfinished_item_signature() const {
return result;
}
ParseItemSet::ActionMap ParseItemSet::actions() const {
ParseItemSet::ActionMap result;
for (const auto &pair : entries) {
const ParseItem &item = pair.first;
const LookaheadSet &lookahead_symbols = pair.second;
if (item.step_index == item.production->size()) {
int precedence = item.precedence();
for (const Symbol::Index lookahead : *lookahead_symbols.entries) {
Action &action = result.terminal_actions[lookahead];
if (precedence > action.completion_precedence) {
action.completions.assign({ &item });
} else if (precedence == action.completion_precedence) {
action.completions.push_back({ &item });
}
}
} else {
Symbol symbol = item.production->at(item.step_index).symbol;
ParseItem new_item(item.lhs(), *item.production, item.step_index + 1);
if (symbol.is_token) {
result.terminal_actions[symbol.index].continuation.entries[new_item] = lookahead_symbols;
} else {
result.nonterminal_continuations[symbol.index].entries[new_item] = lookahead_symbols;
}
}
}
return result;
}
void ParseItemSet::add(const ParseItemSet &other) {
for (const auto &pair : other.entries)
entries[pair.first].insert_all(pair.second);

View file

@ -41,16 +41,6 @@ class ParseItemSet {
ParseItemSet();
explicit ParseItemSet(const std::map<ParseItem, LookaheadSet> &);
struct Completion;
struct Action;
struct ActionMap {
std::map<rules::Symbol::Index, Action> terminal_actions;
std::map<rules::Symbol::Index, ParseItemSet> nonterminal_continuations;
};
ActionMap actions() const;
bool operator==(const ParseItemSet &) const;
void add(const ParseItemSet &);
size_t unfinished_item_signature() const;
@ -58,22 +48,6 @@ class ParseItemSet {
std::map<ParseItem, LookaheadSet> entries;
};
struct ParseItemSet::Completion {
const ParseItem *item;
int precedence;
rules::Associativity associativity;
bool operator<(const ParseItemSet::Completion &other) {
return precedence < other.precedence;
}
};
struct ParseItemSet::Action {
ParseItemSet continuation;
std::vector<const ParseItem *> completions;
int completion_precedence;
};
} // namespace build_tables
} // namespace tree_sitter

View file

@ -27,12 +27,17 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar,
set<Symbol::Index> processed_non_terminals;
for (size_t i = 0, n = lexical_grammar.variables.size(); i < n; i++) {
Symbol symbol(i, true);
first_sets.insert({symbol, LookaheadSet({ static_cast<Symbol::Index>(i) })});
Symbol symbol(i, Symbol::Terminal);
first_sets.insert({symbol, LookaheadSet({ symbol })});
}
for (size_t i = 0, n = grammar.external_tokens.size(); i < n; i++) {
Symbol symbol(i, Symbol::External);
first_sets.insert({symbol, LookaheadSet({ symbol })});
}
for (size_t i = 0, n = grammar.variables.size(); i < n; i++) {
Symbol symbol(i);
Symbol symbol(i, Symbol::NonTerminal);
LookaheadSet first_set;
processed_non_terminals.clear();
@ -42,10 +47,10 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar,
Symbol current_symbol = symbols_to_process.back();
symbols_to_process.pop_back();
if (current_symbol.is_token) {
first_set.insert(current_symbol.index);
if (!current_symbol.is_non_terminal()) {
first_set.insert(current_symbol);
} else if (processed_non_terminals.insert(current_symbol.index).second) {
for (const Production &production : grammar.productions(current_symbol)) {
for (const Production &production : grammar.variables[current_symbol.index].productions) {
if (!production.empty()) {
symbols_to_process.push_back(production[0].symbol);
}
@ -59,11 +64,11 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar,
vector<ParseItemSetComponent> components_to_process;
for (size_t i = 0, n = grammar.variables.size(); i < n; i++) {
Symbol symbol(i);
Symbol symbol(i, Symbol::NonTerminal);
map<ParseItem, pair<LookaheadSet, bool>> cache_entry;
components_to_process.clear();
for (const Production &production : grammar.productions(symbol)) {
for (const Production &production : grammar.variables[i].productions) {
components_to_process.push_back(ParseItemSetComponent{
ParseItem(symbol, production, 0),
LookaheadSet(),
@ -87,7 +92,7 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar,
if (component_is_new) {
Symbol next_symbol = item.next_symbol();
if (next_symbol.is_built_in() || next_symbol.is_token)
if (!next_symbol.is_non_terminal() || next_symbol.is_built_in())
continue;
LookaheadSet next_lookaheads;
@ -102,7 +107,7 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar,
propagates_lookaheads = false;
}
for (const Production &production : grammar.productions(next_symbol)) {
for (const Production &production : grammar.variables[next_symbol.index].productions) {
components_to_process.push_back(ParseItemSetComponent{
ParseItem(next_symbol, production, 0),
next_lookaheads,
@ -130,7 +135,7 @@ void ParseItemSetBuilder::apply_transitive_closure(ParseItemSet *item_set) {
const LookaheadSet &lookaheads = pair.second;
const Symbol &next_symbol = item.next_symbol();
if (!next_symbol.is_token && !next_symbol.is_built_in()) {
if (next_symbol.is_non_terminal() && !next_symbol.is_built_in()) {
LookaheadSet next_lookaheads;
size_t next_step = item.step_index + 1;
if (next_step == item.production->size()) {

View file

@ -47,8 +47,8 @@ class FirstCharacters : public CharacterAggregator<true, false> {};
class LastCharacters : public CharacterAggregator<false, true> {};
class AllCharacters : public CharacterAggregator<true, true> {};
set<Symbol::Index> recovery_tokens(const LexicalGrammar &grammar) {
set<Symbol::Index> result;
set<Symbol> recovery_tokens(const LexicalGrammar &grammar) {
set<Symbol> result;
AllCharacters all_separator_characters;
for (const rule_ptr &separator : grammar.separators)
@ -79,7 +79,7 @@ set<Symbol::Index> recovery_tokens(const LexicalGrammar &grammar) {
!all_characters.result.intersects(all_separator_characters.result);
if ((has_distinct_start && has_distinct_end) || has_no_separators)
result.insert(i);
result.insert(Symbol(i, Symbol::Terminal));
}
return result;

View file

@ -11,7 +11,7 @@ struct LexicalGrammar;
namespace build_tables {
std::set<rules::Symbol::Index> recovery_tokens(const LexicalGrammar &);
std::set<rules::Symbol> recovery_tokens(const LexicalGrammar &);
} // namespace build_tables
} // namespace tree_sitter

View file

@ -11,9 +11,11 @@
#include "compiler/lexical_grammar.h"
#include "compiler/rules/built_in_symbols.h"
#include "compiler/util/string_helpers.h"
#include "tree_sitter/runtime.h"
namespace tree_sitter {
namespace generate_code {
using std::function;
using std::map;
using std::pair;
@ -22,6 +24,7 @@ using std::string;
using std::to_string;
using std::vector;
using util::escape_char;
using rules::Symbol;
static Variable EOF_ENTRY("end", VariableTypeNamed, rule_ptr());
@ -73,9 +76,8 @@ class CCodeGenerator {
const LexicalGrammar lexical_grammar;
map<string, string> sanitized_names;
vector<pair<size_t, ParseTableEntry>> parse_table_entries;
vector<pair<size_t, set<rules::Symbol>>> in_progress_symbols;
vector<set<Symbol::Index>> external_scanner_states;
size_t next_parse_action_list_index;
size_t next_in_progress_symbol_list_index;
public:
CCodeGenerator(string name, const ParseTable &parse_table,
@ -87,19 +89,26 @@ class CCodeGenerator {
lex_table(lex_table),
syntax_grammar(syntax_grammar),
lexical_grammar(lexical_grammar),
next_parse_action_list_index(0),
next_in_progress_symbol_list_index(0) {}
next_parse_action_list_index(0) {}
string code() {
buffer = "";
add_includes();
add_state_and_symbol_counts();
add_warning_pragma();
add_stats();
add_symbol_enum();
add_symbol_names_list();
add_symbol_node_types_list();
add_symbol_metadata_list();
add_lex_function();
add_lex_states_list();
add_lex_modes_list();
if (!syntax_grammar.external_tokens.empty()) {
add_external_token_enum();
add_external_scanner_symbol_map();
add_external_scanner_states_list();
}
add_parse_table();
add_parser_export();
@ -112,10 +121,25 @@ class CCodeGenerator {
line();
}
void add_state_and_symbol_counts() {
void add_warning_pragma() {
line("#pragma GCC diagnostic push");
line("#pragma GCC diagnostic ignored \"-Wmissing-field-initializers\"");
line();
}
void add_stats() {
size_t token_count = 1 + lexical_grammar.variables.size();
for (const ExternalToken &external_token : syntax_grammar.external_tokens) {
if (external_token.corresponding_internal_token == rules::NONE()) {
token_count++;
}
}
line("#define LANGUAGE_VERSION " + to_string(TREE_SITTER_LANGUAGE_VERSION));
line("#define STATE_COUNT " + to_string(parse_table.states.size()));
line("#define SYMBOL_COUNT " + to_string(parse_table.symbols.size()));
line("#define TOKEN_COUNT " + to_string(lexical_grammar.variables.size() + 1));
line("#define TOKEN_COUNT " + to_string(token_count));
line("#define EXTERNAL_TOKEN_COUNT " + to_string(syntax_grammar.external_tokens.size()));
line();
}
@ -124,7 +148,7 @@ class CCodeGenerator {
indent([&]() {
size_t i = 1;
for (const auto &entry : parse_table.symbols) {
const rules::Symbol &symbol = entry.first;
const Symbol &symbol = entry.first;
if (!symbol.is_built_in()) {
line(symbol_id(symbol) + " = " + to_string(i) + ",");
i++;
@ -146,11 +170,11 @@ class CCodeGenerator {
line();
}
void add_symbol_node_types_list() {
void add_symbol_metadata_list() {
line("static const TSSymbolMetadata ts_symbol_metadata[SYMBOL_COUNT] = {");
indent([&]() {
for (const auto &entry : parse_table.symbols) {
const rules::Symbol &symbol = entry.first;
const Symbol &symbol = entry.first;
line("[" + symbol_id(symbol) + "] = {");
indent([&]() {
switch (symbol_type(symbol)) {
@ -198,13 +222,102 @@ class CCodeGenerator {
line();
}
void add_lex_states_list() {
line("static TSStateId ts_lex_states[STATE_COUNT] = {");
void add_lex_modes_list() {
add_external_scanner_state({});
map<Symbol::Index, Symbol::Index> external_tokens_by_corresponding_internal_token;
for (size_t i = 0, n = lexical_grammar.variables.size(); i < n; i++) {
for (size_t j = 0; j < syntax_grammar.external_tokens.size(); j++) {
const ExternalToken &external_token = syntax_grammar.external_tokens[j];
if (external_token.corresponding_internal_token.index == Symbol::Index(i)) {
external_tokens_by_corresponding_internal_token.insert({i, j});
break;
}
}
}
line("static TSLexMode ts_lex_modes[STATE_COUNT] = {");
indent([&]() {
size_t state_id = 0;
for (const auto &state : parse_table.states)
line("[" + to_string(state_id++) + "] = " +
to_string(state.lex_state_id) + ",");
for (const auto &state : parse_table.states) {
line("[" + to_string(state_id++) + "] = {.lex_state = ");
add(to_string(state.lex_state_id));
bool needs_external_scanner = false;
set<Symbol::Index> external_token_indices;
for (const auto &pair : state.terminal_entries) {
Symbol symbol = pair.first;
if (symbol.is_external()) {
needs_external_scanner = true;
external_token_indices.insert(symbol.index);
} else if (symbol.is_token()) {
auto corresponding_external_token =
external_tokens_by_corresponding_internal_token.find(symbol.index);
if (corresponding_external_token != external_tokens_by_corresponding_internal_token.end()) {
external_token_indices.insert(corresponding_external_token->second);
}
}
}
if (needs_external_scanner) {
add(", .external_lex_state = " + add_external_scanner_state(external_token_indices));
}
add("},");
}
});
line("};");
line();
}
string add_external_scanner_state(set<Symbol::Index> external_token_ids) {
for (size_t i = 0, n = external_scanner_states.size(); i < n; i++)
if (external_scanner_states[i] == external_token_ids)
return to_string(i);
external_scanner_states.push_back(external_token_ids);
return to_string(external_scanner_states.size() - 1);
}
void add_external_token_enum() {
line("enum {");
indent([&]() {
for (size_t i = 0; i < syntax_grammar.external_tokens.size(); i++)
line(external_token_id(i) + ",");
});
line("};");
line();
}
void add_external_scanner_symbol_map() {
line("TSSymbol ts_external_scanner_symbol_map[EXTERNAL_TOKEN_COUNT] = {");
indent([&]() {
for (size_t i = 0; i < syntax_grammar.external_tokens.size(); i++) {
line("[" + external_token_id(i) + "] = " + symbol_id(Symbol(i, Symbol::External)) + ",");
}
});
line("};");
line();
}
void add_external_scanner_states_list() {
line("static bool ts_external_scanner_states[");
add(to_string(external_scanner_states.size()));
add("][EXTERNAL_TOKEN_COUNT] = {");
indent([&]() {
size_t i = 0;
for (const auto &valid_external_lookaheads : external_scanner_states) {
if (!valid_external_lookaheads.empty()) {
line("[" + to_string(i) + "] = {");
indent([&]() {
for (Symbol::Index id : valid_external_lookaheads) {
line("[" + external_token_id(id) + "] = true,");
}
});
line("},");
}
i++;
}
});
line("};");
line();
@ -214,9 +327,6 @@ class CCodeGenerator {
add_parse_action_list_id(ParseTableEntry{ {}, false, false });
size_t state_id = 0;
line("#pragma GCC diagnostic push");
line("#pragma GCC diagnostic ignored \"-Wmissing-field-initializers\"");
line();
line("static unsigned short ts_parse_table[STATE_COUNT][SYMBOL_COUNT] = {");
indent([&]() {
@ -224,12 +334,12 @@ class CCodeGenerator {
line("[" + to_string(state_id++) + "] = {");
indent([&]() {
for (const auto &entry : state.nonterminal_entries) {
line("[" + symbol_id(rules::Symbol(entry.first)) + "] = STATE(");
line("[" + symbol_id(Symbol(entry.first, Symbol::NonTerminal)) + "] = STATE(");
add(to_string(entry.second));
add("),");
}
for (const auto &entry : state.terminal_entries) {
line("[" + symbol_id(rules::Symbol(entry.first, true)) + "] = ACTIONS(");
line("[" + symbol_id(entry.first) + "] = ACTIONS(");
add(to_string(add_parse_action_list_id(entry.second)));
add("),");
}
@ -242,12 +352,42 @@ class CCodeGenerator {
line();
add_parse_action_list();
line();
line("#pragma GCC diagnostic pop");
line();
}
void add_parser_export() {
line("EXPORT_LANGUAGE(ts_language_" + name + ");");
string language_function_name = "tree_sitter_" + name;
string external_scanner_name = language_function_name + "_external_scanner";
if (!syntax_grammar.external_tokens.empty()) {
line("void *" + external_scanner_name + "_create();");
line("void " + external_scanner_name + "_destroy();");
line("void " + external_scanner_name + "_reset(void *);");
line("bool " + external_scanner_name + "_scan(void *, TSLexer *, const bool *);");
line("bool " + external_scanner_name + "_serialize(void *, TSExternalTokenState);");
line("void " + external_scanner_name + "_deserialize(void *, const TSExternalTokenState);");
line();
}
line("const TSLanguage *" + language_function_name + "() {");
indent([&]() {
line("GET_LANGUAGE(");
if (syntax_grammar.external_tokens.empty()) {
add(");");
} else {
indent([&]() {
line("(const bool *)ts_external_scanner_states,");
line("ts_external_scanner_symbol_map,");
line(external_scanner_name + "_create,");
line(external_scanner_name + "_destroy,");
line(external_scanner_name + "_reset,");
line(external_scanner_name + "_scan,");
line(external_scanner_name + "_serialize,");
line(external_scanner_name + "_deserialize,");
});
line(");");
}
});
line("}");
line();
}
@ -379,22 +519,13 @@ class CCodeGenerator {
return result;
}
size_t add_in_progress_symbol_list_id(const set<rules::Symbol> &symbols) {
for (const auto &pair : in_progress_symbols) {
if (pair.second == symbols) {
return pair.first;
}
}
size_t result = next_in_progress_symbol_list_index;
in_progress_symbols.push_back({ result, symbols });
next_in_progress_symbol_list_index += 1 + symbols.size();
return result;
}
// Helper functions
string symbol_id(const rules::Symbol &symbol) {
string external_token_id(Symbol::Index index) {
return "ts_external_token_" + syntax_grammar.external_tokens[index].name;
}
string symbol_id(const Symbol &symbol) {
if (symbol == rules::END_OF_INPUT())
return "ts_builtin_sym_end";
@ -411,25 +542,33 @@ class CCodeGenerator {
}
}
string symbol_name(const rules::Symbol &symbol) {
string symbol_name(const Symbol &symbol) {
if (symbol == rules::END_OF_INPUT())
return "END";
return entry_for_symbol(symbol).first;
}
VariableType symbol_type(const rules::Symbol &symbol) {
VariableType symbol_type(const Symbol &symbol) {
if (symbol == rules::END_OF_INPUT())
return VariableTypeHidden;
return entry_for_symbol(symbol).second;
}
pair<string, VariableType> entry_for_symbol(const rules::Symbol &symbol) {
if (symbol.is_token) {
const Variable &variable = lexical_grammar.variables[symbol.index];
return { variable.name, variable.type };
} else {
const SyntaxVariable &variable = syntax_grammar.variables[symbol.index];
return { variable.name, variable.type };
pair<string, VariableType> entry_for_symbol(const Symbol &symbol) {
switch (symbol.type) {
case Symbol::NonTerminal: {
const SyntaxVariable &variable = syntax_grammar.variables[symbol.index];
return { variable.name, variable.type };
}
case Symbol::Terminal: {
const Variable &variable = lexical_grammar.variables[symbol.index];
return { variable.name, variable.type };
}
case Symbol::External:
default: {
const ExternalToken &token = syntax_grammar.external_tokens[symbol.index];
return { token.name, token.type };
}
}
}

View file
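
Note: the six declarations emitted by add_parser_export above are the contract for a hand-written scanner that ships with the grammar. A minimal sketch of such a scanner, assuming a hypothetical `arithmetic` grammar with a single external token; the header path, enum value, and newline-based logic are illustrative assumptions, not part of this commit:

#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "tree_sitter/parser.h"  // assumed header providing TSLexer and TSExternalTokenState

enum { ts_external_token__terminator };  // mirrors the generated external-token enum

void *tree_sitter_arithmetic_external_scanner_create() {
  return calloc(1, sizeof(uint32_t));  // e.g. a nesting-depth counter
}

void tree_sitter_arithmetic_external_scanner_destroy(void *payload) {
  free(payload);
}

void tree_sitter_arithmetic_external_scanner_reset(void *payload) {
  *(uint32_t *)payload = 0;
}

bool tree_sitter_arithmetic_external_scanner_scan(void *payload, TSLexer *lexer,
                                                  const bool *whitelist) {
  if (!whitelist[ts_external_token__terminator]) return false;
  if (lexer->lookahead != '\n') return false;
  lexer->advance(lexer, false);  // consume the newline
  lexer->result_symbol = ts_external_token__terminator;
  return true;
}

bool tree_sitter_arithmetic_external_scanner_serialize(void *payload,
                                                       TSExternalTokenState state) {
  memcpy(state, payload, sizeof(uint32_t));  // the runtime zeroes the buffer beforehand
  return true;
}

void tree_sitter_arithmetic_external_scanner_deserialize(void *payload,
                                                         const TSExternalTokenState state) {
  memcpy(payload, state, sizeof(uint32_t));
}

The scanner reports its token by external enum index; the runtime then maps that index to a real symbol through ts_external_scanner_symbol_map, as generated above.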

@ -12,6 +12,7 @@ struct Grammar {
std::vector<std::pair<std::string, rule_ptr>> rules;
std::vector<rule_ptr> extra_tokens;
std::vector<std::vector<std::string>> expected_conflicts;
std::vector<std::string> external_tokens;
};
} // namespace tree_sitter

View file

@ -210,7 +210,7 @@ ParseGrammarResult parse_grammar(const string &input) {
string error_message;
string name;
Grammar grammar;
json_value name_json, rules_json, extras_json, conflicts_json;
json_value name_json, rules_json, extras_json, conflicts_json, external_tokens_json;
json_settings settings = { 0, json_enable_comments, 0, 0, 0, 0 };
char parse_error[json_error_max];
@ -302,6 +302,25 @@ ParseGrammarResult parse_grammar(const string &input) {
}
}
external_tokens_json = grammar_json->operator[]("externals");
if (external_tokens_json.type != json_none) {
if (external_tokens_json.type != json_array) {
error_message = "External tokens must be an array";
goto error;
}
for (size_t i = 0, length = external_tokens_json.u.array.length; i < length; i++) {
json_value *token_name_json = external_tokens_json.u.array.values[i];
if (token_name_json->type != json_string) {
error_message = "External token values must be strings";
goto error;
}
string token_name = token_name_json->u.string.ptr;
grammar.external_tokens.push_back(token_name);
}
}
json_value_free(grammar_json);
return { name, grammar, "" };

View file
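
Note: an input exercising the new `externals` branch might look like the following, shown as a string literal suitable for passing to parse_grammar. The grammar name and rule are hypothetical; the external token names must match the schema's identifier pattern:

// Hypothetical grammar input with an "externals" section:
static const char *grammar_with_externals =
  "{"
  "  \"name\": \"arithmetic\","
  "  \"externals\": [\"_terminator\"],"
  "  \"rules\": {"
  "    \"expression\": {\"type\": \"STRING\", \"value\": \"x\"}"
  "  }"
  "}";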

@ -1,6 +1,7 @@
#include "compiler/parse_table.h"
#include <string>
#include "compiler/precedence_range.h"
#include "compiler/rules/built_in_symbols.h"
namespace tree_sitter {
@ -28,7 +29,7 @@ ParseAction::ParseAction()
extra(false),
fragile(false),
state_index(-1),
symbol(Symbol(-1)),
symbol(rules::NONE()),
consumed_symbol_count(0),
production(nullptr) {}
@ -43,11 +44,11 @@ ParseAction ParseAction::Accept() {
}
ParseAction ParseAction::Shift(ParseStateId state_index) {
return ParseAction(ParseActionTypeShift, state_index, Symbol(-1), 0, nullptr);
return ParseAction(ParseActionTypeShift, state_index, rules::NONE(), 0, nullptr);
}
ParseAction ParseAction::Recover(ParseStateId state_index) {
return ParseAction(ParseActionTypeRecover, state_index, Symbol(-1), 0,
return ParseAction(ParseActionTypeRecover, state_index, rules::NONE(), 0,
nullptr);
}
@ -150,9 +151,7 @@ bool ParseState::has_shift_action() const {
set<Symbol> ParseState::expected_inputs() const {
set<Symbol> result;
for (auto &entry : terminal_entries)
result.insert(Symbol(entry.first, true));
for (auto &entry : nonterminal_entries)
result.insert(Symbol(entry.first, false));
result.insert(entry.first);
return result;
}
@ -182,33 +181,24 @@ ParseStateId ParseTable::add_state() {
return states.size() - 1;
}
ParseAction &ParseTable::set_terminal_action(ParseStateId state_id,
Symbol::Index index,
ParseAction action) {
states[state_id].terminal_entries[index].actions.clear();
return add_terminal_action(state_id, index, action);
}
ParseAction &ParseTable::add_terminal_action(ParseStateId state_id,
Symbol::Index index,
Symbol lookahead,
ParseAction action) {
Symbol symbol(index, true);
if (action.type == ParseActionTypeShift && action.extra)
symbols[symbol].extra = true;
symbols[lookahead].extra = true;
else
symbols[symbol].structural = true;
symbols[lookahead].structural = true;
ParseTableEntry &entry = states[state_id].terminal_entries[index];
ParseTableEntry &entry = states[state_id].terminal_entries[lookahead];
entry.actions.push_back(action);
return *entry.actions.rbegin();
}
void ParseTable::set_nonterminal_action(ParseStateId state_id,
Symbol::Index index,
Symbol::Index lookahead,
ParseStateId next_state_id) {
Symbol symbol(index, false);
symbols[symbol].structural = true;
states[state_id].nonterminal_entries[index] = next_state_id;
symbols[Symbol(lookahead, Symbol::NonTerminal)].structural = true;
states[state_id].nonterminal_entries[lookahead] = next_state_id;
}
static bool has_entry(const ParseState &state, const ParseTableEntry &entry) {
@ -226,12 +216,12 @@ bool ParseTable::merge_state(size_t i, size_t j) {
return false;
for (auto &entry : state.terminal_entries) {
Symbol::Index index = entry.first;
Symbol lookahead = entry.first;
const vector<ParseAction> &actions = entry.second.actions;
const auto &other_entry = other.terminal_entries.find(index);
const auto &other_entry = other.terminal_entries.find(lookahead);
if (other_entry == other.terminal_entries.end()) {
if (mergeable_symbols.count(index) == 0 && !Symbol::is_built_in(index))
if (mergeable_symbols.count(lookahead) == 0 && !lookahead.is_built_in())
return false;
if (actions.back().type != ParseActionTypeReduce)
return false;
@ -242,25 +232,25 @@ bool ParseTable::merge_state(size_t i, size_t j) {
}
}
set<Symbol::Index> symbols_to_merge;
set<Symbol> symbols_to_merge;
for (auto &entry : other.terminal_entries) {
Symbol::Index index = entry.first;
Symbol lookahead = entry.first;
const vector<ParseAction> &actions = entry.second.actions;
if (!state.terminal_entries.count(index)) {
if (mergeable_symbols.count(index) == 0 && !Symbol::is_built_in(index))
if (!state.terminal_entries.count(lookahead)) {
if (mergeable_symbols.count(lookahead) == 0 && !lookahead.is_built_in())
return false;
if (actions.back().type != ParseActionTypeReduce)
return false;
if (!has_entry(state, entry.second))
return false;
symbols_to_merge.insert(index);
symbols_to_merge.insert(lookahead);
}
}
for (const Symbol::Index &index : symbols_to_merge)
state.terminal_entries[index] = other.terminal_entries.find(index)->second;
for (const Symbol &lookahead : symbols_to_merge)
state.terminal_entries[lookahead] = other.terminal_entries.find(lookahead)->second;
return true;
}

View file

@ -76,7 +76,7 @@ class ParseState {
void each_referenced_state(std::function<void(ParseStateId *)>);
bool has_shift_action() const;
std::map<rules::Symbol::Index, ParseTableEntry> terminal_entries;
std::map<rules::Symbol, ParseTableEntry> terminal_entries;
std::map<rules::Symbol::Index, ParseStateId> nonterminal_entries;
LexStateId lex_state_id;
size_t shift_actions_signature;
@ -91,15 +91,14 @@ class ParseTable {
public:
std::set<rules::Symbol> all_symbols() const;
ParseStateId add_state();
ParseAction &add_terminal_action(ParseStateId state_id, int, ParseAction);
ParseAction &set_terminal_action(ParseStateId state_id, int index, ParseAction);
void set_nonterminal_action(ParseStateId state_id, int index, ParseStateId);
ParseAction &add_terminal_action(ParseStateId state_id, rules::Symbol, ParseAction);
void set_nonterminal_action(ParseStateId, rules::Symbol::Index, ParseStateId);
bool merge_state(size_t i, size_t j);
std::vector<ParseState> states;
std::map<rules::Symbol, ParseTableSymbolMetadata> symbols;
std::set<rules::Symbol::Index> mergeable_symbols;
std::set<rules::Symbol> mergeable_symbols;
};
} // namespace tree_sitter

View file

@ -39,7 +39,7 @@ class ExpandRepeats : public rules::IdentityRuleFn {
rule_ptr inner_rule = apply(rule->content);
size_t index = aux_rules.size();
string helper_rule_name = rule_name + "_repeat" + to_string(++repeat_count);
Symbol repeat_symbol(offset + index);
Symbol repeat_symbol(offset + index, Symbol::NonTerminal);
existing_repeats.push_back({ rule->copy(), repeat_symbol });
aux_rules.push_back(
Variable(helper_rule_name, VariableTypeAuxiliary,
@ -65,6 +65,7 @@ InitialSyntaxGrammar expand_repeats(const InitialSyntaxGrammar &grammar) {
result.variables = grammar.variables;
result.extra_tokens = grammar.extra_tokens;
result.expected_conflicts = grammar.expected_conflicts;
result.external_tokens = grammar.external_tokens;
ExpandRepeats expander(result.variables.size());
for (auto &variable : result.variables)

View file

@ -38,7 +38,7 @@ class SymbolReplacer : public rules::IdentityRuleFn {
map<Symbol, Symbol> replacements;
Symbol replace_symbol(const Symbol &symbol) {
if (symbol.is_built_in() || symbol.is_token)
if (!symbol.is_non_terminal())
return symbol;
auto replacement_pair = replacements.find(symbol);
@ -49,7 +49,7 @@ class SymbolReplacer : public rules::IdentityRuleFn {
for (const auto &pair : replacements)
if (pair.first.index < symbol.index)
new_index--;
return Symbol(new_index);
return Symbol(new_index, Symbol::NonTerminal);
}
};
@ -60,14 +60,14 @@ class TokenExtractor : public rules::IdentityRuleFn {
for (size_t i = 0; i < tokens.size(); i++)
if (tokens[i].rule->operator==(*input)) {
token_usage_counts[i]++;
return make_shared<Symbol>(i, true);
return make_shared<Symbol>(i, Symbol::Terminal);
}
rule_ptr rule = input->copy();
size_t index = tokens.size();
tokens.push_back(Variable(token_description(rule), entry_type, rule));
token_usage_counts.push_back(1);
return make_shared<Symbol>(index, true);
return make_shared<Symbol>(index, Symbol::Terminal);
}
rule_ptr apply_to(const rules::String *rule) {
@ -90,9 +90,8 @@ class TokenExtractor : public rules::IdentityRuleFn {
vector<Variable> tokens;
};
static CompileError ubiq_token_err(const string &message) {
return CompileError(TSCompileErrorTypeInvalidUbiquitousToken,
"Not a token: " + message);
static CompileError extra_token_error(const string &message) {
return CompileError(TSCompileErrorTypeInvalidExtraToken, "Not a token: " + message);
}
tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
@ -122,11 +121,10 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
size_t i = 0;
for (const Variable &variable : processed_variables) {
auto symbol = variable.rule->as<Symbol>();
if (symbol && symbol->is_token && !symbol->is_built_in() &&
extractor.token_usage_counts[symbol->index] == 1) {
if (symbol && symbol->is_token() && extractor.token_usage_counts[symbol->index] == 1) {
lexical_grammar.variables[symbol->index].type = variable.type;
lexical_grammar.variables[symbol->index].name = variable.name;
symbol_replacer.replacements.insert({ Symbol(i), *symbol });
symbol_replacer.replacements.insert({ Symbol(i, Symbol::NonTerminal), *symbol });
} else {
syntax_grammar.variables.push_back(variable);
}
@ -158,7 +156,7 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
bool used_elsewhere_in_grammar = false;
for (const Variable &variable : lexical_grammar.variables) {
if (variable.rule->operator==(*rule)) {
syntax_grammar.extra_tokens.insert(Symbol(i, true));
syntax_grammar.extra_tokens.insert(Symbol(i, Symbol::Terminal));
used_elsewhere_in_grammar = true;
}
i++;
@ -175,17 +173,39 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
auto symbol = rule->as<Symbol>();
if (!symbol)
return make_tuple(syntax_grammar, lexical_grammar,
ubiq_token_err(rule->to_string()));
extra_token_error(rule->to_string()));
Symbol new_symbol = symbol_replacer.replace_symbol(*symbol);
if (!new_symbol.is_token)
if (new_symbol.is_non_terminal()) {
return make_tuple(
syntax_grammar, lexical_grammar,
ubiq_token_err(syntax_grammar.variables[new_symbol.index].name));
extra_token_error(syntax_grammar.variables[new_symbol.index].name));
}
syntax_grammar.extra_tokens.insert(new_symbol);
}
for (const ExternalToken &external_token : grammar.external_tokens) {
Symbol internal_token = symbol_replacer.replace_symbol(external_token.corresponding_internal_token);
if (internal_token.is_non_terminal()) {
return make_tuple(
syntax_grammar,
lexical_grammar,
CompileError(
TSCompileErrorTypeInvalidExternalToken,
"Name '" + external_token.name + "' cannot be used for both an external token and a non-terminal rule"
)
);
}
syntax_grammar.external_tokens.push_back({
external_token.name,
external_token.type,
internal_token
});
}
return make_tuple(syntax_grammar, lexical_grammar, CompileError::none());
}

View file

@ -92,6 +92,7 @@ pair<SyntaxGrammar, CompileError> flatten_grammar(const InitialSyntaxGrammar &gr
SyntaxGrammar result;
result.expected_conflicts = grammar.expected_conflicts;
result.extra_tokens = grammar.extra_tokens;
result.external_tokens = grammar.external_tokens;
bool is_start = true;
for (const Variable &variable : grammar.variables) {

View file

@ -1,13 +1,12 @@
#ifndef COMPILER_PREPARE_GRAMMAR_INITIAL_SYNTAX_GRAMMAR_H_
#define COMPILER_PREPARE_GRAMMAR_INITIAL_SYNTAX_GRAMMAR_H_
#include <vector>
#include <string>
#include <set>
#include <vector>
#include "tree_sitter/compiler.h"
#include "compiler/rules/symbol.h"
#include "compiler/variable.h"
#include "compiler/syntax_grammar.h"
#include "compiler/variable.h"
namespace tree_sitter {
namespace prepare_grammar {
@ -16,6 +15,7 @@ struct InitialSyntaxGrammar {
std::vector<Variable> variables;
std::set<rules::Symbol> extra_tokens;
std::set<ConflictSet> expected_conflicts;
std::vector<ExternalToken> external_tokens;
};
} // namespace prepare_grammar

View file

@ -8,6 +8,7 @@
#include "compiler/rules/blank.h"
#include "compiler/rules/named_symbol.h"
#include "compiler/rules/symbol.h"
#include "compiler/rules/built_in_symbols.h"
namespace tree_sitter {
namespace prepare_grammar {
@ -17,8 +18,9 @@ using std::vector;
using std::set;
using std::pair;
using std::make_shared;
using rules::Symbol;
class InternSymbols : public rules::IdentityRuleFn {
class SymbolInterner : public rules::IdentityRuleFn {
using rules::IdentityRuleFn::apply_to;
rule_ptr apply_to(const rules::NamedSymbol *rule) {
@ -34,11 +36,14 @@ class InternSymbols : public rules::IdentityRuleFn {
std::shared_ptr<rules::Symbol> symbol_for_rule_name(string rule_name) {
for (size_t i = 0; i < grammar.rules.size(); i++)
if (grammar.rules[i].first == rule_name)
return make_shared<rules::Symbol>(i);
return make_shared<Symbol>(i, Symbol::NonTerminal);
for (size_t i = 0; i < grammar.external_tokens.size(); i++)
if (grammar.external_tokens[i] == rule_name)
return make_shared<Symbol>(i, Symbol::External);
return nullptr;
}
explicit InternSymbols(const Grammar &grammar) : grammar(grammar) {}
explicit SymbolInterner(const Grammar &grammar) : grammar(grammar) {}
const Grammar grammar;
string missing_rule_name;
};
@ -50,16 +55,35 @@ CompileError missing_rule_error(string rule_name) {
pair<InternedGrammar, CompileError> intern_symbols(const Grammar &grammar) {
InternedGrammar result;
InternSymbols interner(grammar);
for (auto &external_token_name : grammar.external_tokens) {
Symbol corresponding_internal_token = rules::NONE();
for (size_t i = 0, n = grammar.rules.size(); i < n; i++) {
if (grammar.rules[i].first == external_token_name) {
corresponding_internal_token = Symbol(i, Symbol::NonTerminal);
break;
}
}
result.external_tokens.push_back(ExternalToken{
external_token_name,
external_token_name[0] == '_' ? VariableTypeHidden : VariableTypeNamed,
corresponding_internal_token
});
}
SymbolInterner interner(grammar);
for (auto &pair : grammar.rules) {
auto new_rule = interner.apply(pair.second);
if (!interner.missing_rule_name.empty())
return { result, missing_rule_error(interner.missing_rule_name) };
result.variables.push_back(Variable(
pair.first, pair.first[0] == '_' ? VariableTypeHidden : VariableTypeNamed,
new_rule));
result.variables.push_back(Variable{
pair.first,
pair.first[0] == '_' ? VariableTypeHidden : VariableTypeNamed,
new_rule
});
}
for (auto &rule : grammar.extra_tokens) {

View file

@ -15,6 +15,7 @@ struct InternedGrammar {
std::vector<Variable> variables;
std::vector<rule_ptr> extra_tokens;
std::set<ConflictSet> expected_conflicts;
std::vector<ExternalToken> external_tokens;
};
} // namespace prepare_grammar

View file

@ -4,15 +4,15 @@ namespace tree_sitter {
namespace rules {
Symbol END_OF_INPUT() {
return Symbol(-1, true);
return Symbol(-1, Symbol::Terminal);
}
Symbol START() {
return Symbol(-2);
return Symbol(-2, Symbol::NonTerminal);
}
Symbol NONE() {
return Symbol(-3);
return Symbol(-3, Symbol::Type(-1));
}
} // namespace rules

View file

@ -11,12 +11,10 @@ using std::string;
using std::to_string;
using util::hash_combine;
Symbol::Symbol(Symbol::Index index) : index(index), is_token(false) {}
Symbol::Symbol(Symbol::Index index, bool is_token) : index(index), is_token(is_token) {}
Symbol::Symbol(Symbol::Index index, Symbol::Type type) : index(index), type(type) {}
bool Symbol::operator==(const Symbol &other) const {
return (other.index == index) && (other.is_token == is_token);
return (other.index == index) && (other.type == type);
}
bool Symbol::operator==(const Rule &rule) const {
@ -27,7 +25,7 @@ bool Symbol::operator==(const Rule &rule) const {
size_t Symbol::hash_code() const {
size_t result = 0;
hash_combine(&result, index);
hash_combine(&result, is_token);
hash_combine<int>(&result, type);
return result;
}
@ -36,14 +34,22 @@ rule_ptr Symbol::copy() const {
}
string Symbol::to_string() const {
string name = is_token ? "token" : "sym";
return "(" + name + " " + std::to_string(index) + ")";
switch (type) {
case Symbol::Terminal:
return "(terminal " + std::to_string(index) + ")";
case Symbol::NonTerminal:
return "(non-terminal " + std::to_string(index) + ")";
case Symbol::External:
return "(external " + std::to_string(index) + ")";
default:
return "(none)";
}
}
bool Symbol::operator<(const Symbol &other) const {
if (is_token && !other.is_token)
if (type < other.type)
return true;
if (!is_token && other.is_token)
if (other.type < type)
return false;
return (index < other.index);
}
@ -56,6 +62,18 @@ bool Symbol::is_built_in() const {
return is_built_in(index);
}
bool Symbol::is_token() const {
return type == Symbol::Terminal;
}
bool Symbol::is_external() const {
return type == Symbol::External;
}
bool Symbol::is_non_terminal() const {
return type == Symbol::NonTerminal;
}
void Symbol::accept(Visitor *visitor) const {
visitor->visit(this);
}

View file

@ -11,9 +11,13 @@ class Symbol : public Rule {
public:
typedef int Index;
typedef enum {
External,
Terminal,
NonTerminal,
} Type;
explicit Symbol(Index index);
Symbol(Index index, bool is_token);
Symbol(Index index, Type type);
bool operator==(const Symbol &other) const;
bool operator==(const Rule &other) const;
@ -26,9 +30,12 @@ class Symbol : public Rule {
bool operator<(const Symbol &other) const;
static bool is_built_in(Index);
bool is_built_in() const;
bool is_token() const;
bool is_external() const;
bool is_non_terminal() const;
Index index;
bool is_token;
Type type;
};
} // namespace rules

View file

@ -16,6 +16,7 @@ class String;
class Symbol;
class Pattern;
class Metadata;
class ExternalToken;
class Visitor {
public:
@ -29,6 +30,7 @@ class Visitor {
virtual void visit(const String *rule) = 0;
virtual void visit(const NamedSymbol *rule) = 0;
virtual void visit(const Symbol *rule) = 0;
virtual void visit(const ExternalToken *rule) = 0;
virtual ~Visitor();
};
@ -86,6 +88,10 @@ class RuleFn : private Visitor {
return default_apply((const Rule *)rule);
}
virtual T apply_to(const ExternalToken *rule) {
return default_apply((const Rule *)rule);
}
void visit(const Blank *rule) {
value_ = apply_to(rule);
}
@ -126,6 +132,10 @@ class RuleFn : private Visitor {
value_ = apply_to(rule);
}
void visit(const ExternalToken *rule) {
value_ = apply_to(rule);
}
private:
T value_;
};
@ -170,6 +180,9 @@ class RuleFn<void> : private Visitor {
virtual void apply_to(const Symbol *rule) {
return default_apply((const Rule *)rule);
}
virtual void apply_to(const ExternalToken *rule) {
return default_apply((const Rule *)rule);
}
void visit(const Blank *rule) {
apply_to(rule);
@ -201,6 +214,9 @@ class RuleFn<void> : private Visitor {
void visit(const Symbol *rule) {
apply_to(rule);
}
void visit(const ExternalToken *rule) {
apply_to(rule);
}
};
class IdentityRuleFn : public RuleFn<rule_ptr> {

View file

@ -13,8 +13,6 @@ using std::pair;
using std::vector;
using std::set;
static const vector<Production> NO_PRODUCTIONS;
SyntaxVariable::SyntaxVariable(const string &name, VariableType type,
const vector<Production> &productions)
: name(name), productions(productions), type(type) {}
@ -23,18 +21,14 @@ ProductionStep::ProductionStep(const rules::Symbol &symbol, int precedence,
rules::Associativity associativity)
: symbol(symbol), precedence(precedence), associativity(associativity) {}
bool ExternalToken::operator==(const ExternalToken &other) const {
return name == other.name && type == other.type &&
corresponding_internal_token == other.corresponding_internal_token;
}
bool ProductionStep::operator==(const ProductionStep &other) const {
return symbol == other.symbol && precedence == other.precedence &&
associativity == other.associativity;
}
const vector<Production> &SyntaxGrammar::productions(
const rules::Symbol &symbol) const {
if (symbol.is_built_in() || symbol.is_token) {
return NO_PRODUCTIONS;
} else {
return variables[symbol.index].productions;
}
}
} // namespace tree_sitter

View file

@ -10,6 +10,14 @@
namespace tree_sitter {
struct ExternalToken {
std::string name;
VariableType type;
rules::Symbol corresponding_internal_token;
bool operator==(const ExternalToken &) const;
};
struct ProductionStep {
ProductionStep(const rules::Symbol &, int, rules::Associativity);
bool operator==(const ProductionStep &) const;
@ -33,11 +41,10 @@ struct SyntaxVariable {
typedef std::set<rules::Symbol> ConflictSet;
struct SyntaxGrammar {
const std::vector<Production> &productions(const rules::Symbol &) const;
std::vector<SyntaxVariable> variables;
std::set<rules::Symbol> extra_tokens;
std::set<ConflictSet> expected_conflicts;
std::vector<ExternalToken> external_tokens;
};
} // namespace tree_sitter

View file

@ -36,8 +36,9 @@ const TSLanguage *ts_document_language(TSDocument *self) {
}
void ts_document_set_language(TSDocument *self, const TSLanguage *language) {
if (language->version != TREE_SITTER_LANGUAGE_VERSION) return;
ts_document_invalidate(self);
self->parser.language = language;
parser_set_language(&self->parser, language);
if (self->tree) {
ts_tree_release(self->tree);
self->tree = NULL;

View file

@ -34,6 +34,10 @@ uint32_t ts_language_symbol_count(const TSLanguage *language) {
return language->symbol_count;
}
uint32_t ts_language_version(const TSLanguage *language) {
return language->version;
}
TSSymbolMetadata ts_language_symbol_metadata(const TSLanguage *language,
TSSymbol symbol) {
if (symbol == ts_builtin_sym_error)

View file

@ -19,6 +19,10 @@ void ts_language_table_entry(const TSLanguage *, TSStateId, TSSymbol, TableEntry
TSSymbolMetadata ts_language_symbol_metadata(const TSLanguage *, TSSymbol);
static inline bool ts_language_is_symbol_external(const TSLanguage *self, TSSymbol symbol) {
return 0 < symbol && symbol < self->external_token_count + 1;
}
static inline const TSParseAction *ts_language_actions(const TSLanguage *self,
TSStateId state,
TSSymbol symbol,
@ -49,6 +53,16 @@ static inline TSStateId ts_language_next_state(const TSLanguage *self,
}
}
static inline const bool *
ts_language_enabled_external_tokens(const TSLanguage *self,
unsigned external_scanner_state) {
if (external_scanner_state == 0) {
return NULL;
} else {
return self->external_scanner.states + self->external_token_count * external_scanner_state;
}
}
#ifdef __cplusplus
}
#endif

View file
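
Note: external_scanner.states is a flattened two-dimensional array with one row per external scanner state, so row n begins at offset n * external_token_count; state 0 is reserved to mean "no external tokens are valid". A usage sketch, where `language`, `state`, and the token index are illustrative names:

// Check whether an external token may be recognized in the current parse state.
TSLexMode mode = language->lex_modes[state];
const bool *valid = ts_language_enabled_external_tokens(language, mode.external_lex_state);
if (valid && valid[ts_external_token__terminator]) {
  // the external scanner will be consulted before the internal lexer here
}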

@ -21,12 +21,11 @@ static inline void length_set_unknown_chars(Length *self) {
}
static inline Length length_min(Length len1, Length len2) {
return (len1.chars < len2.chars) ? len1 : len2;
return (len1.bytes < len2.bytes) ? len1 : len2;
}
static inline Length length_add(Length len1, Length len2) {
Length result;
result.chars = len1.chars + len2.chars;
result.bytes = len1.bytes + len2.bytes;
result.extent = point_add(len1.extent, len2.extent);
@ -57,10 +56,4 @@ static inline Length length_zero() {
return (Length){ 0, 0, {0, 0} };
}
static inline bool length_eq(Length self, Length other) {
return self.bytes == other.bytes && self.chars == other.chars &&
self.extent.row == other.extent.row &&
self.extent.column == other.extent.column;
}
#endif

View file

@ -11,11 +11,8 @@
self->logger.log(self->logger.payload, TSLogTypeLex, self->debug_buffer); \
}
#define LOG_LOOKAHEAD() \
LOG((0 < self->data.lookahead && self->data.lookahead < 256) \
? "lookahead char:'%c'" \
: "lookahead char:%d", \
self->data.lookahead);
#define LOG_CHARACTER(message, character) \
LOG(character < 255 ? message " character:'%c'" : message " character:%d", character)
static const char empty_chunk[2] = { 0, 0 };
@ -42,11 +39,9 @@ static void ts_lexer__get_lookahead(Lexer *self) {
utf8proc_iterate(chunk, size, &self->data.lookahead);
else
self->lookahead_size = utf16_iterate(chunk, size, &self->data.lookahead);
LOG_LOOKAHEAD();
}
static void ts_lexer__advance(void *payload, TSStateId state, bool skip) {
static void ts_lexer__advance(void *payload, bool skip) {
Lexer *self = (Lexer *)payload;
if (self->chunk == empty_chunk)
return;
@ -63,10 +58,10 @@ static void ts_lexer__advance(void *payload, TSStateId state, bool skip) {
}
if (skip) {
LOG("skip_separator state:%d", state);
LOG_CHARACTER("skip", self->data.lookahead);
self->token_start_position = self->current_position;
} else {
LOG("advance state:%d", state);
LOG_CHARACTER("consume", self->data.lookahead);
}
if (self->current_position.bytes >= self->chunk_start + self->chunk_size)
@ -93,6 +88,7 @@ void ts_lexer_init(Lexer *self) {
.payload = NULL,
.log = NULL
},
.last_external_token_state = NULL,
};
ts_lexer_reset(self, length_zero());
}
@ -115,17 +111,16 @@ static inline void ts_lexer__reset(Lexer *self, Length position) {
void ts_lexer_set_input(Lexer *self, TSInput input) {
self->input = input;
ts_lexer__reset(self, length_zero());
self->last_external_token_state = NULL;
}
void ts_lexer_reset(Lexer *self, Length position) {
if (!length_eq(position, self->current_position))
if (position.bytes != self->current_position.bytes) {
ts_lexer__reset(self, position);
return;
}
}
void ts_lexer_start(Lexer *self, TSStateId lex_state) {
LOG("start_lex state:%d, pos:%u", lex_state, self->current_position.chars);
void ts_lexer_start(Lexer *self) {
self->token_start_position = self->current_position;
self->data.result_symbol = 0;

View file

@ -25,12 +25,13 @@ typedef struct {
TSInput input;
TSLogger logger;
char debug_buffer[TS_DEBUG_BUFFER_SIZE];
const TSExternalTokenState *last_external_token_state;
} Lexer;
void ts_lexer_init(Lexer *);
void ts_lexer_set_input(Lexer *, TSInput);
void ts_lexer_reset(Lexer *, Length);
void ts_lexer_start(Lexer *, TSStateId);
void ts_lexer_start(Lexer *);
#ifdef __cplusplus
}

View file

@ -39,7 +39,15 @@ static inline bool ts_node__is_relevant(TSNode self, bool include_anonymous) {
static inline uint32_t ts_node__relevant_child_count(TSNode self,
bool include_anonymous) {
const Tree *tree = ts_node__tree(self);
return include_anonymous ? tree->visible_child_count : tree->named_child_count;
if (tree->child_count > 0) {
if (include_anonymous) {
return tree->visible_child_count;
} else {
return tree->named_child_count;
}
} else {
return 0;
}
}
static inline TSNode ts_node__direct_parent(TSNode self, uint32_t *index) {
@ -324,11 +332,21 @@ TSNode ts_node_named_child(TSNode self, uint32_t child_index) {
}
uint32_t ts_node_child_count(TSNode self) {
return ts_node__tree(self)->visible_child_count;
const Tree *tree = ts_node__tree(self);
if (tree->child_count > 0) {
return tree->visible_child_count;
} else {
return 0;
}
}
uint32_t ts_node_named_child_count(TSNode self) {
return ts_node__tree(self)->named_child_count;
const Tree *tree = ts_node__tree(self);
if (tree->child_count > 0) {
return tree->named_child_count;
} else {
return 0;
}
}
TSNode ts_node_next_sibling(TSNode self) {

View file

@ -109,28 +109,6 @@ static bool parser__breakdown_top_of_stack(Parser *self, StackVersion version) {
return did_break_down;
}
static void parser__pop_reusable_node(ReusableNode *reusable_node) {
reusable_node->byte_index += ts_tree_total_bytes(reusable_node->tree);
while (reusable_node->tree) {
Tree *parent = reusable_node->tree->context.parent;
uint32_t next_index = reusable_node->tree->context.index + 1;
if (parent && parent->child_count > next_index) {
reusable_node->tree = parent->children[next_index];
return;
}
reusable_node->tree = parent;
}
}
static bool parser__breakdown_reusable_node(ReusableNode *reusable_node) {
if (reusable_node->tree->child_count == 0) {
return false;
} else {
reusable_node->tree = reusable_node->tree->children[0];
return true;
}
}
static bool parser__breakdown_lookahead(Parser *self, Tree **lookahead,
TSStateId state,
ReusableNode *reusable_node) {
@ -140,12 +118,11 @@ static bool parser__breakdown_lookahead(Parser *self, Tree **lookahead,
reusable_node->tree->fragile_left ||
reusable_node->tree->fragile_right)) {
LOG("state_mismatch sym:%s", SYM_NAME(reusable_node->tree->symbol));
parser__breakdown_reusable_node(reusable_node);
reusable_node_breakdown(reusable_node);
result = true;
}
if (result) {
LOG("lookahead sym:%s", SYM_NAME(reusable_node->tree->symbol));
ts_tree_release(*lookahead);
ts_tree_retain(*lookahead = reusable_node->tree);
}
@ -153,16 +130,20 @@ static bool parser__breakdown_lookahead(Parser *self, Tree **lookahead,
return result;
}
static void parser__pop_reusable_node_leaf(ReusableNode *reusable_node) {
while (reusable_node->tree->child_count > 0)
reusable_node->tree = reusable_node->tree->children[0];
parser__pop_reusable_node(reusable_node);
static inline bool ts_lex_mode_eq(TSLexMode self, TSLexMode other) {
return self.lex_state == other.lex_state &&
self.external_lex_state == other.external_lex_state;
}
static bool parser__can_reuse(Parser *self, TSStateId state, Tree *tree,
TableEntry *table_entry) {
if (tree->first_leaf.lex_state == self->language->lex_states[state])
TSLexMode current_lex_mode = self->language->lex_modes[state];
if (ts_lex_mode_eq(tree->first_leaf.lex_mode, current_lex_mode))
return true;
if (current_lex_mode.external_lex_state != 0)
return false;
if (tree->size.bytes == 0)
return false;
if (!table_entry->is_reusable)
return false;
if (!table_entry->depends_on_lookahead)
@ -208,28 +189,76 @@ static bool parser__condense_stack(Parser *self) {
return result;
}
static Tree *parser__lex(Parser *self, TSStateId parse_state) {
TSStateId start_state = self->language->lex_states[parse_state];
TSStateId current_state = start_state;
Length start_position = self->lexer.current_position;
LOG("lex state:%d", start_state);
static void parser__restore_external_scanner(Parser *self, StackVersion version) {
const TSExternalTokenState *state = ts_stack_external_token_state(self->stack, version);
if (self->lexer.last_external_token_state != state) {
LOG("restore_external_scanner");
self->lexer.last_external_token_state = state;
if (state) {
self->language->external_scanner.deserialize(
self->external_scanner_payload,
*state
);
} else {
self->language->external_scanner.reset(self->external_scanner_payload);
}
}
}
static Tree *parser__lex(Parser *self, StackVersion version) {
TSStateId parse_state = ts_stack_top_state(self->stack, version);
Length start_position = ts_stack_top_position(self->stack, version);
TSLexMode lex_mode = self->language->lex_modes[parse_state];
const bool *valid_external_tokens = ts_language_enabled_external_tokens(
self->language,
lex_mode.external_lex_state
);
bool found_external_token = false;
bool found_error = false;
bool skipped_error = false;
int32_t first_error_character = 0;
Length error_start_position, error_end_position;
ts_lexer_reset(&self->lexer, start_position);
ts_lexer_start(&self->lexer, start_state);
for (;;) {
Length current_position = self->lexer.current_position;
while (!self->language->lex_fn(&self->lexer.data, current_state)) {
if (current_state != ERROR_STATE) {
if (valid_external_tokens) {
LOG("lex_external state:%d, row:%u, column:%u", lex_mode.external_lex_state,
current_position.extent.row, current_position.extent.column);
parser__restore_external_scanner(self, version);
ts_lexer_start(&self->lexer);
if (self->language->external_scanner.scan(self->external_scanner_payload,
&self->lexer.data, valid_external_tokens)) {
found_external_token = true;
break;
}
ts_lexer_reset(&self->lexer, current_position);
}
LOG("lex_internal state:%d, row:%u, column:%u", lex_mode.lex_state,
current_position.extent.row, current_position.extent.column);
ts_lexer_start(&self->lexer);
if (self->language->lex_fn(&self->lexer.data, lex_mode.lex_state)) {
break;
}
if (!found_error) {
LOG("retry_in_error_mode");
current_state = ERROR_STATE;
found_error = true;
lex_mode = self->language->lex_modes[ERROR_STATE];
valid_external_tokens = ts_language_enabled_external_tokens(
self->language,
lex_mode.external_lex_state
);
ts_lexer_reset(&self->lexer, start_position);
ts_lexer_start(&self->lexer, current_state);
continue;
}
if (!skipped_error) {
LOG("skip_unrecognized_character");
skipped_error = true;
error_start_position = self->lexer.token_start_position;
first_error_character = self->lexer.data.lookahead;
}
@ -239,15 +268,13 @@ static Tree *parser__lex(Parser *self, TSStateId parse_state) {
self->lexer.data.result_symbol = ts_builtin_sym_error;
break;
}
self->lexer.data.advance(&self->lexer, ERROR_STATE, false);
self->lexer.data.advance(&self->lexer, false);
}
skipped_error = true;
error_end_position = self->lexer.current_position;
}
Tree *result;
if (skipped_error) {
Length padding = length_sub(error_start_position, start_position);
Length size = length_sub(error_end_position, error_start_position);
@ -255,20 +282,28 @@ static Tree *parser__lex(Parser *self, TSStateId parse_state) {
result = ts_tree_make_error(size, padding, first_error_character);
} else {
TSSymbol symbol = self->lexer.data.result_symbol;
Length padding =
length_sub(self->lexer.token_start_position, start_position);
Length size = length_sub(self->lexer.current_position,
self->lexer.token_start_position);
result =
ts_tree_make_leaf(symbol, padding, size,
ts_language_symbol_metadata(self->language, symbol));
if (found_external_token) {
symbol = self->language->external_scanner.symbol_map[symbol];
}
Length padding = length_sub(self->lexer.token_start_position, start_position);
Length size = length_sub(self->lexer.current_position, self->lexer.token_start_position);
TSSymbolMetadata metadata = ts_language_symbol_metadata(self->language, symbol);
result = ts_tree_make_leaf(symbol, padding, size, metadata);
if (found_external_token) {
result->has_external_tokens = true;
result->has_external_token_state = true;
memset(result->external_token_state, 0, sizeof(TSExternalTokenState));
self->language->external_scanner.serialize(self->external_scanner_payload, result->external_token_state);
self->lexer.last_external_token_state = &result->external_token_state;
}
}
if (!result)
return NULL;
result->parse_state = parse_state;
result->first_leaf.lex_state = start_state;
result->first_leaf.lex_mode = lex_mode;
LOG("lexed_lookahead sym:%s, size:%u", SYM_NAME(result->symbol), result->size.bytes);
return result;
}
@ -277,21 +312,31 @@ static void parser__clear_cached_token(Parser *self) {
self->cached_token = NULL;
}
static inline bool ts_external_token_state_eq(const TSExternalTokenState *self,
const TSExternalTokenState *other) {
if (self == other) {
return true;
} else if (!self || !other) {
return false;
} else {
return memcmp(self, other, sizeof(TSExternalTokenState)) == 0;
}
}
static Tree *parser__get_lookahead(Parser *self, StackVersion version,
ReusableNode *reusable_node) {
ReusableNode *reusable_node,
bool *is_fresh) {
Length position = ts_stack_top_position(self->stack, version);
while (reusable_node->tree) {
if (reusable_node->byte_index > position.bytes) {
LOG("before_reusable sym:%s, pos:%u",
SYM_NAME(reusable_node->tree->symbol), reusable_node->byte_index);
LOG("before_reusable_node sym:%s", SYM_NAME(reusable_node->tree->symbol));
break;
}
if (reusable_node->byte_index < position.bytes) {
LOG("past_reusable sym:%s, pos:%u",
SYM_NAME(reusable_node->tree->symbol), reusable_node->byte_index);
parser__pop_reusable_node(reusable_node);
LOG("past_reusable sym:%s", SYM_NAME(reusable_node->tree->symbol));
reusable_node_pop(reusable_node);
continue;
}
@ -299,8 +344,8 @@ static Tree *parser__get_lookahead(Parser *self, StackVersion version,
LOG("cant_reuse_changed tree:%s, size:%u",
SYM_NAME(reusable_node->tree->symbol),
reusable_node->tree->size.bytes);
if (!parser__breakdown_reusable_node(reusable_node)) {
parser__pop_reusable_node(reusable_node);
if (!reusable_node_breakdown(reusable_node)) {
reusable_node_pop(reusable_node);
parser__breakdown_top_of_stack(self, version);
}
continue;
@ -310,8 +355,21 @@ static Tree *parser__get_lookahead(Parser *self, StackVersion version,
LOG("cant_reuse_error tree:%s, size:%u",
SYM_NAME(reusable_node->tree->symbol),
reusable_node->tree->size.bytes);
if (!parser__breakdown_reusable_node(reusable_node)) {
parser__pop_reusable_node(reusable_node);
if (!reusable_node_breakdown(reusable_node)) {
reusable_node_pop(reusable_node);
parser__breakdown_top_of_stack(self, version);
}
continue;
}
if (!ts_external_token_state_eq(
reusable_node->preceding_external_token_state,
ts_stack_external_token_state(self->stack, version))) {
LOG("cant_reuse_external_tokens tree:%s, size:%u",
SYM_NAME(reusable_node->tree->symbol),
reusable_node->tree->size.bytes);
if (!reusable_node_breakdown(reusable_node)) {
reusable_node_pop(reusable_node);
parser__breakdown_top_of_stack(self, version);
}
continue;
@ -327,9 +385,8 @@ static Tree *parser__get_lookahead(Parser *self, StackVersion version,
return self->cached_token;
}
ts_lexer_reset(&self->lexer, position);
TSStateId parse_state = ts_stack_top_state(self->stack, version);
return parser__lex(self, parse_state);
*is_fresh = true;
return parser__lex(self, version);
}
static bool parser__select_tree(Parser *self, Tree *left, Tree *right) {
@ -407,6 +464,10 @@ static void parser__shift(Parser *self, StackVersion version, TSStateId state,
bool is_pending = lookahead->child_count > 0;
ts_stack_push(self->stack, version, lookahead, is_pending, state);
if (lookahead->has_external_token_state) {
ts_stack_set_external_token_state(
self->stack, version, ts_tree_last_external_token_state(lookahead));
}
ts_tree_release(lookahead);
}
@ -729,9 +790,13 @@ static void parser__start(Parser *self, TSInput input, Tree *previous_tree) {
LOG("new_parse");
}
if (self->language->external_scanner.reset) {
self->language->external_scanner.reset(self->external_scanner_payload);
}
ts_lexer_set_input(&self->lexer, input);
ts_stack_clear(self->stack);
self->reusable_node = (ReusableNode){ previous_tree, 0 };
self->reusable_node = reusable_node_new(previous_tree);
self->cached_token = NULL;
self->finished_tree = NULL;
}
@ -950,30 +1015,29 @@ static void parser__recover(Parser *self, StackVersion version, TSStateId state,
static void parser__advance(Parser *self, StackVersion version,
ReusableNode *reusable_node) {
bool validated_lookahead = false;
Tree *lookahead = parser__get_lookahead(self, version, reusable_node);
Tree *lookahead = parser__get_lookahead(self, version, reusable_node, &validated_lookahead);
for (;;) {
TSStateId state = ts_stack_top_state(self->stack, version);
TableEntry table_entry;
ts_language_table_entry(self->language, state, lookahead->first_leaf.symbol,
&table_entry);
ts_language_table_entry(self->language, state, lookahead->first_leaf.symbol, &table_entry);
if (!validated_lookahead) {
if (!parser__can_reuse(self, state, lookahead, &table_entry)) {
if (lookahead == reusable_node->tree)
parser__pop_reusable_node_leaf(reusable_node);
else
if (lookahead == reusable_node->tree) {
reusable_node_pop_leaf(reusable_node);
} else {
parser__clear_cached_token(self);
}
ts_tree_release(lookahead);
lookahead = parser__get_lookahead(self, version, reusable_node);
lookahead = parser__get_lookahead(self, version, reusable_node, &validated_lookahead);
continue;
}
validated_lookahead = true;
LOG("lookahead sym:%s, size:%u", SYM_NAME(lookahead->symbol),
lookahead->size.bytes);
LOG("reused_lookahead sym:%s, size:%u", SYM_NAME(lookahead->symbol), lookahead->size.bytes);
}
bool reduction_stopped_at_error = false;
@ -996,12 +1060,11 @@ static void parser__advance(Parser *self, StackVersion version,
}
if (lookahead->child_count > 0) {
if (parser__breakdown_lookahead(self, &lookahead, state,
reusable_node)) {
if (parser__breakdown_lookahead(self, &lookahead, state, reusable_node)) {
if (!parser__can_reuse(self, state, lookahead, &table_entry)) {
parser__pop_reusable_node(reusable_node);
reusable_node_pop(reusable_node);
ts_tree_release(lookahead);
lookahead = parser__get_lookahead(self, version, reusable_node);
lookahead = parser__get_lookahead(self, version, reusable_node, &validated_lookahead);
}
}
@ -1011,7 +1074,7 @@ static void parser__advance(Parser *self, StackVersion version,
parser__shift(self, version, next_state, lookahead, extra);
if (lookahead == reusable_node->tree)
parser__pop_reusable_node(reusable_node);
reusable_node_pop(reusable_node);
ts_tree_release(lookahead);
return;
@ -1053,7 +1116,7 @@ static void parser__advance(Parser *self, StackVersion version,
case TSParseActionTypeRecover: {
while (lookahead->child_count > 0) {
parser__breakdown_reusable_node(reusable_node);
reusable_node_breakdown(reusable_node);
ts_tree_release(lookahead);
lookahead = reusable_node->tree;
ts_tree_retain(lookahead);
@ -1061,7 +1124,7 @@ static void parser__advance(Parser *self, StackVersion version,
parser__recover(self, version, action.params.to_state, lookahead);
if (lookahead == reusable_node->tree)
parser__pop_reusable_node(reusable_node);
reusable_node_pop(reusable_node);
ts_tree_release(lookahead);
return;
}
@ -1103,6 +1166,18 @@ bool parser_init(Parser *self) {
return true;
}
void parser_set_language(Parser *self, const TSLanguage *language) {
if (self->external_scanner_payload && self->language->external_scanner.destroy)
self->language->external_scanner.destroy(self->external_scanner_payload);
if (language && language->external_scanner.create)
self->external_scanner_payload = language->external_scanner.create();
else
self->external_scanner_payload = NULL;
self->language = language;
}
void parser_destroy(Parser *self) {
if (self->stack)
ts_stack_delete(self->stack);
@ -1112,6 +1187,7 @@ void parser_destroy(Parser *self) {
array_delete(&self->tree_path1);
if (self->tree_path2.contents)
array_delete(&self->tree_path2);
parser_set_language(self, NULL);
}
Tree *parser_parse(Parser *self, TSInput input, Tree *old_tree) {
@ -1128,15 +1204,14 @@ Tree *parser_parse(Parser *self, TSInput input, Tree *old_tree) {
while (!ts_stack_is_halted(self->stack, version)) {
position = ts_stack_top_position(self->stack, version).chars;
if (position > last_position ||
(version > 0 && position == last_position))
if (position > last_position || (version > 0 && position == last_position))
break;
LOG("process version:%d, version_count:%u, state:%d, row:%u, col:%u",
version, ts_stack_version_count(self->stack),
ts_stack_top_state(self->stack, version),
ts_stack_top_position(self->stack, version).extent.row + 1,
ts_stack_top_position(self->stack, version).extent.column + 1);
ts_stack_top_position(self->stack, version).extent.row,
ts_stack_top_position(self->stack, version).extent.column);
parser__advance(self, version, &reusable_node);
LOG_STACK();

View file
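
Note: the interleaved hunks above implement a two-phase lexing strategy. A condensed sketch of the control flow in parser__lex, with logging and error bookkeeping elided (a paraphrase of the diff, not the literal code):

for (;;) {
  if (valid_external_tokens) {
    parser__restore_external_scanner(self, version);
    ts_lexer_start(&self->lexer);
    if (self->language->external_scanner.scan(self->external_scanner_payload,
                                              &self->lexer.data, valid_external_tokens)) {
      found_external_token = true;
      break;  // an external token takes precedence
    }
    ts_lexer_reset(&self->lexer, current_position);  // rewind for the internal lexer
  }
  ts_lexer_start(&self->lexer);
  if (self->language->lex_fn(&self->lexer.data, lex_mode.lex_state))
    break;  // internal token found
  // otherwise: switch to the error-state lex mode once, retry,
  // and finally skip unrecognized characters one at a time
}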

@ -8,13 +8,9 @@ extern "C" {
#include "runtime/stack.h"
#include "runtime/array.h"
#include "runtime/lexer.h"
#include "runtime/reusable_node.h"
#include "runtime/reduce_action.h"
typedef struct {
Tree *tree;
uint32_t byte_index;
} ReusableNode;
typedef struct {
Lexer lexer;
Stack *stack;
@ -29,11 +25,14 @@ typedef struct {
ReusableNode reusable_node;
TreePath tree_path1;
TreePath tree_path2;
void *external_scanner_payload;
Tree *last_external_token;
} Parser;
bool parser_init(Parser *);
void parser_destroy(Parser *);
Tree *parser_parse(Parser *, TSInput, Tree *);
void parser_set_language(Parser *, const TSLanguage *);
#ifdef __cplusplus
}

View file

@ -0,0 +1,50 @@
#include "runtime/tree.h"
typedef struct {
Tree *tree;
uint32_t byte_index;
bool has_preceding_external_token;
const TSExternalTokenState *preceding_external_token_state;
} ReusableNode;
static inline ReusableNode reusable_node_new(Tree *tree) {
return (ReusableNode){
.tree = tree,
.byte_index = 0,
.has_preceding_external_token = false,
.preceding_external_token_state = NULL,
};
}
static inline void reusable_node_pop(ReusableNode *self) {
self->byte_index += ts_tree_total_bytes(self->tree);
if (self->tree->has_external_tokens) {
self->has_preceding_external_token = true;
self->preceding_external_token_state = ts_tree_last_external_token_state(self->tree);
}
while (self->tree) {
Tree *parent = self->tree->context.parent;
uint32_t next_index = self->tree->context.index + 1;
if (parent && parent->child_count > next_index) {
self->tree = parent->children[next_index];
return;
}
self->tree = parent;
}
}
static inline void reusable_node_pop_leaf(ReusableNode *self) {
while (self->tree->child_count > 0)
self->tree = self->tree->children[0];
reusable_node_pop(self);
}
static inline bool reusable_node_breakdown(ReusableNode *self) {
if (self->tree->child_count == 0) {
return false;
} else {
self->tree = self->tree->children[0];
return true;
}
}

View file
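
Note: reusable_node_pop advances to the next subtree in document order, accumulating the byte offset and remembering any external token state it passes. A small sketch of walking a previous tree with it, assuming a populated tree pointer:

#include <stdio.h>
#include "runtime/reusable_node.h"

void print_reusable_subtrees(Tree *tree) {
  ReusableNode node = reusable_node_new(tree);
  while (node.tree) {
    printf("subtree sym:%u starts at byte %u\n",
           (unsigned)node.tree->symbol, (unsigned)node.byte_index);
    reusable_node_pop(&node);
  }
}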

@ -50,6 +50,7 @@ typedef struct {
StackNode *node;
bool is_halted;
unsigned push_count;
const TSExternalTokenState *external_token_state;
} StackHead;
struct Stack {
@ -168,11 +169,13 @@ static void stack_node_add_link(StackNode *self, StackLink link) {
}
static StackVersion ts_stack__add_version(Stack *self, StackNode *node,
unsigned push_count) {
unsigned push_count,
const TSExternalTokenState *external_token_state) {
StackHead head = {
.node = node,
.is_halted = false,
.push_count = push_count,
.external_token_state = external_token_state,
};
array_push(&self->heads, head);
stack_node_retain(node);
@ -180,7 +183,8 @@ static StackVersion ts_stack__add_version(Stack *self, StackNode *node,
}
static void ts_stack__add_slice(Stack *self, StackNode *node, TreeArray *trees,
unsigned push_count) {
unsigned push_count,
const TSExternalTokenState *external_token_state) {
for (uint32_t i = self->slices.size - 1; i + 1 > 0; i--) {
StackVersion version = self->slices.contents[i].version;
if (self->heads.contents[version].node == node) {
@ -190,7 +194,7 @@ static void ts_stack__add_slice(Stack *self, StackNode *node, TreeArray *trees,
}
}
StackVersion version = ts_stack__add_version(self, node, push_count);
StackVersion version = ts_stack__add_version(self, node, push_count, external_token_state);
StackSlice slice = { *trees, version };
array_push(&self->slices, slice);
}
@ -202,6 +206,7 @@ INLINE StackPopResult stack__iter(Stack *self, StackVersion version,
StackHead *head = array_get(&self->heads, version);
unsigned push_count = head->push_count;
const TSExternalTokenState *external_token_state = head->external_token_state;
Iterator iterator = {
.node = head->node,
.trees = array_new(),
@ -229,7 +234,8 @@ INLINE StackPopResult stack__iter(Stack *self, StackVersion version,
if (!should_stop)
ts_tree_array_copy(trees, &trees);
array_reverse(&trees);
ts_stack__add_slice(self, node, &trees, push_count + iterator->push_count);
ts_stack__add_slice(self, node, &trees, push_count + iterator->push_count,
external_token_state);
}
if (should_stop) {
@ -288,7 +294,12 @@ Stack *ts_stack_new() {
self->base_node =
stack_node_new(NULL, NULL, false, 1, length_zero(), &self->node_pool);
stack_node_retain(self->base_node);
array_push(&self->heads, ((StackHead){ self->base_node, false, 0 }));
array_push(&self->heads, ((StackHead){
self->base_node,
false,
0,
NULL
}));
return self;
}
@ -327,11 +338,19 @@ unsigned ts_stack_push_count(const Stack *self, StackVersion version) {
return array_get(&self->heads, version)->push_count;
}
void ts_stack_decrease_push_count(const Stack *self, StackVersion version,
void ts_stack_decrease_push_count(Stack *self, StackVersion version,
unsigned decrement) {
array_get(&self->heads, version)->push_count -= decrement;
}
const TSExternalTokenState *ts_stack_external_token_state(const Stack *self, StackVersion version) {
return array_get(&self->heads, version)->external_token_state;
}
void ts_stack_set_external_token_state(Stack *self, StackVersion version, const TSExternalTokenState *state) {
array_get(&self->heads, version)->external_token_state = state;
}
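A sketch of how these two accessors pair with the external_scanner callbacks in the TSLanguage struct; the real call sites live in the parser, outside this hunk, and every local name below (scanner_payload, token, and so on) is a stand-in:

static void lex_external(Stack *stack, StackVersion version,
                         const TSLanguage *language, void *scanner_payload,
                         TSLexer *lexer, const bool *valid_symbols,
                         Tree *token) {
  // Restore the scanner to the state this stack version last recorded.
  const TSExternalTokenState *state =
    ts_stack_external_token_state(stack, version);
  if (state)
    language->external_scanner.deserialize(scanner_payload, *state);

  // After a successful scan, persist the new state on the token and
  // remember it on this stack head for the next lex.
  if (language->external_scanner.scan(scanner_payload, lexer, valid_symbols)) {
    language->external_scanner.serialize(scanner_payload,
                                         token->external_token_state);
    ts_stack_set_external_token_state(stack, version,
                                      &token->external_token_state);
  }
}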
ErrorStatus ts_stack_error_status(const Stack *self, StackVersion version) {
StackHead *head = array_get(&self->heads, version);
return (ErrorStatus){
@ -480,7 +499,8 @@ bool ts_stack_merge(Stack *self, StackVersion version, StackVersion new_version)
if (new_node->state == node->state &&
new_node->position.chars == node->position.chars &&
new_node->error_count == node->error_count &&
new_node->error_cost == node->error_cost) {
new_node->error_cost == node->error_cost &&
new_head->external_token_state == head->external_token_state) {
for (uint32_t j = 0; j < new_node->link_count; j++)
stack_node_add_link(node, new_node->links[j]);
if (new_head->push_count > head->push_count)
@ -505,7 +525,12 @@ void ts_stack_clear(Stack *self) {
for (uint32_t i = 0; i < self->heads.size; i++)
stack_node_release(self->heads.contents[i].node, &self->node_pool);
array_clear(&self->heads);
array_push(&self->heads, ((StackHead){ self->base_node, false, 0 }));
array_push(&self->heads, ((StackHead){
self->base_node,
false,
0,
NULL
}));
}
bool ts_stack_print_dot_graph(Stack *self, const char **symbol_names, FILE *f) {
@ -528,8 +553,20 @@ bool ts_stack_print_dot_graph(Stack *self, const char **symbol_names, FILE *f) {
fprintf(
f,
"node_head_%u -> node_%p [label=%u, fontcolor=blue, weight=10000, "
"labeltooltip=\"push_count: %u\"]\n",
"labeltooltip=\"push_count: %u",
i, head->node, i, head->push_count);
if (head->external_token_state) {
const TSExternalTokenState *s = head->external_token_state;
fprintf(f,
"\nexternal_token_state: "
"%2X %2X %2X %2X %2X %2X %2X %2X %2X %2X %2X %2X %2X %2X %2X %2X",
(*s)[0], (*s)[1], (*s)[2], (*s)[3], (*s)[4], (*s)[5], (*s)[6], (*s)[7],
(*s)[8], (*s)[9], (*s)[10], (*s)[11], (*s)[12], (*s)[13], (*s)[14], (*s)[15]
);
}
fprintf(f, "\"]\n");
array_push(&self->iterators, ((Iterator){.node = head->node }));
}
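The %2X conversion above renders each of the sixteen state bytes right-aligned in a two-character field, so single-digit values come out space-padded. A standalone demo of that dump format (only the typedef is borrowed from the runtime headers):

#include <stdint.h>
#include <stdio.h>

typedef uint8_t TSExternalTokenState[16];

int main(void) {
  TSExternalTokenState s = { 0xAB, 0x1 };  // remaining bytes are zero
  for (int i = 0; i < 16; i++)
    printf("%2X ", s[i]);                  // => "AB  1  0  0 ..."
  putchar('\n');
  return 0;
}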

View file

@ -65,7 +65,11 @@ TSStateId ts_stack_top_state(const Stack *, StackVersion);
unsigned ts_stack_push_count(const Stack *, StackVersion);
void ts_stack_decrease_push_count(const Stack *, StackVersion, unsigned);
void ts_stack_decrease_push_count(Stack *, StackVersion, unsigned);
const TSExternalTokenState *ts_stack_external_token_state(const Stack *, StackVersion);
void ts_stack_set_external_token_state(Stack *, StackVersion, const TSExternalTokenState *);
/*
* Get the position at the top of the given version of the stack. If the stack

View file

@ -25,10 +25,7 @@ Tree *ts_tree_make_leaf(TSSymbol sym, Length padding, Length size,
.visible = metadata.visible,
.named = metadata.named,
.has_changes = false,
.first_leaf = {
.symbol = sym,
.lex_state = 0
}
.first_leaf.symbol = sym,
};
return result;
}
@ -111,6 +108,8 @@ void ts_tree_set_children(Tree *self, uint32_t child_count, Tree **children) {
self->named_child_count = 0;
self->visible_child_count = 0;
self->error_cost = 0;
self->has_external_tokens = false;
self->has_external_token_state = false;
for (uint32_t i = 0; i < child_count; i++) {
Tree *child = children[i];
@ -128,11 +127,14 @@ void ts_tree_set_children(Tree *self, uint32_t child_count, Tree **children) {
self->visible_child_count++;
if (child->named)
self->named_child_count++;
} else {
} else if (child->child_count > 0) {
self->visible_child_count += child->visible_child_count;
self->named_child_count += child->named_child_count;
}
if (child->has_external_tokens) self->has_external_tokens = true;
if (child->has_external_token_state) self->has_external_token_state = true;
if (child->symbol == ts_builtin_sym_error) {
self->fragile_left = self->fragile_right = true;
self->parse_state = TS_TREE_STATE_NONE;
@ -377,6 +379,21 @@ void ts_tree_edit(Tree *self, const TSInputEdit *edit) {
}
}
const TSExternalTokenState *ts_tree_last_external_token_state(const Tree *tree) {
while (tree->child_count > 0) {
for (uint32_t i = tree->child_count - 1; i + 1 > 0; i--) {
Tree *child = tree->children[i];
if (child->has_external_token_state) {
tree = child;
break;
} else if (child->has_external_tokens) {
return NULL;
}
}
}
return &tree->external_token_state;
}
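This search descends, level by level, into the rightmost child whose subtree carries serialized scanner state, and returns NULL if it first meets a subtree that contains external tokens but no recoverable state. Termination rests on an invariant implied by ts_tree_set_children above: a node has has_external_token_state set only if one of its children does, so the inner loop always either descends or returns. A hypothetical shape, with * marking has_external_token_state:

//        root*
//       /     \
//   stmt      comment*    <- rightmost *-subtree: descend here
//               |
//           raw_text*     <- leaf: return &tree->external_token_state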
static size_t ts_tree__write_char_to_string(char *s, size_t n, int32_t c) {
if (c == 0)
return snprintf(s, n, "EOF");

View file

@ -22,10 +22,13 @@ typedef struct Tree {
} context;
uint32_t child_count;
uint32_t visible_child_count;
uint32_t named_child_count;
union {
struct Tree **children;
struct {
uint32_t visible_child_count;
uint32_t named_child_count;
struct Tree **children;
};
TSExternalTokenState external_token_state;
int32_t lookahead_char;
};
@ -38,7 +41,7 @@ typedef struct Tree {
struct {
TSSymbol symbol;
TSStateId lex_state;
TSLexMode lex_mode;
} first_leaf;
unsigned short ref_count;
@ -48,6 +51,8 @@ typedef struct Tree {
bool fragile_left : 1;
bool fragile_right : 1;
bool has_changes : 1;
bool has_external_tokens : 1;
bool has_external_token_state : 1;
} Tree;
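The new union lets leaves, which never use the child-bookkeeping fields, hold the 16-byte scanner state (or the lookahead character) in the same storage. This is presumably also why ts_tree_set_children above stopped reading visible_child_count from childless children: on a leaf, those bytes may now contain scanner state. A miniature of the aliasing hazard (hypothetical struct using C11 anonymous members, not the real Tree):

#include <stdint.h>
#include <stdio.h>

typedef struct Node {
  uint32_t child_count;
  union {
    struct {                             // interior nodes only
      uint32_t visible_child_count;
      struct Node **children;
    };
    uint8_t external_token_state[16];    // leaves only
  };
} Node;

int main(void) {
  Node leaf = { .child_count = 0 };
  leaf.external_token_state[0] = 0xFF;
  // Reads the very bytes the scanner state was written into:
  printf("aliased count: %u\n", leaf.visible_child_count);
  return 0;
}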
typedef struct {
@ -81,6 +86,7 @@ void ts_tree_assign_parents(Tree *, TreePath *);
void ts_tree_edit(Tree *, const TSInputEdit *edit);
char *ts_tree_string(const Tree *, const TSLanguage *, bool include_all);
void ts_tree_print_dot_graph(const Tree *, const TSLanguage *, FILE *);
const TSExternalTokenState *ts_tree_last_external_token_state(const Tree *);
static inline uint32_t ts_tree_total_bytes(const Tree *self) {
return self->padding.bytes + self->size.bytes;

View file

@ -21,20 +21,20 @@ static void range_array_add(RangeArray *results, TSPoint start, TSPoint end) {
}
}
static bool tree_path_descend(TreePath *path, TSPoint position) {
static bool tree_path_descend(TreePath *path, Length position) {
uint32_t original_size = path->size;
bool did_descend;
do {
did_descend = false;
TreePathEntry entry = *array_back(path);
Length child_position = entry.position;
Length child_left = entry.position;
for (uint32_t i = 0; i < entry.tree->child_count; i++) {
Tree *child = entry.tree->children[i];
Length child_right_position =
length_add(child_position, ts_tree_total_size(child));
if (point_lt(position, child_right_position.extent)) {
TreePathEntry child_entry = { child, child_position, i };
if (child->visible) {
Length child_right = length_add(child_left, ts_tree_total_size(child));
if (position.bytes < child_right.bytes) {
TreePathEntry child_entry = { child, child_left, i };
if (child->visible || child->child_count == 0) {
array_push(path, child_entry);
return true;
} else if (child->visible_child_count > 0) {
@ -43,39 +43,44 @@ static bool tree_path_descend(TreePath *path, TSPoint position) {
break;
}
}
child_position = child_right_position;
child_left = child_right;
}
} while (did_descend);
path->size = original_size;
return false;
}
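Two behavioral changes land together in this function: the position test switches from row/column points to plain byte offsets, and the descent now also stops at invisible leaves (child->child_count == 0), which keeps tokens, external ones included, reachable as path endpoints even when their symbols are not visible.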
static uint32_t tree_path_advance(TreePath *path) {
uint32_t ascend_count = 0;
while (path->size > 0) {
TreePathEntry entry = array_pop(path);
if (path->size == 0)
break;
if (path->size == 0) break;
TreePathEntry parent_entry = *array_back(path);
if (parent_entry.tree->visible) ascend_count++;
Length position =
length_add(entry.position, ts_tree_total_size(entry.tree));
Length position = length_add(entry.position, ts_tree_total_size(entry.tree));
for (uint32_t i = entry.child_index + 1; i < parent_entry.tree->child_count; i++) {
Tree *next_child = parent_entry.tree->children[i];
if (next_child->visible || next_child->visible_child_count > 0) {
if (next_child->visible ||
next_child->child_count == 0 ||
next_child->visible_child_count > 0) {
if (parent_entry.tree->visible) ascend_count--;
array_push(path, ((TreePathEntry){
.tree = next_child,
.child_index = i,
.position = position,
}));
if (!next_child->visible)
tree_path_descend(path, (TSPoint){ 0, 0 });
if (!next_child->visible) {
tree_path_descend(path, length_zero());
}
return ascend_count;
}
position = length_add(position, ts_tree_total_size(next_child));
}
}
return ascend_count;
}
@ -94,8 +99,27 @@ static void tree_path_init(TreePath *path, Tree *tree) {
.position = { 0, 0, { 0, 0 } },
.child_index = 0,
}));
if (!tree->visible)
tree_path_descend(path, (TSPoint){ 0, 0 });
if (!tree->visible) {
tree_path_descend(path, length_zero());
}
}
Tree *tree_path_visible_tree(TreePath *self) {
for (uint32_t i = self->size - 1; i + 1 > 0; i--) {
Tree *tree = self->contents[i].tree;
if (tree->visible) return tree;
}
return NULL;
}
Length tree_path_start_position(TreePath *self) {
TreePathEntry entry = *array_back(self);
return length_add(entry.position, entry.tree->padding);
}
Length tree_path_end_position(TreePath *self) {
TreePathEntry entry = *array_back(self);
return length_add(length_add(entry.position, entry.tree->padding), entry.tree->size);
}
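A quick numeric check of these two helpers with made-up lengths: if the entry's subtree begins at byte 10, carries 2 bytes of padding (leading whitespace or comments), and has 5 bytes of content, then tree_path_start_position returns byte 12 (10 + 2) and tree_path_end_position returns byte 17 (12 + 5).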
static bool tree_must_eq(Tree *old_tree, Tree *new_tree) {
@ -112,67 +136,59 @@ static bool tree_must_eq(Tree *old_tree, Tree *new_tree) {
static void tree_path_get_changes(TreePath *old_path, TreePath *new_path,
TSRange **ranges, uint32_t *range_count) {
TSPoint position = { 0, 0 };
Length position = length_zero();
RangeArray results = array_new();
while (old_path->size && new_path->size) {
bool is_changed = false;
TSPoint next_position = position;
Length next_position = position;
TreePathEntry old_entry = *array_back(old_path);
TreePathEntry new_entry = *array_back(new_path);
Tree *old_tree = old_entry.tree;
Tree *new_tree = new_entry.tree;
uint32_t old_start_byte = old_entry.position.bytes + old_tree->padding.bytes;
uint32_t new_start_byte = new_entry.position.bytes + new_tree->padding.bytes;
TSPoint old_start_point =
point_add(old_entry.position.extent, old_tree->padding.extent);
TSPoint new_start_point =
point_add(new_entry.position.extent, new_tree->padding.extent);
TSPoint old_end_point = point_add(old_start_point, old_tree->size.extent);
TSPoint new_end_point = point_add(new_start_point, new_tree->size.extent);
Tree *old_tree = tree_path_visible_tree(old_path);
Tree *new_tree = tree_path_visible_tree(new_path);
Length old_start = tree_path_start_position(old_path);
Length new_start = tree_path_start_position(new_path);
Length old_end = tree_path_end_position(old_path);
Length new_end = tree_path_end_position(new_path);
// #define NAME(t) (ts_language_symbol_name(language, ((Tree *)(t))->symbol))
// printf("At [%-2lu, %-2lu] Compare (%-20s\t [%-2lu, %-2lu] - [%lu, %lu])\tvs\t(%-20s\t [%lu, %lu] - [%lu, %lu])\n",
// position.row, position.column, NAME(old_tree), old_start_point.row,
// old_start_point.column, old_end_point.row, old_end_point.column,
// NAME(new_tree), new_start_point.row, new_start_point.column,
// new_end_point.row, new_end_point.column);
// printf("At [%-2u, %-2u] Compare (%-20s\t [%-2u, %-2u] - [%u, %u])\tvs\t(%-20s\t [%u, %u] - [%u, %u])\n",
// position.extent.row, position.extent.column,
// NAME(old_tree), old_start.extent.row, old_start.extent.column, old_end.extent.row, old_end.extent.column,
// NAME(new_tree), new_start.extent.row, new_start.extent.column, new_end.extent.row, new_end.extent.column);
if (point_lt(position, old_start_point)) {
if (point_lt(position, new_start_point)) {
next_position = point_min(old_start_point, new_start_point);
if (position.bytes < old_start.bytes) {
if (position.bytes < new_start.bytes) {
next_position = length_min(old_start, new_start);
} else {
is_changed = true;
next_position = old_start_point;
next_position = old_start;
}
} else if (point_lt(position, new_start_point)) {
} else if (position.bytes < new_start.bytes) {
is_changed = true;
next_position = new_start_point;
} else if (old_start_byte == new_start_byte &&
tree_must_eq(old_tree, new_tree)) {
next_position = old_end_point;
next_position = new_start;
} else if (old_start.bytes == new_start.bytes && tree_must_eq(old_tree, new_tree)) {
next_position = old_end;
} else if (old_tree->symbol == new_tree->symbol) {
if (tree_path_descend(old_path, position)) {
if (!tree_path_descend(new_path, position)) {
tree_path_ascend(old_path, 1);
is_changed = true;
next_position = new_end_point;
next_position = new_end;
}
} else if (tree_path_descend(new_path, position)) {
tree_path_ascend(new_path, 1);
is_changed = true;
next_position = old_end_point;
next_position = old_end;
} else {
next_position = point_min(old_end_point, new_end_point);
next_position = length_min(old_end, new_end);
}
} else {
is_changed = true;
next_position = point_min(old_end_point, new_end_point);
next_position = length_min(old_end, new_end);
}
bool at_old_end = point_lte(old_end_point, next_position);
bool at_new_end = point_lte(new_end_point, next_position);
bool at_old_end = old_end.bytes <= next_position.bytes;
bool at_new_end = new_end.bytes <= next_position.bytes;
if (at_new_end && at_old_end) {
uint32_t old_ascend_count = tree_path_advance(old_path);
@ -190,7 +206,7 @@ static void tree_path_get_changes(TreePath *old_path, TreePath *new_path,
tree_path_ascend(new_path, ascend_count);
}
if (is_changed) range_array_add(&results, position, next_position);
if (is_changed) range_array_add(&results, position.extent, next_position.extent);
position = next_position;
}
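An outline of one iteration of the rewritten loop above, now driven by byte offsets rather than points (commentary only, not a drop-in):

// old_tree / new_tree: the visible subtrees under each cursor
// if the cursor sits before old_tree starts:
//   if it also sits before new_tree starts, the gap up to the earlier
//   start is unchanged; otherwise the gap up to old's start is a change
// else if it sits before new_tree starts: change up to new's start
// else if both start here and tree_must_eq proves them identical:
//   skip to old's end with nothing recorded
// else if the symbols match: try to descend both sides; if only one side
//   descends, back out and mark a change up to the end of the side that
//   could not descend; if neither descends, advance to
//   min(old_end, new_end) unchanged
// else: mark a change up to min(old_end, new_end)
// when is_changed, append [position, next_position) to the results, then
// advance whichever cursor(s) finished their subtree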

todo.md
View file

@ -1,32 +0,0 @@
TODO
====
### Handling ambiguity (GLR)
* Add a simple way to specify syntactic ambiguity resolutions in the Grammar (e.g. 'prefer declarations to statements' in C), similar to bison's `dprec` construct.
### Runtime System
* Refactoring: use a separate symbol for unexpected characters, distinct from the symbol used for interior error nodes.
### Testing / Quality
* Start running the clang-analyzer on the codebase on Travis-CI.
* Use the Valgrind leak checker to fix the memory leaks in the runtime library.
* Randomize the editing in the language tests, using a seed that can be specified in order to reproduce failures.
### Ubiquitous token handling
* Fix the unintuitive tree that results when ubiquitous tokens are the last child of their parent node.
### Error handling
* Use information about the nesting depth of tokens like '(' and ')' to make error recovery more accurate.
### Grammar Features
* Regexp assertions
- [ ] '^'
- [ ] '$'
- [ ] '\b'
* Composing languages
- [ ] Rule for referencing named grammar
- [ ] Grammar registry object in runtime
- [ ] Parsing returns control to parent language
* Indentation tokens