From c966af041235e42d207e4150cc1ba8cb2ec85c78 Mon Sep 17 00:00:00 2001
From: Max Brunsfeld <maxbrunsfeld@gmail.com>
Date: Wed, 30 Nov 2016 09:34:47 -0800
Subject: [PATCH] Start work on external tokens

---
 include/tree_sitter/parser.h                  |  45 ++--
 project.gyp                                   |   1 +
 .../build_tables/distinctive_tokens_spec.cc   |   2 +-
 .../build_tables/lex_conflict_manager_spec.cc |   8 +-
 spec/compiler/build_tables/lex_item_spec.cc   |  92 ++++-----
 .../parse_item_set_builder_spec.cc            |  62 +++---
 .../prepare_grammar/extract_tokens_spec.cc    |   8 +-
 .../prepare_grammar/flatten_grammar_spec.cc   |  28 +--
 spec/compiler/rules/repeat_spec.cc            |   2 +-
 .../external_scanners/external_scan.c         |  13 ++
 spec/helpers/load_language.cc                 |   9 +-
 spec/helpers/load_language.h                  |   3 +-
 spec/helpers/rule_helpers.cc                  |   5 +-
 spec/helpers/stream_methods.cc                |  11 +-
 spec/integration/compile_grammar_spec.cc      |  65 ++++++
 spec/integration/corpus_specs.cc              |   6 +-
 src/compiler/build_tables/build_lex_table.cc  |  66 +++---
 .../build_tables/build_parse_table.cc         |  80 ++++----
 src/compiler/build_tables/lookahead_set.cc    |  12 +-
 src/compiler/build_tables/lookahead_set.h     |   8 +-
 src/compiler/build_tables/parse_item.cc       |  34 +--
 src/compiler/build_tables/parse_item.h        |  26 ---
 .../build_tables/parse_item_set_builder.cc    |  22 +-
 src/compiler/build_tables/recovery_tokens.cc  |   6 +-
 src/compiler/build_tables/recovery_tokens.h   |   2 +-
 src/compiler/generate_code/c_code.cc          | 194 +++++++++++++-----
 src/compiler/grammar.h                        |   1 +
 src/compiler/parse_grammar.cc                 |  31 ++-
 src/compiler/parse_table.cc                   |  54 ++---
 src/compiler/parse_table.h                    |   9 +-
 .../prepare_grammar/expand_repeats.cc         |   3 +-
 .../prepare_grammar/extract_tokens.cc         |  31 +--
 .../prepare_grammar/flatten_grammar.cc        |   1 +
 .../prepare_grammar/initial_syntax_grammar.h  |   6 +-
 .../prepare_grammar/intern_symbols.cc         |  31 ++-
 .../prepare_grammar/interned_grammar.h        |   1 +
 src/compiler/rules.h                          |   1 +
 src/compiler/rules/built_in_symbols.cc        |   6 +-
 src/compiler/rules/external_token.cc          |  39 ++++
 src/compiler/rules/external_token.h           |  27 +++
 src/compiler/rules/rules.cc                   |   5 +
 src/compiler/rules/symbol.cc                  |  34 ++-
 src/compiler/rules/symbol.h                   |  13 +-
 src/compiler/rules/visitor.h                  |  16 ++
 src/compiler/syntax_grammar.cc                |  11 -
 src/compiler/syntax_grammar.h                 |   3 +-
 src/runtime/parser.c                          |   7 +-
 47 files changed, 723 insertions(+), 417 deletions(-)
 create mode 100644 spec/fixtures/external_scanners/external_scan.c
 create mode 100644 src/compiler/rules/external_token.cc
 create mode 100644 src/compiler/rules/external_token.h

diff --git a/include/tree_sitter/parser.h b/include/tree_sitter/parser.h
index 3a5bab9a..a335dd6d 100644
--- a/include/tree_sitter/parser.h
+++ b/include/tree_sitter/parser.h
@@ -48,6 +48,11 @@ typedef struct {
   bool fragile : 1;
 } TSParseAction;
 
+typedef struct {
+  uint16_t lex_state;
+  uint16_t external_tokens;
+} TSLexMode;
+
 typedef union {
   TSParseAction action;
   struct {
@@ -64,8 +69,15 @@ typedef struct TSLanguage {
   const TSSymbolMetadata *symbol_metadata;
   const unsigned short *parse_table;
   const TSParseActionEntry *parse_actions;
-  const TSStateId *lex_states;
+  const TSLexMode *lex_modes;
   bool (*lex_fn)(TSLexer *, TSStateId);
+  const TSSymbol *external_token_symbol_map;
+  const bool *external_token_lists;
+  struct {
+    void * (*create)();
+    bool (*scan)(TSLexer *, const bool *symbol_whitelist);
+    void (*destroy)(void *);
+  } external_scanner;
 } TSLanguage;
 
 /*
@@ -146,21 +158,22 @@ typedef struct TSLanguage {
     { .type = TSParseActionTypeAccept } \
   }
 
-#define EXPORT_LANGUAGE(language_name)                     \
-  static TSLanguage language = {                           \
-    .symbol_count = SYMBOL_COUNT,                          \
-    .token_count = TOKEN_COUNT,                            \
-    .symbol_metadata = ts_symbol_metadata,                 \
-    .parse_table = (const unsigned short *)ts_parse_table, \
-    .parse_actions = ts_parse_actions,                     \
-    .lex_states = ts_lex_states,                           \
-    .symbol_names = ts_symbol_names,                       \
-    .lex_fn = ts_lex,                                      \
-  };                                                       \
-                                                           \
-  const TSLanguage *language_name() {                      \
-    return &language;                                      \
-  }
+
+#define GET_LANGUAGE(...)                                          \
+  static TSLanguage language = {                                   \
+    .symbol_count = SYMBOL_COUNT,                                  \
+    .token_count = TOKEN_COUNT,                                    \
+    .symbol_metadata = ts_symbol_metadata,                         \
+    .parse_table = (const unsigned short *)ts_parse_table,         \
+    .parse_actions = ts_parse_actions,                             \
+    .lex_modes = ts_lex_modes,                                     \
+    .symbol_names = ts_symbol_names,                               \
+    .lex_fn = ts_lex,                                              \
+    .external_token_lists = (const bool *)ts_external_token_lists, \
+    .external_token_symbol_map = ts_external_token_symbol_map,     \
+    .external_scanner = {__VA_ARGS__}                              \
+  };                                                               \
+  return &language                                                 \
 
 #ifdef __cplusplus
 }
diff --git a/project.gyp b/project.gyp
index 081a3a88..29b69787 100644
--- a/project.gyp
+++ b/project.gyp
@@ -47,6 +47,7 @@
         'src/compiler/rules/character_range.cc',
         'src/compiler/rules/character_set.cc',
         'src/compiler/rules/choice.cc',
+        'src/compiler/rules/external_token.cc',
         'src/compiler/rules/metadata.cc',
         'src/compiler/rules/named_symbol.cc',
         'src/compiler/rules/pattern.cc',
diff --git a/spec/compiler/build_tables/distinctive_tokens_spec.cc b/spec/compiler/build_tables/distinctive_tokens_spec.cc
index 104cd721..f01d76cb 100644
--- a/spec/compiler/build_tables/distinctive_tokens_spec.cc
+++ b/spec/compiler/build_tables/distinctive_tokens_spec.cc
@@ -27,7 +27,7 @@ describe("recovery_tokens(rule)", []() {
       })),
     };
 
-    AssertThat(recovery_tokens(grammar), Equals<set<Symbol::Index>>({ 1 }));
+    AssertThat(recovery_tokens(grammar), Equals<set<Symbol>>({ Symbol(1, Symbol::Terminal) }));
   });
 });
 
diff --git a/spec/compiler/build_tables/lex_conflict_manager_spec.cc b/spec/compiler/build_tables/lex_conflict_manager_spec.cc
index 7f43e175..3aa75a4c 100644
--- a/spec/compiler/build_tables/lex_conflict_manager_spec.cc
+++ b/spec/compiler/build_tables/lex_conflict_manager_spec.cc
@@ -14,10 +14,10 @@ START_TEST
 describe("LexConflictManager::resolve(new_action, old_action)", []() {
   LexConflictManager conflict_manager;
   bool update;
-  Symbol sym1(0, true);
-  Symbol sym2(1, true);
-  Symbol sym3(2, true);
-  Symbol sym4(3, true);
+  Symbol sym1(0, Symbol::Terminal);
+  Symbol sym2(1, Symbol::Terminal);
+  Symbol sym3(2, Symbol::Terminal);
+  Symbol sym4(3, Symbol::Terminal);
   LexItemSet item_set({ LexItem(sym4, blank() )});
 
   it("favors advance actions over empty accept token actions", [&]() {
diff --git a/spec/compiler/build_tables/lex_item_spec.cc b/spec/compiler/build_tables/lex_item_spec.cc
index 94997956..7042922f 100644
--- a/spec/compiler/build_tables/lex_item_spec.cc
+++ b/spec/compiler/build_tables/lex_item_spec.cc
@@ -14,7 +14,7 @@ START_TEST
 describe("LexItem", []() {
   describe("completion_status()", [&]() {
     it("indicates whether the item is done, its precedence, and whether it is a string", [&]() {
-      LexItem item1(Symbol(0, true), character({ 'a', 'b', 'c' }));
+      LexItem item1(Symbol(0, Symbol::Terminal), character({ 'a', 'b', 'c' }));
       AssertThat(item1.completion_status().is_done, IsFalse());
       AssertThat(item1.completion_status().precedence, Equals(PrecedenceRange()));
       AssertThat(item1.completion_status().is_string, IsFalse());
@@ -23,7 +23,7 @@ describe("LexItem", []() {
       params.precedence = 3;
       params.has_precedence = true;
       params.is_string = 1;
-      LexItem item2(Symbol(0, true), choice({
+      LexItem item2(Symbol(0, Symbol::Terminal), choice({
         metadata(blank(), params),
         character({ 'a', 'b', 'c' })
       }));
@@ -32,7 +32,7 @@ describe("LexItem", []() {
       AssertThat(item2.completion_status().precedence, Equals(PrecedenceRange(3)));
       AssertThat(item2.completion_status().is_string, IsTrue());
 
-      LexItem item3(Symbol(0, true), repeat(character({ ' ', '\t' })));
+      LexItem item3(Symbol(0, Symbol::Terminal), repeat(character({ ' ', '\t' })));
       AssertThat(item3.completion_status().is_done, IsTrue());
       AssertThat(item3.completion_status().precedence, Equals(PrecedenceRange()));
       AssertThat(item3.completion_status().is_string, IsFalse());
@@ -43,7 +43,7 @@ describe("LexItem", []() {
 describe("LexItemSet::transitions()", [&]() {
   it("handles single characters", [&]() {
     LexItemSet item_set({
-      LexItem(Symbol(1), character({ 'x' })),
+      LexItem(Symbol(1, Symbol::NonTerminal), character({ 'x' })),
     });
 
     AssertThat(
@@ -53,7 +53,7 @@ describe("LexItemSet::transitions()", [&]() {
           CharacterSet().include('x'),
           Transition{
             LexItemSet({
-              LexItem(Symbol(1), blank()),
+              LexItem(Symbol(1, Symbol::NonTerminal), blank()),
             }),
             PrecedenceRange(),
             false
@@ -67,7 +67,7 @@ describe("LexItemSet::transitions()", [&]() {
     params.is_main_token = true;
 
     LexItemSet item_set({
-      LexItem(Symbol(1), metadata(character({ 'x' }), params)),
+      LexItem(Symbol(1, Symbol::NonTerminal), metadata(character({ 'x' }), params)),
     });
 
     AssertThat(
@@ -77,7 +77,7 @@ describe("LexItemSet::transitions()", [&]() {
           CharacterSet().include('x'),
           Transition{
             LexItemSet({
-              LexItem(Symbol(1), metadata(blank(), params)),
+              LexItem(Symbol(1, Symbol::NonTerminal), metadata(blank(), params)),
             }),
             PrecedenceRange(),
             true
@@ -88,7 +88,7 @@ describe("LexItemSet::transitions()", [&]() {
 
   it("handles sequences", [&]() {
     LexItemSet item_set({
-      LexItem(Symbol(1), seq({
+      LexItem(Symbol(1, Symbol::NonTerminal), seq({
         character({ 'w' }),
         character({ 'x' }),
         character({ 'y' }),
@@ -103,7 +103,7 @@ describe("LexItemSet::transitions()", [&]() {
           CharacterSet().include('w'),
           Transition{
             LexItemSet({
-              LexItem(Symbol(1), seq({
+              LexItem(Symbol(1, Symbol::NonTerminal), seq({
                 character({ 'x' }),
                 character({ 'y' }),
                 character({ 'z' }),
@@ -118,7 +118,7 @@ describe("LexItemSet::transitions()", [&]() {
 
   it("handles sequences with nested precedence", [&]() {
     LexItemSet item_set({
-      LexItem(Symbol(1), seq({
+      LexItem(Symbol(1, Symbol::NonTerminal), seq({
         prec(3, seq({
           character({ 'v' }),
           prec(4, seq({
@@ -140,7 +140,7 @@ describe("LexItemSet::transitions()", [&]() {
             // The outer precedence is now 'active', because we are within its
             // contained rule.
             LexItemSet({
-              LexItem(Symbol(1), seq({
+              LexItem(Symbol(1, Symbol::NonTerminal), seq({
                 active_prec(3, seq({
                   prec(4, seq({
                     character({ 'w' }),
@@ -168,7 +168,7 @@ describe("LexItemSet::transitions()", [&]() {
           Transition{
             // The inner precedence is now 'active'
             LexItemSet({
-              LexItem(Symbol(1), seq({
+              LexItem(Symbol(1, Symbol::NonTerminal), seq({
                 active_prec(3, seq({
                   active_prec(4, character({ 'x' })),
                   character({ 'y' }) })),
@@ -193,7 +193,7 @@ describe("LexItemSet::transitions()", [&]() {
           CharacterSet().include('x'),
           Transition{
             LexItemSet({
-              LexItem(Symbol(1), seq({
+              LexItem(Symbol(1, Symbol::NonTerminal), seq({
                 active_prec(3, character({ 'y' })),
                 character({ 'z' }),
               })),
@@ -216,7 +216,7 @@ describe("LexItemSet::transitions()", [&]() {
           CharacterSet().include('y'),
           Transition{
             LexItemSet({
-              LexItem(Symbol(1), character({ 'z' })),
+              LexItem(Symbol(1, Symbol::NonTerminal), character({ 'z' })),
             }),
             PrecedenceRange(3),
             false
@@ -227,7 +227,7 @@ describe("LexItemSet::transitions()", [&]() {
 
   it("handles sequences where the left hand side can be blank", [&]() {
     LexItemSet item_set({
-      LexItem(Symbol(1), seq({
+      LexItem(Symbol(1, Symbol::NonTerminal), seq({
         choice({
           character({ 'x' }),
           blank(),
@@ -244,7 +244,7 @@ describe("LexItemSet::transitions()", [&]() {
           CharacterSet().include('x'),
           Transition{
             LexItemSet({
-              LexItem(Symbol(1), seq({
+              LexItem(Symbol(1, Symbol::NonTerminal), seq({
                 character({ 'y' }),
                 character({ 'z' }),
               })),
@@ -257,7 +257,7 @@ describe("LexItemSet::transitions()", [&]() {
           CharacterSet().include('y'),
           Transition{
             LexItemSet({
-              LexItem(Symbol(1), character({ 'z' })),
+              LexItem(Symbol(1, Symbol::NonTerminal), character({ 'z' })),
             }),
             PrecedenceRange(),
             false
@@ -268,7 +268,7 @@ describe("LexItemSet::transitions()", [&]() {
 
   it("handles blanks", [&]() {
     LexItemSet item_set({
-      LexItem(Symbol(1), blank()),
+      LexItem(Symbol(1, Symbol::NonTerminal), blank()),
     });
 
     AssertThat(item_set.transitions(), IsEmpty());
@@ -276,11 +276,11 @@ describe("LexItemSet::transitions()", [&]() {
 
   it("handles repeats", [&]() {
     LexItemSet item_set({
-      LexItem(Symbol(1), repeat1(seq({
+      LexItem(Symbol(1, Symbol::NonTerminal), repeat1(seq({
         character({ 'a' }),
         character({ 'b' }),
       }))),
-      LexItem(Symbol(2), repeat1(character({ 'c' }))),
+      LexItem(Symbol(2, Symbol::NonTerminal), repeat1(character({ 'c' }))),
     });
 
     AssertThat(
@@ -290,14 +290,14 @@ describe("LexItemSet::transitions()", [&]() {
           CharacterSet().include('a'),
           Transition{
             LexItemSet({
-              LexItem(Symbol(1), seq({
+              LexItem(Symbol(1, Symbol::NonTerminal), seq({
                 character({ 'b' }),
                 repeat1(seq({
                   character({ 'a' }),
                   character({ 'b' }),
                 }))
               })),
-              LexItem(Symbol(1), character({ 'b' })),
+              LexItem(Symbol(1, Symbol::NonTerminal), character({ 'b' })),
             }),
             PrecedenceRange(),
             false
@@ -307,8 +307,8 @@ describe("LexItemSet::transitions()", [&]() {
           CharacterSet().include('c'),
           Transition{
             LexItemSet({
-              LexItem(Symbol(2), repeat1(character({ 'c' }))),
-              LexItem(Symbol(2), blank()),
+              LexItem(Symbol(2, Symbol::NonTerminal), repeat1(character({ 'c' }))),
+              LexItem(Symbol(2, Symbol::NonTerminal), blank()),
             }),
             PrecedenceRange(),
             false
@@ -319,7 +319,7 @@ describe("LexItemSet::transitions()", [&]() {
 
   it("handles repeats with precedence", [&]() {
     LexItemSet item_set({
-      LexItem(Symbol(1), active_prec(-1, repeat1(character({ 'a' }))))
+      LexItem(Symbol(1, Symbol::NonTerminal), active_prec(-1, repeat1(character({ 'a' }))))
     });
 
     AssertThat(
@@ -329,8 +329,8 @@ describe("LexItemSet::transitions()", [&]() {
           CharacterSet().include('a'),
           Transition{
             LexItemSet({
-              LexItem(Symbol(1), active_prec(-1, repeat1(character({ 'a' })))),
-              LexItem(Symbol(1), active_prec(-1, blank())),
+              LexItem(Symbol(1, Symbol::NonTerminal), active_prec(-1, repeat1(character({ 'a' })))),
+              LexItem(Symbol(1, Symbol::NonTerminal), active_prec(-1, blank())),
             }),
             PrecedenceRange(-1),
             false
@@ -341,7 +341,7 @@ describe("LexItemSet::transitions()", [&]() {
 
   it("handles choices between overlapping character sets", [&]() {
     LexItemSet item_set({
-      LexItem(Symbol(1), choice({
+      LexItem(Symbol(1, Symbol::NonTerminal), choice({
         active_prec(2, seq({
           character({ 'a', 'b', 'c', 'd'  }),
           character({ 'x' }),
@@ -360,7 +360,7 @@ describe("LexItemSet::transitions()", [&]() {
           CharacterSet().include('a', 'b'),
           Transition{
             LexItemSet({
-              LexItem(Symbol(1), active_prec(2, character({ 'x' }))),
+              LexItem(Symbol(1, Symbol::NonTerminal), active_prec(2, character({ 'x' }))),
             }),
             PrecedenceRange(2),
             false
@@ -370,8 +370,8 @@ describe("LexItemSet::transitions()", [&]() {
           CharacterSet().include('c', 'd'),
           Transition{
             LexItemSet({
-              LexItem(Symbol(1), active_prec(2, character({ 'x' }))),
-              LexItem(Symbol(1), active_prec(3, character({ 'y' }))),
+              LexItem(Symbol(1, Symbol::NonTerminal), active_prec(2, character({ 'x' }))),
+              LexItem(Symbol(1, Symbol::NonTerminal), active_prec(3, character({ 'y' }))),
             }),
             PrecedenceRange(2, 3),
             false
@@ -381,7 +381,7 @@ describe("LexItemSet::transitions()", [&]() {
           CharacterSet().include('e', 'f'),
           Transition{
             LexItemSet({
-              LexItem(Symbol(1), active_prec(3, character({ 'y' }))),
+              LexItem(Symbol(1, Symbol::NonTerminal), active_prec(3, character({ 'y' }))),
             }),
             PrecedenceRange(3),
             false
@@ -392,7 +392,7 @@ describe("LexItemSet::transitions()", [&]() {
 
   it("handles choices between a subset and a superset of characters", [&]() {
     LexItemSet item_set({
-      LexItem(Symbol(1), choice({
+      LexItem(Symbol(1, Symbol::NonTerminal), choice({
         seq({
           character({ 'b', 'c', 'd' }),
           character({ 'x' }),
@@ -411,7 +411,7 @@ describe("LexItemSet::transitions()", [&]() {
           CharacterSet().include('a').include('e', 'f'),
           Transition{
             LexItemSet({
-              LexItem(Symbol(1), character({ 'y' })),
+              LexItem(Symbol(1, Symbol::NonTerminal), character({ 'y' })),
             }),
             PrecedenceRange(),
             false
@@ -421,8 +421,8 @@ describe("LexItemSet::transitions()", [&]() {
           CharacterSet().include('b', 'd'),
           Transition{
             LexItemSet({
-              LexItem(Symbol(1), character({ 'x' })),
-              LexItem(Symbol(1), character({ 'y' })),
+              LexItem(Symbol(1, Symbol::NonTerminal), character({ 'x' })),
+              LexItem(Symbol(1, Symbol::NonTerminal), character({ 'y' })),
             }),
             PrecedenceRange(),
             false
@@ -433,7 +433,7 @@ describe("LexItemSet::transitions()", [&]() {
 
   it("handles choices between whitelisted and blacklisted character sets", [&]() {
     LexItemSet item_set({
-      LexItem(Symbol(1), seq({
+      LexItem(Symbol(1, Symbol::NonTerminal), seq({
         choice({
           character({ '/' }, false),
           seq({
@@ -452,7 +452,7 @@ describe("LexItemSet::transitions()", [&]() {
           CharacterSet().include_all().exclude('/').exclude('\\'),
           Transition{
             LexItemSet({
-              LexItem(Symbol(1), character({ '/' })),
+              LexItem(Symbol(1, Symbol::NonTerminal), character({ '/' })),
             }),
             PrecedenceRange(),
             false
@@ -462,8 +462,8 @@ describe("LexItemSet::transitions()", [&]() {
           CharacterSet().include('\\'),
           Transition{
             LexItemSet({
-              LexItem(Symbol(1), character({ '/' })),
-              LexItem(Symbol(1), seq({ character({ '/' }), character({ '/' }) })),
+              LexItem(Symbol(1, Symbol::NonTerminal), character({ '/' })),
+              LexItem(Symbol(1, Symbol::NonTerminal), seq({ character({ '/' }), character({ '/' }) })),
             }),
             PrecedenceRange(),
             false
@@ -474,8 +474,8 @@ describe("LexItemSet::transitions()", [&]() {
 
   it("handles different items with overlapping character sets", [&]() {
     LexItemSet set1({
-      LexItem(Symbol(1), character({ 'a', 'b', 'c', 'd', 'e', 'f' })),
-      LexItem(Symbol(2), character({ 'e', 'f', 'g', 'h', 'i' }))
+      LexItem(Symbol(1, Symbol::NonTerminal), character({ 'a', 'b', 'c', 'd', 'e', 'f' })),
+      LexItem(Symbol(2, Symbol::NonTerminal), character({ 'e', 'f', 'g', 'h', 'i' }))
     });
 
     AssertThat(set1.transitions(), Equals(LexItemSet::TransitionMap({
@@ -483,7 +483,7 @@ describe("LexItemSet::transitions()", [&]() {
         CharacterSet().include('a', 'd'),
         Transition{
           LexItemSet({
-            LexItem(Symbol(1), blank()),
+            LexItem(Symbol(1, Symbol::NonTerminal), blank()),
           }),
           PrecedenceRange(),
           false
@@ -493,8 +493,8 @@ describe("LexItemSet::transitions()", [&]() {
         CharacterSet().include('e', 'f'),
         Transition{
           LexItemSet({
-            LexItem(Symbol(1), blank()),
-            LexItem(Symbol(2), blank()),
+            LexItem(Symbol(1, Symbol::NonTerminal), blank()),
+            LexItem(Symbol(2, Symbol::NonTerminal), blank()),
           }),
           PrecedenceRange(),
           false
@@ -504,7 +504,7 @@ describe("LexItemSet::transitions()", [&]() {
         CharacterSet().include('g', 'i'),
         Transition{
           LexItemSet({
-            LexItem(Symbol(2), blank()),
+            LexItem(Symbol(2, Symbol::NonTerminal), blank()),
           }),
           PrecedenceRange(),
           false
diff --git a/spec/compiler/build_tables/parse_item_set_builder_spec.cc b/spec/compiler/build_tables/parse_item_set_builder_spec.cc
index a1dd2231..dad0976b 100644
--- a/spec/compiler/build_tables/parse_item_set_builder_spec.cc
+++ b/spec/compiler/build_tables/parse_item_set_builder_spec.cc
@@ -27,23 +27,23 @@ describe("ParseItemSetBuilder", []() {
     SyntaxGrammar grammar{{
       SyntaxVariable("rule0", VariableTypeNamed, {
         Production({
-          {Symbol(1), 0, AssociativityNone},
-          {Symbol(11, true), 0, AssociativityNone},
+          {Symbol(1, Symbol::NonTerminal), 0, AssociativityNone},
+          {Symbol(11, Symbol::Terminal), 0, AssociativityNone},
         }),
       }),
       SyntaxVariable("rule1", VariableTypeNamed, {
         Production({
-          {Symbol(12, true), 0, AssociativityNone},
-          {Symbol(13, true), 0, AssociativityNone},
+          {Symbol(12, Symbol::Terminal), 0, AssociativityNone},
+          {Symbol(13, Symbol::Terminal), 0, AssociativityNone},
         }),
         Production({
-          {Symbol(2), 0, AssociativityNone},
+          {Symbol(2, Symbol::NonTerminal), 0, AssociativityNone},
         })
       }),
       SyntaxVariable("rule2", VariableTypeNamed, {
         Production({
-          {Symbol(14, true), 0, AssociativityNone},
-          {Symbol(15, true), 0, AssociativityNone},
+          {Symbol(14, Symbol::Terminal), 0, AssociativityNone},
+          {Symbol(15, Symbol::Terminal), 0, AssociativityNone},
         })
       }),
     }, {}, {}};
@@ -54,8 +54,8 @@ describe("ParseItemSetBuilder", []() {
 
     ParseItemSet item_set({
       {
-        ParseItem(Symbol(0), production(0, 0), 0),
-        LookaheadSet({ 10 }),
+        ParseItem(Symbol(0, Symbol::NonTerminal), production(0, 0), 0),
+        LookaheadSet({ Symbol(10, Symbol::Terminal) }),
       }
     });
 
@@ -64,20 +64,20 @@ describe("ParseItemSetBuilder", []() {
 
     AssertThat(item_set, Equals(ParseItemSet({
       {
-        ParseItem(Symbol(0), production(0, 0), 0),
-        LookaheadSet({ 10 })
+        ParseItem(Symbol(0, Symbol::NonTerminal), production(0, 0), 0),
+        LookaheadSet({ Symbol(10, Symbol::Terminal) })
+        },
+      {
+        ParseItem(Symbol(1, Symbol::NonTerminal), production(1, 0), 0),
+        LookaheadSet({ Symbol(11, Symbol::Terminal) })
       },
       {
-        ParseItem(Symbol(1), production(1, 0), 0),
-        LookaheadSet({ 11 })
+        ParseItem(Symbol(1, Symbol::NonTerminal), production(1, 1), 0),
+        LookaheadSet({ Symbol(11, Symbol::Terminal) })
       },
       {
-        ParseItem(Symbol(1), production(1, 1), 0),
-        LookaheadSet({ 11 })
-      },
-      {
-        ParseItem(Symbol(2), production(2, 0), 0),
-        LookaheadSet({ 11 })
+        ParseItem(Symbol(2, Symbol::NonTerminal), production(2, 0), 0),
+        LookaheadSet({ Symbol(11, Symbol::Terminal) })
       },
     })));
   });
@@ -86,14 +86,14 @@ describe("ParseItemSetBuilder", []() {
     SyntaxGrammar grammar{{
       SyntaxVariable("rule0", VariableTypeNamed, {
         Production({
-          {Symbol(1), 0, AssociativityNone},
-          {Symbol(11, true), 0, AssociativityNone},
+          {Symbol(1, Symbol::NonTerminal), 0, AssociativityNone},
+          {Symbol(11, Symbol::Terminal), 0, AssociativityNone},
         }),
       }),
       SyntaxVariable("rule1", VariableTypeNamed, {
         Production({
-          {Symbol(12, true), 0, AssociativityNone},
-          {Symbol(13, true), 0, AssociativityNone},
+          {Symbol(12, Symbol::Terminal), 0, AssociativityNone},
+          {Symbol(13, Symbol::Terminal), 0, AssociativityNone},
         }),
         Production({})
       }),
@@ -105,8 +105,8 @@ describe("ParseItemSetBuilder", []() {
 
     ParseItemSet item_set({
       {
-        ParseItem(Symbol(0), production(0, 0), 0),
-        LookaheadSet({ 10 }),
+        ParseItem(Symbol(0, Symbol::NonTerminal), production(0, 0), 0),
+        LookaheadSet({ Symbol(10, Symbol::Terminal) }),
       }
     });
 
@@ -115,16 +115,16 @@ describe("ParseItemSetBuilder", []() {
 
     AssertThat(item_set, Equals(ParseItemSet({
       {
-        ParseItem(Symbol(0), production(0, 0), 0),
-        LookaheadSet({ 10 })
+        ParseItem(Symbol(0, Symbol::NonTerminal), production(0, 0), 0),
+        LookaheadSet({ Symbol(10, Symbol::Terminal) })
       },
       {
-        ParseItem(Symbol(1), production(1, 0), 0),
-        LookaheadSet({ 11 })
+        ParseItem(Symbol(1, Symbol::NonTerminal), production(1, 0), 0),
+        LookaheadSet({ Symbol(11, Symbol::Terminal) })
       },
       {
-        ParseItem(Symbol(1), production(1, 1), 0),
-        LookaheadSet({ 11 })
+        ParseItem(Symbol(1, Symbol::NonTerminal), production(1, 1), 0),
+        LookaheadSet({ Symbol(11, Symbol::Terminal) })
       },
     })));
   });
diff --git a/spec/compiler/prepare_grammar/extract_tokens_spec.cc b/spec/compiler/prepare_grammar/extract_tokens_spec.cc
index 9f871ec4..577dead1 100644
--- a/spec/compiler/prepare_grammar/extract_tokens_spec.cc
+++ b/spec/compiler/prepare_grammar/extract_tokens_spec.cc
@@ -133,13 +133,13 @@ describe("extract_tokens", []() {
       Variable("rule_A", VariableTypeNamed, str("ok")),
       Variable("rule_B", VariableTypeNamed, repeat(i_sym(0))),
       Variable("rule_C", VariableTypeNamed, repeat(seq({ i_sym(0), i_sym(0) }))),
-    }, { str(" ") }, { { Symbol(1), Symbol(2) } }});
+    }, { str(" ") }, { { Symbol(1, Symbol::NonTerminal), Symbol(2, Symbol::NonTerminal) } }});
 
     InitialSyntaxGrammar &syntax_grammar = get<0>(result);
 
     AssertThat(syntax_grammar.variables.size(), Equals<size_t>(2));
     AssertThat(syntax_grammar.expected_conflicts, Equals(set<set<Symbol>>({
-      { Symbol(0), Symbol(1) },
+      { Symbol(0, Symbol::NonTerminal), Symbol(1, Symbol::NonTerminal) },
     })));
   });
 
@@ -171,7 +171,7 @@ describe("extract_tokens", []() {
 
       AssertThat(get<2>(result), Equals(CompileError::none()));
       AssertThat(get<1>(result).separators.size(), Equals<size_t>(0));
-      AssertThat(get<0>(result).extra_tokens, Equals(set<Symbol>({ Symbol(1, true) })));
+      AssertThat(get<0>(result).extra_tokens, Equals(set<Symbol>({ Symbol(1, Symbol::Terminal) })));
     });
 
     it("updates extra symbols according to the new symbol numbers", [&]() {
@@ -186,7 +186,7 @@ describe("extract_tokens", []() {
       AssertThat(get<2>(result), Equals(CompileError::none()));
 
       AssertThat(get<0>(result).extra_tokens, Equals(set<Symbol>({
-        { Symbol(3, true) },
+        { Symbol(3, Symbol::Terminal) },
       })));
 
       AssertThat(get<1>(result).separators, IsEmpty());
diff --git a/spec/compiler/prepare_grammar/flatten_grammar_spec.cc b/spec/compiler/prepare_grammar/flatten_grammar_spec.cc
index 3efd4e03..823da8e6 100644
--- a/spec/compiler/prepare_grammar/flatten_grammar_spec.cc
+++ b/spec/compiler/prepare_grammar/flatten_grammar_spec.cc
@@ -36,19 +36,19 @@ describe("flatten_grammar", []() {
     AssertThat(result.type, Equals(VariableTypeNamed));
     AssertThat(result.productions, Equals(vector<Production>({
       Production({
-        {Symbol(1), 0, AssociativityNone},
-        {Symbol(2), 101, AssociativityLeft},
-        {Symbol(3), 102, AssociativityRight},
-        {Symbol(4), 101, AssociativityLeft},
-        {Symbol(6), 0, AssociativityNone},
-        {Symbol(7), 0, AssociativityNone},
+        {Symbol(1, Symbol::NonTerminal), 0, AssociativityNone},
+        {Symbol(2, Symbol::NonTerminal), 101, AssociativityLeft},
+        {Symbol(3, Symbol::NonTerminal), 102, AssociativityRight},
+        {Symbol(4, Symbol::NonTerminal), 101, AssociativityLeft},
+        {Symbol(6, Symbol::NonTerminal), 0, AssociativityNone},
+        {Symbol(7, Symbol::NonTerminal), 0, AssociativityNone},
       }),
       Production({
-        {Symbol(1), 0, AssociativityNone},
-        {Symbol(2), 101, AssociativityLeft},
-        {Symbol(5), 101, AssociativityLeft},
-        {Symbol(6), 0, AssociativityNone},
-        {Symbol(7), 0, AssociativityNone},
+        {Symbol(1, Symbol::NonTerminal), 0, AssociativityNone},
+        {Symbol(2, Symbol::NonTerminal), 101, AssociativityLeft},
+        {Symbol(5, Symbol::NonTerminal), 101, AssociativityLeft},
+        {Symbol(6, Symbol::NonTerminal), 0, AssociativityNone},
+        {Symbol(7, Symbol::NonTerminal), 0, AssociativityNone},
       })
     })))
   });
@@ -65,8 +65,8 @@ describe("flatten_grammar", []() {
 
     AssertThat(result.productions, Equals(vector<Production>({
       Production({
-        {Symbol(1), 101, AssociativityLeft},
-        {Symbol(2), 101, AssociativityLeft},
+        {Symbol(1, Symbol::NonTerminal), 101, AssociativityLeft},
+        {Symbol(2, Symbol::NonTerminal), 101, AssociativityLeft},
       })
     })))
 
@@ -80,7 +80,7 @@ describe("flatten_grammar", []() {
 
     AssertThat(result.productions, Equals(vector<Production>({
       Production({
-        {Symbol(1), 101, AssociativityLeft},
+        {Symbol(1, Symbol::NonTerminal), 101, AssociativityLeft},
       })
     })))
   });
diff --git a/spec/compiler/rules/repeat_spec.cc b/spec/compiler/rules/repeat_spec.cc
index 63680563..9c84c8e5 100644
--- a/spec/compiler/rules/repeat_spec.cc
+++ b/spec/compiler/rules/repeat_spec.cc
@@ -9,7 +9,7 @@ START_TEST
 describe("Repeat", []() {
   describe("constructing repeats", [&]() {
     it("doesn't create redundant repeats", [&]() {
-      auto sym = make_shared<Symbol>(1);
+      auto sym = make_shared<Symbol>(1, Symbol::NonTerminal);
       auto repeat = Repeat::build(sym);
       auto outer_repeat = Repeat::build(repeat);
 
diff --git a/spec/fixtures/external_scanners/external_scan.c b/spec/fixtures/external_scanners/external_scan.c
new file mode 100644
index 00000000..7abab3ae
--- /dev/null
+++ b/spec/fixtures/external_scanners/external_scan.c
@@ -0,0 +1,13 @@
+#include <stdbool.h>
+
+void *ts_language_external_scanner_example_external_scanner_create() {
+  puts("HELLO FROM EXTERNAL SCANNER");
+  return 0;
+}
+
+bool ts_language_external_scanner_example_external_scanner_scan() {
+  return true;
+}
+
+void ts_language_external_scanner_example_external_scanner_destroy() {
+}
diff --git a/spec/helpers/load_language.cc b/spec/helpers/load_language.cc
index a29aa240..2e85b762 100644
--- a/spec/helpers/load_language.cc
+++ b/spec/helpers/load_language.cc
@@ -67,7 +67,8 @@ static int get_modified_time(const string &path) {
 
 const TSLanguage *load_language(const string &source_filename,
                                 const string &lib_filename,
-                                const string &language_name) {
+                                const string &language_name,
+                                string external_scanner_path = "") {
   string language_function_name = "ts_language_" + language_name;
   string header_dir = getenv("PWD") + string("/include");
   int source_mtime = get_modified_time(source_filename);
@@ -119,7 +120,9 @@ const TSLanguage *load_language(const string &source_filename,
   return language_fn();
 }
 
-const TSLanguage *load_compile_result(const string &name, const TSCompileResult &compile_result) {
+const TSLanguage *load_compile_result(const string &name,
+                                      const TSCompileResult &compile_result,
+                                      string external_scanner_path) {
   if (compile_result.error_type != TSCompileErrorTypeNone) {
     Assert::Failure(string("Compilation failed ") + compile_result.error_message);
     return nullptr;
@@ -135,7 +138,7 @@ const TSLanguage *load_compile_result(const string &name, const TSCompileResult
   source_file << compile_result.code;
   source_file.close();
 
-  const TSLanguage *language = load_language(source_filename, lib_filename, name);
+  auto language = load_language(source_filename, lib_filename, name, external_scanner_path);
   free(compile_result.code);
   return language;
 }
diff --git a/spec/helpers/load_language.h b/spec/helpers/load_language.h
index 41b1458e..41d8b739 100644
--- a/spec/helpers/load_language.h
+++ b/spec/helpers/load_language.h
@@ -5,7 +5,8 @@
 #include "tree_sitter/runtime.h"
 #include <string>
 
-const TSLanguage *load_compile_result(const std::string &, const TSCompileResult &);
+const TSLanguage *load_compile_result(const std::string &, const TSCompileResult &,
+                                      std::string external_scanner_path = "");
 const TSLanguage *get_test_language(const std::string &language_name);
 
 #endif  // HELPERS_LOAD_LANGUAGE_H_
diff --git a/spec/helpers/rule_helpers.cc b/spec/helpers/rule_helpers.cc
index 8bf32360..0b010d2e 100644
--- a/spec/helpers/rule_helpers.cc
+++ b/spec/helpers/rule_helpers.cc
@@ -9,6 +9,7 @@ namespace tree_sitter {
   using std::ostream;
   using std::string;
   using std::to_string;
+  using rules::Symbol;
 
   rule_ptr character(const set<uint32_t> &ranges) {
     return character(ranges, true);
@@ -28,11 +29,11 @@ namespace tree_sitter {
   }
 
   rule_ptr i_sym(size_t index) {
-    return make_shared<rules::Symbol>(index);
+    return make_shared<Symbol>(index, Symbol::NonTerminal);
   }
 
   rule_ptr i_token(size_t index) {
-    return make_shared<rules::Symbol>(index, true);
+    return make_shared<Symbol>(index, Symbol::Terminal);
   }
 
   rule_ptr metadata(rule_ptr rule, rules::MetadataParams params) {
diff --git a/spec/helpers/stream_methods.cc b/spec/helpers/stream_methods.cc
index 4d411d66..b47363a0 100644
--- a/spec/helpers/stream_methods.cc
+++ b/spec/helpers/stream_methods.cc
@@ -10,16 +10,7 @@ namespace tree_sitter {
 
 ostream &operator<<(ostream &stream, const Grammar &grammar) {
   stream << string("#<grammar");
-  stream << string(" rules: {");
-  bool started = false;
-  for (auto pair : grammar.rules) {
-    if (started)
-      stream << string(", ");
-    stream << pair.first;
-    stream << string(" => ");
-    stream << pair.second;
-    started = true;
-  }
+  stream << " rules: " << grammar.rules;
   return stream << string("}>");
 }
 
diff --git a/spec/integration/compile_grammar_spec.cc b/spec/integration/compile_grammar_spec.cc
index d41d76e4..21307c89 100644
--- a/spec/integration/compile_grammar_spec.cc
+++ b/spec/integration/compile_grammar_spec.cc
@@ -507,6 +507,71 @@ describe("compile_grammar", []() {
     });
   });
 
+  describe("external scanners", [&]() {
+    it("can call out to arbitrary scanner functions during parsing", [&]() {
+      string grammar = R"JSON({
+        "name": "external_scanner_example",
+
+        "externals": [
+          "percent_string",
+          "percent_string_start",
+          "percent_string_end"
+        ],
+
+        "rules": {
+          "string": {
+            "type": "CHOICE",
+            "members": [
+              {
+                "type": "EXTERNAL_TOKEN",
+                "name": "percent_string"
+              },
+              {
+                "type": "SEQ",
+                "members": [
+                  {
+                    "type": "EXTERNAL_TOKEN",
+                    "name": "percent_string_start"
+                  },
+                  {
+                    "type": "SYMBOL",
+                    "name": "identifier"
+                  },
+                  {
+                    "type": "EXTERNAL_TOKEN",
+                    "name": "percent_string_end"
+                  }
+                ]
+              },
+            ]
+          },
+
+          "identifier": {
+            "type": "PATTERN",
+            "value": "\\a+"
+          }
+        }
+      })JSON";
+
+      TSCompileResult result = ts_compile_grammar(grammar.c_str());
+      AssertThat(result.error_message, IsNull());
+
+      ts_document_set_language(document, load_compile_result(
+        "external_scanner_example",
+        result,
+        "spec/fixtures/external_scanners/external_scan.c"
+      ));
+
+      ts_document_set_input_string(document, "%|hi|");
+      ts_document_parse(document);
+      assert_root_node("(string)");
+
+      ts_document_set_input_string(document, "%(1 #{two} three)");
+      ts_document_parse(document);
+      assert_root_node("(string (identifier))");
+    });
+  });
+
   describe("when the grammar's start symbol is a token", [&]() {
     it("parses the token", [&]() {
       TSCompileResult result = ts_compile_grammar(R"JSON(
diff --git a/spec/integration/corpus_specs.cc b/spec/integration/corpus_specs.cc
index 9d716ed1..86a1dc47 100644
--- a/spec/integration/corpus_specs.cc
+++ b/spec/integration/corpus_specs.cc
@@ -80,10 +80,10 @@ START_TEST
 
 describe("The Corpus", []() {
   vector<string> test_languages({
-    "javascript",
+    // "javascript",
     "json",
-    "c",
-    "cpp",
+    // "c",
+    // "cpp",
   });
 
   for (auto &language_name : test_languages) {
diff --git a/src/compiler/build_tables/build_lex_table.cc b/src/compiler/build_tables/build_lex_table.cc
index 151da7cf..29d8f4d0 100644
--- a/src/compiler/build_tables/build_lex_table.cc
+++ b/src/compiler/build_tables/build_lex_table.cc
@@ -64,7 +64,7 @@ class LexTableBuilder {
  private:
   void add_lex_state_for_parse_state(ParseState *parse_state) {
     parse_state->lex_state_id =
-      add_lex_state(item_set_for_tokens(parse_state->expected_inputs()));
+      add_lex_state(item_set_for_terminals(parse_state->terminal_entries));
   }
 
   LexStateId add_lex_state(const LexItemSet &item_set) {
@@ -112,24 +112,27 @@ class LexTableBuilder {
   void mark_fragile_tokens() {
     for (ParseState &state : parse_table->states) {
       for (auto &entry : state.terminal_entries) {
-        auto homonyms = conflict_manager.possible_homonyms.find(entry.first);
-        if (homonyms != conflict_manager.possible_homonyms.end())
-          for (Symbol::Index homonym : homonyms->second)
-            if (state.terminal_entries.count(homonym)) {
-              entry.second.reusable = false;
-              break;
-            }
+        Symbol symbol = entry.first;
+        if (symbol.is_token()) {
+          auto homonyms = conflict_manager.possible_homonyms.find(symbol.index);
+          if (homonyms != conflict_manager.possible_homonyms.end())
+            for (Symbol::Index homonym : homonyms->second)
+              if (state.terminal_entries.count(Symbol(homonym, Symbol::Terminal))) {
+                entry.second.reusable = false;
+                break;
+              }
 
-        if (!entry.second.reusable)
-          continue;
+          if (!entry.second.reusable)
+            continue;
 
-        auto extensions = conflict_manager.possible_extensions.find(entry.first);
-        if (extensions != conflict_manager.possible_extensions.end())
-          for (Symbol::Index extension : extensions->second)
-            if (state.terminal_entries.count(extension)) {
-              entry.second.depends_on_lookahead = true;
-              break;
-            }
+          auto extensions = conflict_manager.possible_extensions.find(symbol.index);
+          if (extensions != conflict_manager.possible_extensions.end())
+            for (Symbol::Index extension : extensions->second)
+              if (state.terminal_entries.count(Symbol(extension, Symbol::Terminal))) {
+                entry.second.depends_on_lookahead = true;
+                break;
+              }
+        }
       }
     }
   }
@@ -150,24 +153,27 @@ class LexTableBuilder {
     }
   }
 
-  LexItemSet item_set_for_tokens(const set<Symbol> &symbols) {
+  LexItemSet item_set_for_terminals(const map<Symbol, ParseTableEntry> &terminals) {
     LexItemSet result;
-    for (const Symbol &symbol : symbols)
-      for (const rule_ptr &rule : rules_for_symbol(symbol))
-        for (const rule_ptr &separator_rule : separator_rules)
-          result.entries.insert(LexItem(
-            symbol,
-            Metadata::separator(
-              Seq::build({
-                separator_rule,
-                Metadata::main_token(rule) }))));
+    for (const auto &pair : terminals) {
+      Symbol symbol = pair.first;
+      if (symbol.is_token()) {
+        for (const rule_ptr &rule : rules_for_symbol(symbol)) {
+          for (const rule_ptr &separator_rule : separator_rules) {
+            result.entries.insert(LexItem(
+              symbol,
+              Metadata::separator(
+                Seq::build({
+                  separator_rule,
+                  Metadata::main_token(rule) }))));
+          }
+        }
+      }
+    }
     return result;
   }
 
   vector<rule_ptr> rules_for_symbol(const rules::Symbol &symbol) {
-    if (!symbol.is_token)
-      return {};
-
     if (symbol == rules::END_OF_INPUT())
       return { CharacterSet().include(0).copy() };
 
diff --git a/src/compiler/build_tables/build_parse_table.cc b/src/compiler/build_tables/build_parse_table.cc
index 91444310..819ce345 100644
--- a/src/compiler/build_tables/build_parse_table.cc
+++ b/src/compiler/build_tables/build_parse_table.cc
@@ -52,7 +52,10 @@ class ParseTableBuilder {
         allow_any_conflict(false) {}
 
   pair<ParseTable, CompileError> build() {
-    Symbol start_symbol = Symbol(0, grammar.variables.empty());
+    Symbol start_symbol = grammar.variables.empty() ?
+      Symbol(0, Symbol::Terminal) :
+      Symbol(0, Symbol::NonTerminal);
+
     Production start_production({
       ProductionStep(start_symbol, 0, rules::AssociativityNone),
     });
@@ -63,7 +66,7 @@ class ParseTableBuilder {
     add_parse_state(ParseItemSet({
       {
         ParseItem(rules::START(), start_production, 0),
-        LookaheadSet({ END_OF_INPUT().index }),
+        LookaheadSet({ END_OF_INPUT() }),
       },
     }));
 
@@ -107,21 +110,21 @@ class ParseTableBuilder {
   void build_error_parse_state() {
     ParseState error_state;
 
-    for (const Symbol::Index index : parse_table.mergeable_symbols) {
-      add_out_of_context_parse_state(&error_state, Symbol(index, true));
+    for (const Symbol symbol : parse_table.mergeable_symbols) {
+      add_out_of_context_parse_state(&error_state, symbol);
     }
 
     for (const Symbol &symbol : grammar.extra_tokens) {
-      if (!error_state.terminal_entries.count(symbol.index)) {
-        error_state.terminal_entries[symbol.index].actions.push_back(ParseAction::ShiftExtra());
+      if (!error_state.terminal_entries.count(symbol)) {
+        error_state.terminal_entries[symbol].actions.push_back(ParseAction::ShiftExtra());
       }
     }
 
     for (size_t i = 0; i < grammar.variables.size(); i++) {
-      add_out_of_context_parse_state(&error_state, Symbol(i, false));
+      add_out_of_context_parse_state(&error_state, Symbol(i, Symbol::NonTerminal));
     }
 
-    error_state.terminal_entries[END_OF_INPUT().index].actions.push_back(ParseAction::Recover(0));
+    error_state.terminal_entries[END_OF_INPUT()].actions.push_back(ParseAction::Recover(0));
     parse_table.states[0] = error_state;
   }
 
@@ -130,10 +133,10 @@ class ParseTableBuilder {
     const ParseItemSet &item_set = recovery_states[symbol];
     if (!item_set.entries.empty()) {
       ParseStateId state = add_parse_state(item_set);
-      if (symbol.is_token) {
-        error_state->terminal_entries[symbol.index].actions.assign({ ParseAction::Recover(state) });
-      } else {
+      if (symbol.is_non_terminal()) {
         error_state->nonterminal_entries[symbol.index] = state;
+      } else {
+        error_state->terminal_entries[symbol].actions.assign({ ParseAction::Recover(state) });
       }
     }
   }
@@ -152,9 +155,9 @@ class ParseTableBuilder {
   }
 
   string add_actions(const ParseItemSet &item_set, ParseStateId state_id) {
-    map<Symbol::Index, ParseItemSet> terminal_successors;
+    map<Symbol, ParseItemSet> terminal_successors;
     map<Symbol::Index, ParseItemSet> nonterminal_successors;
-    set<Symbol::Index> lookaheads_with_conflicts;
+    set<Symbol> lookaheads_with_conflicts;
 
     for (const auto &pair : item_set.entries) {
       const ParseItem &item = pair.first;
@@ -168,7 +171,7 @@ class ParseTableBuilder {
           ParseAction::Reduce(item.lhs(), item.step_index, *item.production);
 
         int precedence = item.precedence();
-        for (const Symbol::Index lookahead : *lookahead_symbols.entries) {
+        for (Symbol lookahead : *lookahead_symbols.entries) {
           ParseTableEntry &entry = parse_table.states[state_id].terminal_entries[lookahead];
 
           // Only add the highest-precedence Reduce actions to the parse table.
@@ -203,10 +206,10 @@ class ParseTableBuilder {
         Symbol symbol = item.production->at(item.step_index).symbol;
         ParseItem new_item(item.lhs(), *item.production, item.step_index + 1);
 
-        if (symbol.is_token) {
-          terminal_successors[symbol.index].entries[new_item] = lookahead_symbols;
-        } else {
+        if (symbol.is_non_terminal()) {
           nonterminal_successors[symbol.index].entries[new_item] = lookahead_symbols;
+        } else {
+          terminal_successors[symbol].entries[new_item] = lookahead_symbols;
         }
       }
     }
@@ -214,7 +217,7 @@ class ParseTableBuilder {
     // Add a Shift action for each possible successor state. Shift actions for
     // terminal lookaheads can conflict with Reduce actions added previously.
     for (auto &pair : terminal_successors) {
-      Symbol::Index lookahead = pair.first;
+      Symbol lookahead = pair.first;
       ParseItemSet &next_item_set = pair.second;
       ParseStateId next_state_id = add_parse_state(next_item_set);
       ParseState &state = parse_table.states[state_id];
@@ -223,7 +226,7 @@ class ParseTableBuilder {
       if (!allow_any_conflict) {
         if (had_existing_action)
           lookaheads_with_conflicts.insert(lookahead);
-        recovery_states[Symbol(lookahead, true)].add(next_item_set);
+        recovery_states[lookahead].add(next_item_set);
       }
     }
 
@@ -234,10 +237,10 @@ class ParseTableBuilder {
       ParseStateId next_state = add_parse_state(next_item_set);
       parse_table.set_nonterminal_action(state_id, lookahead, next_state);
       if (!allow_any_conflict)
-        recovery_states[Symbol(lookahead, false)].add(next_item_set);
+        recovery_states[Symbol(lookahead, Symbol::NonTerminal)].add(next_item_set);
     }
 
-    for (Symbol::Index lookahead : lookaheads_with_conflicts) {
+    for (Symbol lookahead : lookaheads_with_conflicts) {
       string conflict = handle_conflict(item_set, state_id, lookahead);
       if (!conflict.empty()) return conflict;
     }
@@ -245,9 +248,9 @@ class ParseTableBuilder {
     ParseAction shift_extra = ParseAction::ShiftExtra();
     ParseState &state = parse_table.states[state_id];
     for (const Symbol &extra_symbol : grammar.extra_tokens) {
-      if (!state.terminal_entries.count(extra_symbol.index) ||
+      if (!state.terminal_entries.count(extra_symbol) ||
           state.has_shift_action() || allow_any_conflict) {
-        parse_table.add_terminal_action(state_id, extra_symbol.index, shift_extra);
+        parse_table.add_terminal_action(state_id, extra_symbol, shift_extra);
       }
     }
 
@@ -257,7 +260,6 @@ class ParseTableBuilder {
   void mark_fragile_actions() {
     for (ParseState &state : parse_table.states) {
       for (auto &entry : state.terminal_entries) {
-        const Symbol symbol(entry.first, true);
         auto &actions = entry.second.actions;
 
         for (ParseAction &action : actions) {
@@ -359,7 +361,7 @@ class ParseTableBuilder {
   }
 
   string handle_conflict(const ParseItemSet &item_set, ParseStateId state_id,
-                         Symbol::Index lookahead) {
+                         Symbol lookahead) {
     ParseTableEntry &entry = parse_table.states[state_id].terminal_entries[lookahead];
     int reduction_precedence = entry.actions.front().precedence();
     set<ParseItem> shift_items;
@@ -468,7 +470,7 @@ class ParseTableBuilder {
       description += "  " + symbol_name(earliest_starting_item.production->at(i).symbol);
     }
 
-    description += "  \u2022  " + symbol_name(Symbol(lookahead, true)) + "  \u2026";
+    description += "  \u2022  " + symbol_name(lookahead) + "  \u2026";
     description += "\n\n";
 
     description += "Possible interpretations:\n\n";
@@ -487,7 +489,7 @@ class ParseTableBuilder {
           description += "  " + symbol_name(step.symbol);
         }
         description += ")";
-        description += "  \u2022  " + symbol_name(Symbol(lookahead, true)) + "  \u2026";
+        description += "  \u2022  " + symbol_name(lookahead) + "  \u2026";
         description += "\n";
       }
     }
@@ -564,14 +566,22 @@ class ParseTableBuilder {
         return "END_OF_INPUT";
       else
         return "";
-    } else if (symbol.is_token) {
-      const Variable &variable = lexical_grammar.variables[symbol.index];
-      if (variable.type == VariableTypeNamed)
-        return variable.name;
-      else
-        return "'" + variable.name + "'";
-    } else {
-      return grammar.variables[symbol.index].name;
+    }
+
+    switch (symbol.type) {
+      case Symbol::Terminal: {
+        const Variable &variable = lexical_grammar.variables[symbol.index];
+        if (variable.type == VariableTypeNamed)
+          return variable.name;
+        else
+          return "'" + variable.name + "'";
+      }
+      case Symbol::NonTerminal: {
+        return grammar.variables[symbol.index].name;
+      }
+      case Symbol::External: {
+        return grammar.external_tokens[symbol.index];
+      }
     }
   }
 
diff --git a/src/compiler/build_tables/lookahead_set.cc b/src/compiler/build_tables/lookahead_set.cc
index 1ecb0baf..239bc029 100644
--- a/src/compiler/build_tables/lookahead_set.cc
+++ b/src/compiler/build_tables/lookahead_set.cc
@@ -12,8 +12,8 @@ using rules::Symbol;
 
 LookaheadSet::LookaheadSet() : entries(nullptr) {}
 
-LookaheadSet::LookaheadSet(const set<Symbol::Index> &symbols)
-    : entries(make_shared<set<Symbol::Index>>(symbols)) {}
+LookaheadSet::LookaheadSet(const set<Symbol> &symbols)
+    : entries(make_shared<set<Symbol>>(symbols)) {}
 
 bool LookaheadSet::empty() const {
   return !entries.get() || entries->empty();
@@ -23,7 +23,7 @@ bool LookaheadSet::operator==(const LookaheadSet &other) const {
   return *entries == *other.entries;
 }
 
-bool LookaheadSet::contains(const Symbol::Index &symbol) const {
+bool LookaheadSet::contains(const Symbol &symbol) const {
   return entries->find(symbol) != entries->end();
 }
 
@@ -31,15 +31,15 @@ bool LookaheadSet::insert_all(const LookaheadSet &other) {
   if (!other.entries.get())
     return false;
   if (!entries.get())
-    entries = make_shared<set<Symbol::Index>>();
+    entries = make_shared<set<Symbol>>();
   size_t previous_size = entries->size();
   entries->insert(other.entries->begin(), other.entries->end());
   return entries->size() > previous_size;
 }
 
-bool LookaheadSet::insert(const Symbol::Index &symbol) {
+bool LookaheadSet::insert(const Symbol &symbol) {
   if (!entries.get())
-    entries = make_shared<set<Symbol::Index>>();
+    entries = make_shared<set<Symbol>>();
   return entries->insert(symbol).second;
 }
 
diff --git a/src/compiler/build_tables/lookahead_set.h b/src/compiler/build_tables/lookahead_set.h
index fe99b4d5..e62ee34d 100644
--- a/src/compiler/build_tables/lookahead_set.h
+++ b/src/compiler/build_tables/lookahead_set.h
@@ -11,15 +11,15 @@ namespace build_tables {
 class LookaheadSet {
  public:
   LookaheadSet();
-  explicit LookaheadSet(const std::set<rules::Symbol::Index> &);
+  explicit LookaheadSet(const std::set<rules::Symbol> &);
 
   bool empty() const;
   bool operator==(const LookaheadSet &) const;
-  bool contains(const rules::Symbol::Index &) const;
+  bool contains(const rules::Symbol &) const;
   bool insert_all(const LookaheadSet &);
-  bool insert(const rules::Symbol::Index &);
+  bool insert(const rules::Symbol &);
 
-  std::shared_ptr<std::set<rules::Symbol::Index>> entries;
+  std::shared_ptr<std::set<rules::Symbol>> entries;
 };
 
 }  // namespace build_tables
diff --git a/src/compiler/build_tables/parse_item.cc b/src/compiler/build_tables/parse_item.cc
index 39b131cb..b9c3831b 100644
--- a/src/compiler/build_tables/parse_item.cc
+++ b/src/compiler/build_tables/parse_item.cc
@@ -41,7 +41,7 @@ bool ParseItem::operator<(const ParseItem &other) const {
 }
 
 Symbol ParseItem::lhs() const {
-  return Symbol(variable_index);
+  return Symbol(variable_index, Symbol::NonTerminal);
 }
 
 bool ParseItem::is_done() const {
@@ -105,38 +105,6 @@ size_t ParseItemSet::unfinished_item_signature() const {
   return result;
 }
 
-ParseItemSet::ActionMap ParseItemSet::actions() const {
-  ParseItemSet::ActionMap result;
-
-  for (const auto &pair : entries) {
-    const ParseItem &item = pair.first;
-    const LookaheadSet &lookahead_symbols = pair.second;
-
-    if (item.step_index == item.production->size()) {
-      int precedence = item.precedence();
-      for (const Symbol::Index lookahead : *lookahead_symbols.entries) {
-        Action &action = result.terminal_actions[lookahead];
-        if (precedence > action.completion_precedence) {
-          action.completions.assign({ &item });
-        } else if (precedence == action.completion_precedence) {
-          action.completions.push_back({ &item });
-        }
-      }
-    } else {
-      Symbol symbol = item.production->at(item.step_index).symbol;
-      ParseItem new_item(item.lhs(), *item.production, item.step_index + 1);
-
-      if (symbol.is_token) {
-        result.terminal_actions[symbol.index].continuation.entries[new_item] = lookahead_symbols;
-      } else {
-        result.nonterminal_continuations[symbol.index].entries[new_item] = lookahead_symbols;
-      }
-    }
-  }
-
-  return result;
-}
-
 void ParseItemSet::add(const ParseItemSet &other) {
   for (const auto &pair : other.entries)
     entries[pair.first].insert_all(pair.second);
diff --git a/src/compiler/build_tables/parse_item.h b/src/compiler/build_tables/parse_item.h
index a091ac9d..a3785638 100644
--- a/src/compiler/build_tables/parse_item.h
+++ b/src/compiler/build_tables/parse_item.h
@@ -41,16 +41,6 @@ class ParseItemSet {
   ParseItemSet();
   explicit ParseItemSet(const std::map<ParseItem, LookaheadSet> &);
 
-  struct Completion;
-  struct Action;
-
-  struct ActionMap {
-    std::map<rules::Symbol::Index, Action> terminal_actions;
-    std::map<rules::Symbol::Index, ParseItemSet> nonterminal_continuations;
-  };
-
-  ActionMap actions() const;
-
   bool operator==(const ParseItemSet &) const;
   void add(const ParseItemSet &);
   size_t unfinished_item_signature() const;
@@ -58,22 +48,6 @@ class ParseItemSet {
   std::map<ParseItem, LookaheadSet> entries;
 };
 
-struct ParseItemSet::Completion {
-  const ParseItem *item;
-  int precedence;
-  rules::Associativity associativity;
-
-  bool operator<(const ParseItemSet::Completion &other) {
-    return precedence < other.precedence;
-  }
-};
-
-struct ParseItemSet::Action {
-  ParseItemSet continuation;
-  std::vector<const ParseItem *> completions;
-  int completion_precedence;
-};
-
 }  // namespace build_tables
 }  // namespace tree_sitter
 
diff --git a/src/compiler/build_tables/parse_item_set_builder.cc b/src/compiler/build_tables/parse_item_set_builder.cc
index 34b347fe..7e29efdf 100644
--- a/src/compiler/build_tables/parse_item_set_builder.cc
+++ b/src/compiler/build_tables/parse_item_set_builder.cc
@@ -27,12 +27,12 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar,
   set<Symbol::Index> processed_non_terminals;
 
   for (size_t i = 0, n = lexical_grammar.variables.size(); i < n; i++) {
-    Symbol symbol(i, true);
-    first_sets.insert({symbol, LookaheadSet({ static_cast<Symbol::Index>(i) })});
+    Symbol symbol(i, Symbol::Terminal);
+    first_sets.insert({symbol, LookaheadSet({ symbol })});
   }
 
   for (size_t i = 0, n = grammar.variables.size(); i < n; i++) {
-    Symbol symbol(i);
+    Symbol symbol(i, Symbol::NonTerminal);
     LookaheadSet first_set;
 
     processed_non_terminals.clear();
@@ -42,10 +42,10 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar,
       Symbol current_symbol = symbols_to_process.back();
       symbols_to_process.pop_back();
 
-      if (current_symbol.is_token) {
-        first_set.insert(current_symbol.index);
+      if (!current_symbol.is_non_terminal()) {
+        first_set.insert(current_symbol);
       } else if (processed_non_terminals.insert(current_symbol.index).second) {
-        for (const Production &production : grammar.productions(current_symbol)) {
+        for (const Production &production : grammar.variables[current_symbol.index].productions) {
           if (!production.empty()) {
             symbols_to_process.push_back(production[0].symbol);
           }
@@ -59,11 +59,11 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar,
   vector<ParseItemSetComponent> components_to_process;
 
   for (size_t i = 0, n = grammar.variables.size(); i < n; i++) {
-    Symbol symbol(i);
+    Symbol symbol(i, Symbol::NonTerminal);
     map<ParseItem, pair<LookaheadSet, bool>> cache_entry;
 
     components_to_process.clear();
-    for (const Production &production : grammar.productions(symbol)) {
+    for (const Production &production : grammar.variables[i].productions) {
       components_to_process.push_back(ParseItemSetComponent{
         ParseItem(symbol, production, 0),
         LookaheadSet(),
@@ -87,7 +87,7 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar,
 
       if (component_is_new) {
         Symbol next_symbol = item.next_symbol();
-        if (next_symbol.is_built_in() || next_symbol.is_token)
+        if (!next_symbol.is_non_terminal() || next_symbol.is_built_in())
           continue;
 
         LookaheadSet next_lookaheads;
@@ -102,7 +102,7 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar,
           propagates_lookaheads = false;
         }
 
-        for (const Production &production : grammar.productions(next_symbol)) {
+        for (const Production &production : grammar.variables[next_symbol.index].productions) {
           components_to_process.push_back(ParseItemSetComponent{
             ParseItem(next_symbol, production, 0),
             next_lookaheads,
@@ -130,7 +130,7 @@ void ParseItemSetBuilder::apply_transitive_closure(ParseItemSet *item_set) {
     const LookaheadSet &lookaheads = pair.second;
 
     const Symbol &next_symbol = item.next_symbol();
-    if (!next_symbol.is_token && !next_symbol.is_built_in()) {
+    if (next_symbol.is_non_terminal() && !next_symbol.is_built_in()) {
       LookaheadSet next_lookaheads;
       size_t next_step = item.step_index + 1;
       if (next_step == item.production->size()) {
diff --git a/src/compiler/build_tables/recovery_tokens.cc b/src/compiler/build_tables/recovery_tokens.cc
index 479de6b8..84b175bc 100644
--- a/src/compiler/build_tables/recovery_tokens.cc
+++ b/src/compiler/build_tables/recovery_tokens.cc
@@ -47,8 +47,8 @@ class FirstCharacters : public CharacterAggregator<true, false> {};
 class LastCharacters : public CharacterAggregator<false, true> {};
 class AllCharacters : public CharacterAggregator<true, true> {};
 
-set<Symbol::Index> recovery_tokens(const LexicalGrammar &grammar) {
-  set<Symbol::Index> result;
+set<Symbol> recovery_tokens(const LexicalGrammar &grammar) {
+  set<Symbol> result;
 
   AllCharacters all_separator_characters;
   for (const rule_ptr &separator : grammar.separators)
@@ -79,7 +79,7 @@ set<Symbol::Index> recovery_tokens(const LexicalGrammar &grammar) {
       !all_characters.result.intersects(all_separator_characters.result);
 
     if ((has_distinct_start && has_distinct_end) || has_no_separators)
-      result.insert(i);
+      result.insert(Symbol(i, Symbol::Terminal));
   }
 
   return result;
diff --git a/src/compiler/build_tables/recovery_tokens.h b/src/compiler/build_tables/recovery_tokens.h
index 4873b5a9..c97a8cfd 100644
--- a/src/compiler/build_tables/recovery_tokens.h
+++ b/src/compiler/build_tables/recovery_tokens.h
@@ -11,7 +11,7 @@ struct LexicalGrammar;
 
 namespace build_tables {
 
-std::set<rules::Symbol::Index> recovery_tokens(const LexicalGrammar &);
+std::set<rules::Symbol> recovery_tokens(const LexicalGrammar &);
 
 }  // namespace build_tables
 }  // namespace tree_sitter
diff --git a/src/compiler/generate_code/c_code.cc b/src/compiler/generate_code/c_code.cc
index b7058603..a5a9c17a 100644
--- a/src/compiler/generate_code/c_code.cc
+++ b/src/compiler/generate_code/c_code.cc
@@ -14,6 +14,7 @@
 
 namespace tree_sitter {
 namespace generate_code {
+
 using std::function;
 using std::map;
 using std::pair;
@@ -22,6 +23,7 @@ using std::string;
 using std::to_string;
 using std::vector;
 using util::escape_char;
+using rules::Symbol;
 
 static Variable EOF_ENTRY("end", VariableTypeNamed, rule_ptr());
 
@@ -73,9 +75,8 @@ class CCodeGenerator {
   const LexicalGrammar lexical_grammar;
   map<string, string> sanitized_names;
   vector<pair<size_t, ParseTableEntry>> parse_table_entries;
-  vector<pair<size_t, set<rules::Symbol>>> in_progress_symbols;
+  vector<set<Symbol::Index>> external_token_id_sets;
   size_t next_parse_action_list_index;
-  size_t next_in_progress_symbol_list_index;
 
  public:
   CCodeGenerator(string name, const ParseTable &parse_table,
@@ -87,19 +88,25 @@ class CCodeGenerator {
         lex_table(lex_table),
         syntax_grammar(syntax_grammar),
         lexical_grammar(lexical_grammar),
-        next_parse_action_list_index(0),
-        next_in_progress_symbol_list_index(0) {}
+        next_parse_action_list_index(0) {}
 
   string code() {
     buffer = "";
 
     add_includes();
-    add_state_and_symbol_counts();
+    add_warning_pragma();
+    add_stats();
     add_symbol_enum();
     add_symbol_names_list();
-    add_symbol_node_types_list();
+    add_symbol_metadata_list();
     add_lex_function();
-    add_lex_states_list();
+    add_lex_modes_list();
+
+    if (!syntax_grammar.external_tokens.empty())
+      add_external_token_enum();
+
+    add_external_token_symbol_map();
+    add_external_scan_modes_list();
     add_parse_table();
     add_parser_export();
 
@@ -112,10 +119,17 @@ class CCodeGenerator {
     line();
   }
 
-  void add_state_and_symbol_counts() {
+  void add_warning_pragma() {
+    line("#pragma GCC diagnostic push");
+    line("#pragma GCC diagnostic ignored \"-Wmissing-field-initializers\"");
+    line();
+  }
+
+  void add_stats() {
     line("#define STATE_COUNT " + to_string(parse_table.states.size()));
     line("#define SYMBOL_COUNT " + to_string(parse_table.symbols.size()));
     line("#define TOKEN_COUNT " + to_string(lexical_grammar.variables.size() + 1));
+    line("#define EXTERNAL_TOKEN_COUNT " + to_string(syntax_grammar.external_tokens.size()));
     line();
   }
 
@@ -124,7 +138,7 @@ class CCodeGenerator {
     indent([&]() {
       size_t i = 1;
       for (const auto &entry : parse_table.symbols) {
-        const rules::Symbol &symbol = entry.first;
+        const Symbol &symbol = entry.first;
         if (!symbol.is_built_in()) {
           line(symbol_id(symbol) + " = " + to_string(i) + ",");
           i++;
@@ -146,11 +160,11 @@ class CCodeGenerator {
     line();
   }
 
-  void add_symbol_node_types_list() {
+  void add_symbol_metadata_list() {
     line("static const TSSymbolMetadata ts_symbol_metadata[SYMBOL_COUNT] = {");
     indent([&]() {
       for (const auto &entry : parse_table.symbols) {
-        const rules::Symbol &symbol = entry.first;
+        const Symbol &symbol = entry.first;
         line("[" + symbol_id(symbol) + "] = {");
         indent([&]() {
           switch (symbol_type(symbol)) {
@@ -198,13 +212,80 @@ class CCodeGenerator {
     line();
   }
 
-  void add_lex_states_list() {
-    line("static TSStateId ts_lex_states[STATE_COUNT] = {");
+  void add_lex_modes_list() {
+    add_external_tokens_id({});
+
+    line("static TSLexMode ts_lex_modes[STATE_COUNT] = {");
     indent([&]() {
       size_t state_id = 0;
-      for (const auto &state : parse_table.states)
-        line("[" + to_string(state_id++) + "] = " +
-             to_string(state.lex_state_id) + ",");
+
+      for (const auto &state : parse_table.states) {
+        line("[" + to_string(state_id++) + "] = {.lex_state = ");
+        add(to_string(state.lex_state_id));
+
+        set<Symbol::Index> external_token_indices;
+        for (const auto &pair : state.terminal_entries) {
+          Symbol symbol = pair.first;
+          if (symbol.is_external())
+            external_token_indices.insert(symbol.index);
+        }
+
+        if (!external_token_indices.empty())
+          add(", .external_tokens = " + add_external_tokens_id(external_token_indices));
+        add("},");
+      }
+    });
+    line("};");
+    line();
+  }
+
+  string add_external_tokens_id(set<Symbol::Index> external_token_ids) {
+    for (size_t i = 0, n = external_token_id_sets.size(); i < n; i++)
+      if (external_token_id_sets[i] == external_token_ids)
+        return to_string(i);
+    external_token_id_sets.push_back(external_token_ids);
+    return to_string(external_token_id_sets.size() - 1);
+  }
+
+  void add_external_token_enum() {
+    line("enum {");
+    indent([&]() {
+      for (size_t i = 0; i < syntax_grammar.external_tokens.size(); i++)
+        line(external_token_id(i) + ",");
+    });
+    line("};");
+    line();
+  }
+
+  void add_external_token_symbol_map() {
+    line("TSSymbol ts_external_token_symbol_map[EXTERNAL_TOKEN_COUNT] = {");
+    indent([&]() {
+      for (size_t i = 0; i < syntax_grammar.external_tokens.size(); i++) {
+        line("[" + external_token_id(i) + "] = " + symbol_id(Symbol(i, Symbol::External)) + ",");
+      }
+    });
+    line("};");
+    line();
+  }
+
+  void add_external_scan_modes_list() {
+    line("static bool ts_external_token_lists[");
+    add(to_string(external_token_id_sets.size()));
+    add("][EXTERNAL_TOKEN_COUNT] = {");
+    indent([&]() {
+      size_t i = 0;
+      for (const auto &external_token_ids : external_token_id_sets) {
+        if (!external_token_ids.empty()) {
+          line("[" + to_string(i) + "] = {");
+          indent([&]() {
+            for (Symbol::Index id : external_token_ids) {
+              line("[" + external_token_id(id) + "] = true,");
+            }
+          });
+          line("},");
+        }
+        i++;
+      }
     });
     line("};");
     line();
@@ -214,9 +295,6 @@ class CCodeGenerator {
     add_parse_action_list_id(ParseTableEntry{ {}, false, false });
 
     size_t state_id = 0;
-    line("#pragma GCC diagnostic push");
-    line("#pragma GCC diagnostic ignored \"-Wmissing-field-initializers\"");
-    line();
     line("static unsigned short ts_parse_table[STATE_COUNT][SYMBOL_COUNT] = {");
 
     indent([&]() {
@@ -224,12 +302,12 @@ class CCodeGenerator {
         line("[" + to_string(state_id++) + "] = {");
         indent([&]() {
           for (const auto &entry : state.nonterminal_entries) {
-            line("[" + symbol_id(rules::Symbol(entry.first)) + "] = STATE(");
+            line("[" + symbol_id(Symbol(entry.first, Symbol::NonTerminal)) + "] = STATE(");
             add(to_string(entry.second));
             add("),");
           }
           for (const auto &entry : state.terminal_entries) {
-            line("[" + symbol_id(rules::Symbol(entry.first, true)) + "] = ACTIONS(");
+            line("[" + symbol_id(entry.first) + "] = ACTIONS(");
             add(to_string(add_parse_action_list_id(entry.second)));
             add("),");
           }
@@ -242,12 +320,37 @@ class CCodeGenerator {
     line();
     add_parse_action_list();
     line();
-    line("#pragma GCC diagnostic pop");
-    line();
   }
 
   void add_parser_export() {
-    line("EXPORT_LANGUAGE(ts_language_" + name + ");");
+    if (!syntax_grammar.external_tokens.empty()) {
+      string external_scanner_name = "ts_language_" + name + "_external_scanner";
+
+      line("void *" + external_scanner_name + "_create();");
+      line("bool " + external_scanner_name + "_scan();");
+      line("void " + external_scanner_name + "_destroy();");
+      line();
+
+      line("const TSLanguage *ts_language_" + name + "() {");
+      indent([&]() {
+        if (!syntax_grammar.external_tokens.empty()) {
+          line("GET_LANGUAGE(");
+          indent([&]() {
+            line(external_scanner_name + "_create,");
+            line(external_scanner_name + "_scan,");
+            line(external_scanner_name + "_destroy,");
+          });
+          line(");");
+        }
+      });
+      line("}");
+    } else {
+      line("const TSLanguage *ts_language_" + name + "() {");
+      indent([&]() {
+        line("GET_LANGUAGE();");
+      });
+      line("}");
+    }
     line();
   }
 
@@ -379,22 +482,13 @@ class CCodeGenerator {
     return result;
   }
 
-  size_t add_in_progress_symbol_list_id(const set<rules::Symbol> &symbols) {
-    for (const auto &pair : in_progress_symbols) {
-      if (pair.second == symbols) {
-        return pair.first;
-      }
-    }
-
-    size_t result = next_in_progress_symbol_list_index;
-    in_progress_symbols.push_back({ result, symbols });
-    next_in_progress_symbol_list_index += 1 + symbols.size();
-    return result;
-  }
-
   // Helper functions
 
-  string symbol_id(const rules::Symbol &symbol) {
+  string external_token_id(Symbol::Index index) {
+    return "ts_external_token_" + syntax_grammar.external_tokens[index];
+  }
+
+  string symbol_id(const Symbol &symbol) {
     if (symbol == rules::END_OF_INPUT())
       return "ts_builtin_sym_end";
 
@@ -411,25 +505,31 @@ class CCodeGenerator {
     }
   }
 
-  string symbol_name(const rules::Symbol &symbol) {
+  string symbol_name(const Symbol &symbol) {
     if (symbol == rules::END_OF_INPUT())
       return "END";
     return entry_for_symbol(symbol).first;
   }
 
-  VariableType symbol_type(const rules::Symbol &symbol) {
+  VariableType symbol_type(const Symbol &symbol) {
     if (symbol == rules::END_OF_INPUT())
       return VariableTypeHidden;
     return entry_for_symbol(symbol).second;
   }
 
-  pair<string, VariableType> entry_for_symbol(const rules::Symbol &symbol) {
-    if (symbol.is_token) {
-      const Variable &variable = lexical_grammar.variables[symbol.index];
-      return { variable.name, variable.type };
-    } else {
-      const SyntaxVariable &variable = syntax_grammar.variables[symbol.index];
-      return { variable.name, variable.type };
+  pair<string, VariableType> entry_for_symbol(const Symbol &symbol) {
+    switch (symbol.type) {
+      case Symbol::NonTerminal: {
+        const SyntaxVariable &variable = syntax_grammar.variables[symbol.index];
+        return { variable.name, variable.type };
+      }
+      case Symbol::Terminal: {
+        const Variable &variable = lexical_grammar.variables[symbol.index];
+        return { variable.name, variable.type };
+      }
+      case Symbol::External: {
+        return { syntax_grammar.external_tokens[symbol.index], VariableTypeAnonymous };
+      }
     }
   }
 
diff --git a/src/compiler/grammar.h b/src/compiler/grammar.h
index a8955c02..0a07280c 100644
--- a/src/compiler/grammar.h
+++ b/src/compiler/grammar.h
@@ -12,6 +12,7 @@ struct Grammar {
   std::vector<std::pair<std::string, rule_ptr>> rules;
   std::vector<rule_ptr> extra_tokens;
   std::vector<std::vector<std::string>> expected_conflicts;
+  std::vector<std::string> external_tokens;
 };
 
 }  // namespace tree_sitter
diff --git a/src/compiler/parse_grammar.cc b/src/compiler/parse_grammar.cc
index 185d919b..cc5cff55 100644
--- a/src/compiler/parse_grammar.cc
+++ b/src/compiler/parse_grammar.cc
@@ -119,6 +119,16 @@ ParseRuleResult parse_rule(json_value *rule_json) {
     }
   }
 
+  if (type == "EXTERNAL_TOKEN") {
+    json_value token_name_json = rule_json->operator[]("name");
+    if (token_name_json.type != json_string) {
+      error_message = "External token name must be a string";
+      goto error;
+    }
+
+    return { external_token(token_name_json.u.string.ptr), "" };
+  }
+
   if (type == "PATTERN") {
     json_value value_json = rule_json->operator[]("value");
     if (value_json.type == json_string) {
@@ -210,7 +220,7 @@ ParseGrammarResult parse_grammar(const string &input) {
   string error_message;
   string name;
   Grammar grammar;
-  json_value name_json, rules_json, extras_json, conflicts_json;
+  json_value name_json, rules_json, extras_json, conflicts_json, external_tokens_json;
 
   json_settings settings = { 0, json_enable_comments, 0, 0, 0, 0 };
   char parse_error[json_error_max];
@@ -302,6 +312,25 @@ ParseGrammarResult parse_grammar(const string &input) {
     }
   }
 
+  external_tokens_json = grammar_json->operator[]("externals");
+  if (external_tokens_json.type != json_none) {
+    if (external_tokens_json.type != json_array) {
+      error_message = "External tokens must be an array";
+      goto error;
+    }
+
+    for (size_t i = 0, length = external_tokens_json.u.array.length; i < length; i++) {
+      json_value *token_name_json = external_tokens_json.u.array.values[i];
+      if (token_name_json->type != json_string) {
+        error_message = "External token values must be strings";
+        goto error;
+      }
+
+      string token_name = token_name_json->u.string.ptr;
+      grammar.external_tokens.push_back(token_name);
+    }
+  }
+
   json_value_free(grammar_json);
   return { name, grammar, "" };
 
diff --git a/src/compiler/parse_table.cc b/src/compiler/parse_table.cc
index e6e4badd..a04eec8c 100644
--- a/src/compiler/parse_table.cc
+++ b/src/compiler/parse_table.cc
@@ -1,6 +1,7 @@
 #include "compiler/parse_table.h"
 #include <string>
 #include "compiler/precedence_range.h"
+#include "compiler/rules/built_in_symbols.h"
 
 namespace tree_sitter {
 
@@ -28,7 +29,7 @@ ParseAction::ParseAction()
       extra(false),
       fragile(false),
       state_index(-1),
-      symbol(Symbol(-1)),
+      symbol(rules::NONE()),
       consumed_symbol_count(0),
       production(nullptr) {}
 
@@ -43,11 +44,11 @@ ParseAction ParseAction::Accept() {
 }
 
 ParseAction ParseAction::Shift(ParseStateId state_index) {
-  return ParseAction(ParseActionTypeShift, state_index, Symbol(-1), 0, nullptr);
+  return ParseAction(ParseActionTypeShift, state_index, rules::NONE(), 0, nullptr);
 }
 
 ParseAction ParseAction::Recover(ParseStateId state_index) {
-  return ParseAction(ParseActionTypeRecover, state_index, Symbol(-1), 0,
+  return ParseAction(ParseActionTypeRecover, state_index, rules::NONE(), 0,
                      nullptr);
 }
 
@@ -150,9 +151,7 @@ bool ParseState::has_shift_action() const {
 set<Symbol> ParseState::expected_inputs() const {
   set<Symbol> result;
   for (auto &entry : terminal_entries)
-    result.insert(Symbol(entry.first, true));
-  for (auto &entry : nonterminal_entries)
-    result.insert(Symbol(entry.first, false));
+    result.insert(entry.first);
   return result;
 }
 
@@ -182,33 +181,24 @@ ParseStateId ParseTable::add_state() {
   return states.size() - 1;
 }
 
-ParseAction &ParseTable::set_terminal_action(ParseStateId state_id,
-                                             Symbol::Index index,
-                                             ParseAction action) {
-  states[state_id].terminal_entries[index].actions.clear();
-  return add_terminal_action(state_id, index, action);
-}
-
 ParseAction &ParseTable::add_terminal_action(ParseStateId state_id,
-                                             Symbol::Index index,
+                                             Symbol lookahead,
                                              ParseAction action) {
-  Symbol symbol(index, true);
   if (action.type == ParseActionTypeShift && action.extra)
-    symbols[symbol].extra = true;
+    symbols[lookahead].extra = true;
   else
-    symbols[symbol].structural = true;
+    symbols[lookahead].structural = true;
 
-  ParseTableEntry &entry = states[state_id].terminal_entries[index];
+  ParseTableEntry &entry = states[state_id].terminal_entries[lookahead];
   entry.actions.push_back(action);
   return *entry.actions.rbegin();
 }
 
 void ParseTable::set_nonterminal_action(ParseStateId state_id,
-                                        Symbol::Index index,
+                                        Symbol::Index lookahead,
                                         ParseStateId next_state_id) {
-  Symbol symbol(index, false);
-  symbols[symbol].structural = true;
-  states[state_id].nonterminal_entries[index] = next_state_id;
+  symbols[Symbol(lookahead, Symbol::NonTerminal)].structural = true;
+  states[state_id].nonterminal_entries[lookahead] = next_state_id;
 }
 
 static bool has_entry(const ParseState &state, const ParseTableEntry &entry) {
@@ -226,12 +216,12 @@ bool ParseTable::merge_state(size_t i, size_t j) {
     return false;
 
   for (auto &entry : state.terminal_entries) {
-    Symbol::Index index = entry.first;
+    Symbol lookahead = entry.first;
     const vector<ParseAction> &actions = entry.second.actions;
 
-    const auto &other_entry = other.terminal_entries.find(index);
+    const auto &other_entry = other.terminal_entries.find(lookahead);
     if (other_entry == other.terminal_entries.end()) {
-      if (mergeable_symbols.count(index) == 0 && !Symbol::is_built_in(index))
+      if (mergeable_symbols.count(lookahead) == 0 && !lookahead.is_built_in())
         return false;
       if (actions.back().type != ParseActionTypeReduce)
         return false;
@@ -242,25 +232,25 @@ bool ParseTable::merge_state(size_t i, size_t j) {
     }
   }
 
-  set<Symbol::Index> symbols_to_merge;
+  set<Symbol> symbols_to_merge;
 
   for (auto &entry : other.terminal_entries) {
-    Symbol::Index index = entry.first;
+    Symbol lookahead = entry.first;
     const vector<ParseAction> &actions = entry.second.actions;
 
-    if (!state.terminal_entries.count(index)) {
-      if (mergeable_symbols.count(index) == 0 && !Symbol::is_built_in(index))
+    if (!state.terminal_entries.count(lookahead)) {
+      if (mergeable_symbols.count(lookahead) == 0 && !lookahead.is_built_in())
         return false;
       if (actions.back().type != ParseActionTypeReduce)
         return false;
       if (!has_entry(state, entry.second))
         return false;
-      symbols_to_merge.insert(index);
+      symbols_to_merge.insert(lookahead);
     }
   }
 
-  for (const Symbol::Index &index : symbols_to_merge)
-    state.terminal_entries[index] = other.terminal_entries.find(index)->second;
+  for (const Symbol &lookahead : symbols_to_merge)
+    state.terminal_entries[lookahead] = other.terminal_entries.find(lookahead)->second;
 
   return true;
 }
diff --git a/src/compiler/parse_table.h b/src/compiler/parse_table.h
index 59eee4a8..79eec4fc 100644
--- a/src/compiler/parse_table.h
+++ b/src/compiler/parse_table.h
@@ -76,7 +76,7 @@ class ParseState {
   void each_referenced_state(std::function<void(ParseStateId *)>);
   bool has_shift_action() const;
 
-  std::map<rules::Symbol::Index, ParseTableEntry> terminal_entries;
+  std::map<rules::Symbol, ParseTableEntry> terminal_entries;
   std::map<rules::Symbol::Index, ParseStateId> nonterminal_entries;
   LexStateId lex_state_id;
   size_t shift_actions_signature;
@@ -91,15 +91,14 @@ class ParseTable {
  public:
   std::set<rules::Symbol> all_symbols() const;
   ParseStateId add_state();
-  ParseAction &add_terminal_action(ParseStateId state_id, int, ParseAction);
-  ParseAction &set_terminal_action(ParseStateId state_id, int index, ParseAction);
-  void set_nonterminal_action(ParseStateId state_id, int index, ParseStateId);
+  ParseAction &add_terminal_action(ParseStateId state_id, rules::Symbol, ParseAction);
+  void set_nonterminal_action(ParseStateId, rules::Symbol::Index, ParseStateId);
   bool merge_state(size_t i, size_t j);
 
   std::vector<ParseState> states;
   std::map<rules::Symbol, ParseTableSymbolMetadata> symbols;
 
-  std::set<rules::Symbol::Index> mergeable_symbols;
+  std::set<rules::Symbol> mergeable_symbols;
 };
 
 }  // namespace tree_sitter
diff --git a/src/compiler/prepare_grammar/expand_repeats.cc b/src/compiler/prepare_grammar/expand_repeats.cc
index 7963e94b..331c9cea 100644
--- a/src/compiler/prepare_grammar/expand_repeats.cc
+++ b/src/compiler/prepare_grammar/expand_repeats.cc
@@ -39,7 +39,7 @@ class ExpandRepeats : public rules::IdentityRuleFn {
     rule_ptr inner_rule = apply(rule->content);
     size_t index = aux_rules.size();
     string helper_rule_name = rule_name + "_repeat" + to_string(++repeat_count);
-    Symbol repeat_symbol(offset + index);
+    Symbol repeat_symbol(offset + index, Symbol::NonTerminal);
     existing_repeats.push_back({ rule->copy(), repeat_symbol });
     aux_rules.push_back(
       Variable(helper_rule_name, VariableTypeAuxiliary,
@@ -65,6 +65,7 @@ InitialSyntaxGrammar expand_repeats(const InitialSyntaxGrammar &grammar) {
   result.variables = grammar.variables;
   result.extra_tokens = grammar.extra_tokens;
   result.expected_conflicts = grammar.expected_conflicts;
+  result.external_tokens = grammar.external_tokens;
 
   ExpandRepeats expander(result.variables.size());
   for (auto &variable : result.variables)
diff --git a/src/compiler/prepare_grammar/extract_tokens.cc b/src/compiler/prepare_grammar/extract_tokens.cc
index bf7ac514..dcf88e53 100644
--- a/src/compiler/prepare_grammar/extract_tokens.cc
+++ b/src/compiler/prepare_grammar/extract_tokens.cc
@@ -11,6 +11,7 @@
 #include "compiler/rules/symbol.h"
 #include "compiler/rules/string.h"
 #include "compiler/rules/metadata.h"
+#include "compiler/rules/external_token.h"
 #include "compiler/rules/pattern.h"
 #include "compiler/prepare_grammar/token_description.h"
 #include "compiler/prepare_grammar/is_token.h"
@@ -38,7 +39,7 @@ class SymbolReplacer : public rules::IdentityRuleFn {
   map<Symbol, Symbol> replacements;
 
   Symbol replace_symbol(const Symbol &symbol) {
-    if (symbol.is_built_in() || symbol.is_token)
+    if (!symbol.is_non_terminal())
       return symbol;
 
     auto replacement_pair = replacements.find(symbol);
@@ -49,7 +50,7 @@ class SymbolReplacer : public rules::IdentityRuleFn {
     for (const auto &pair : replacements)
       if (pair.first.index < symbol.index)
         new_index--;
-    return Symbol(new_index);
+    return Symbol(new_index, Symbol::NonTerminal);
   }
 };
 
@@ -60,14 +61,14 @@ class TokenExtractor : public rules::IdentityRuleFn {
     for (size_t i = 0; i < tokens.size(); i++)
       if (tokens[i].rule->operator==(*input)) {
         token_usage_counts[i]++;
-        return make_shared<Symbol>(i, true);
+        return make_shared<Symbol>(i, Symbol::Terminal);
       }
 
     rule_ptr rule = input->copy();
     size_t index = tokens.size();
     tokens.push_back(Variable(token_description(rule), entry_type, rule));
     token_usage_counts.push_back(1);
-    return make_shared<Symbol>(index, true);
+    return make_shared<Symbol>(index, Symbol::Terminal);
   }
 
   rule_ptr apply_to(const rules::String *rule) {
@@ -78,6 +79,10 @@ class TokenExtractor : public rules::IdentityRuleFn {
     return apply_to_token(rule, VariableTypeAuxiliary);
   }
 
+  rule_ptr apply_to(const rules::ExternalToken *rule) {
+    return apply_to_token(rule, VariableTypeAuxiliary);
+  }
+
   rule_ptr apply_to(const rules::Metadata *rule) {
     if (rule->params.is_token)
       return apply_to_token(rule->rule.get(), VariableTypeAuxiliary);
@@ -90,7 +95,7 @@ class TokenExtractor : public rules::IdentityRuleFn {
   vector<Variable> tokens;
 };
 
-static CompileError ubiq_token_err(const string &message) {
+static CompileError extra_token_error(const string &message) {
   return CompileError(TSCompileErrorTypeInvalidUbiquitousToken,
                       "Not a token: " + message);
 }
@@ -122,11 +127,10 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
   size_t i = 0;
   for (const Variable &variable : processed_variables) {
     auto symbol = variable.rule->as<Symbol>();
-    if (symbol && symbol->is_token && !symbol->is_built_in() &&
-        extractor.token_usage_counts[symbol->index] == 1) {
+    if (symbol && symbol->is_token() && extractor.token_usage_counts[symbol->index] == 1) {
       lexical_grammar.variables[symbol->index].type = variable.type;
       lexical_grammar.variables[symbol->index].name = variable.name;
-      symbol_replacer.replacements.insert({ Symbol(i), *symbol });
+      symbol_replacer.replacements.insert({ Symbol(i, Symbol::NonTerminal), *symbol });
     } else {
       syntax_grammar.variables.push_back(variable);
     }
@@ -158,7 +162,7 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
     bool used_elsewhere_in_grammar = false;
     for (const Variable &variable : lexical_grammar.variables) {
       if (variable.rule->operator==(*rule)) {
-        syntax_grammar.extra_tokens.insert(Symbol(i, true));
+        syntax_grammar.extra_tokens.insert(Symbol(i, Symbol::Terminal));
         used_elsewhere_in_grammar = true;
       }
       i++;
@@ -175,17 +179,20 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
     auto symbol = rule->as<Symbol>();
     if (!symbol)
       return make_tuple(syntax_grammar, lexical_grammar,
-                        ubiq_token_err(rule->to_string()));
+                        extra_token_error(rule->to_string()));
 
     Symbol new_symbol = symbol_replacer.replace_symbol(*symbol);
-    if (!new_symbol.is_token)
+    if (!new_symbol.is_token()) {
       return make_tuple(
         syntax_grammar, lexical_grammar,
-        ubiq_token_err(syntax_grammar.variables[new_symbol.index].name));
+        extra_token_error(syntax_grammar.variables[new_symbol.index].name));
+    }
 
     syntax_grammar.extra_tokens.insert(new_symbol);
   }
 
+  syntax_grammar.external_tokens = grammar.external_tokens;
+
   return make_tuple(syntax_grammar, lexical_grammar, CompileError::none());
 }
 
diff --git a/src/compiler/prepare_grammar/flatten_grammar.cc b/src/compiler/prepare_grammar/flatten_grammar.cc
index ddba9a5f..8ac0e33c 100644
--- a/src/compiler/prepare_grammar/flatten_grammar.cc
+++ b/src/compiler/prepare_grammar/flatten_grammar.cc
@@ -92,6 +92,7 @@ pair<SyntaxGrammar, CompileError> flatten_grammar(const InitialSyntaxGrammar &gr
   SyntaxGrammar result;
   result.expected_conflicts = grammar.expected_conflicts;
   result.extra_tokens = grammar.extra_tokens;
+  result.external_tokens = grammar.external_tokens;
 
   bool is_start = true;
   for (const Variable &variable : grammar.variables) {
diff --git a/src/compiler/prepare_grammar/initial_syntax_grammar.h b/src/compiler/prepare_grammar/initial_syntax_grammar.h
index fe1ff37d..d4b1c8d5 100644
--- a/src/compiler/prepare_grammar/initial_syntax_grammar.h
+++ b/src/compiler/prepare_grammar/initial_syntax_grammar.h
@@ -1,13 +1,12 @@
 #ifndef COMPILER_PREPARE_GRAMMAR_INITIAL_SYNTAX_GRAMMAR_H_
 #define COMPILER_PREPARE_GRAMMAR_INITIAL_SYNTAX_GRAMMAR_H_
 
-#include <vector>
-#include <string>
 #include <set>
+#include <vector>
 #include "tree_sitter/compiler.h"
 #include "compiler/rules/symbol.h"
-#include "compiler/variable.h"
 #include "compiler/syntax_grammar.h"
+#include "compiler/variable.h"
 
 namespace tree_sitter {
 namespace prepare_grammar {
@@ -16,6 +15,7 @@ struct InitialSyntaxGrammar {
   std::vector<Variable> variables;
   std::set<rules::Symbol> extra_tokens;
   std::set<ConflictSet> expected_conflicts;
+  std::vector<std::string> external_tokens;
 };
 
 }  // namespace prepare_grammar
diff --git a/src/compiler/prepare_grammar/intern_symbols.cc b/src/compiler/prepare_grammar/intern_symbols.cc
index cd01719c..f08edf5e 100644
--- a/src/compiler/prepare_grammar/intern_symbols.cc
+++ b/src/compiler/prepare_grammar/intern_symbols.cc
@@ -7,6 +7,7 @@
 #include "compiler/rules/visitor.h"
 #include "compiler/rules/blank.h"
 #include "compiler/rules/named_symbol.h"
+#include "compiler/rules/external_token.h"
 #include "compiler/rules/symbol.h"
 
 namespace tree_sitter {
@@ -17,6 +18,7 @@ using std::vector;
 using std::set;
 using std::pair;
 using std::make_shared;
+using rules::Symbol;
 
 class InternSymbols : public rules::IdentityRuleFn {
   using rules::IdentityRuleFn::apply_to;
@@ -30,17 +32,34 @@ class InternSymbols : public rules::IdentityRuleFn {
     return result;
   }
 
+  rule_ptr apply_to(const rules::ExternalToken *rule) {
+    auto result = symbol_for_external_token(rule->name);
+    if (!result.get()) {
+      missing_external_token_name = rule->name;
+      return rules::Blank::build();
+    }
+    return result;
+  }
+
  public:
   std::shared_ptr<rules::Symbol> symbol_for_rule_name(string rule_name) {
     for (size_t i = 0; i < grammar.rules.size(); i++)
       if (grammar.rules[i].first == rule_name)
-        return make_shared<rules::Symbol>(i);
+        return make_shared<Symbol>(i, Symbol::NonTerminal);
+    return nullptr;
+  }
+
+  std::shared_ptr<rules::Symbol> symbol_for_external_token(string name) {
+    for (size_t i = 0; i < grammar.external_tokens.size(); i++)
+      if (grammar.external_tokens[i] == name)
+        return make_shared<rules::Symbol>(i, Symbol::External);
     return nullptr;
   }
 
   explicit InternSymbols(const Grammar &grammar) : grammar(grammar) {}
   const Grammar grammar;
   string missing_rule_name;
+  string missing_external_token_name;
 };
 
 CompileError missing_rule_error(string rule_name) {
@@ -48,14 +67,22 @@ CompileError missing_rule_error(string rule_name) {
                       "Undefined rule '" + rule_name + "'");
 }
 
+CompileError missing_external_token_error(string token_name) {
+  return CompileError(TSCompileErrorTypeUndefinedSymbol,
+                      "Undefined external token '" + token_name + "'");
+}
+
 pair<InternedGrammar, CompileError> intern_symbols(const Grammar &grammar) {
   InternedGrammar result;
+  result.external_tokens = grammar.external_tokens;
   InternSymbols interner(grammar);
 
   for (auto &pair : grammar.rules) {
     auto new_rule = interner.apply(pair.second);
     if (!interner.missing_rule_name.empty())
       return { result, missing_rule_error(interner.missing_rule_name) };
+    if (!interner.missing_external_token_name.empty())
+      return { result, missing_external_token_error(interner.missing_external_token_name) };
 
     result.variables.push_back(Variable(
       pair.first, pair.first[0] == '_' ? VariableTypeHidden : VariableTypeNamed,
@@ -66,6 +93,8 @@ pair<InternedGrammar, CompileError> intern_symbols(const Grammar &grammar) {
     auto new_rule = interner.apply(rule);
     if (!interner.missing_rule_name.empty())
       return { result, missing_rule_error(interner.missing_rule_name) };
+    if (!interner.missing_external_token_name.empty())
+      return { result, missing_external_token_error(interner.missing_external_token_name) };
     result.extra_tokens.push_back(new_rule);
   }
 
diff --git a/src/compiler/prepare_grammar/interned_grammar.h b/src/compiler/prepare_grammar/interned_grammar.h
index c08c07dd..7b425c3a 100644
--- a/src/compiler/prepare_grammar/interned_grammar.h
+++ b/src/compiler/prepare_grammar/interned_grammar.h
@@ -15,6 +15,7 @@ struct InternedGrammar {
   std::vector<Variable> variables;
   std::vector<rule_ptr> extra_tokens;
   std::set<ConflictSet> expected_conflicts;
+  std::vector<std::string> external_tokens;
 };
 
 }  // namespace prepare_grammar
diff --git a/src/compiler/rules.h b/src/compiler/rules.h
index d98a719a..8a3f4097 100644
--- a/src/compiler/rules.h
+++ b/src/compiler/rules.h
@@ -22,6 +22,7 @@ rule_ptr prec_left(int precedence, const rule_ptr &);
 rule_ptr prec_right(const rule_ptr &);
 rule_ptr prec_right(int precedence, const rule_ptr &);
 rule_ptr token(const rule_ptr &rule);
+rule_ptr external_token(const std::string &);
 
 }  // namespace std
 
diff --git a/src/compiler/rules/built_in_symbols.cc b/src/compiler/rules/built_in_symbols.cc
index a7a877ec..b3f7cd66 100644
--- a/src/compiler/rules/built_in_symbols.cc
+++ b/src/compiler/rules/built_in_symbols.cc
@@ -4,15 +4,15 @@ namespace tree_sitter {
 namespace rules {
 
 Symbol END_OF_INPUT() {
-  return Symbol(-1, true);
+  return Symbol(-1, Symbol::Terminal);
 }
 
 Symbol START() {
-  return Symbol(-2);
+  return Symbol(-2, Symbol::NonTerminal);
 }
 
 Symbol NONE() {
-  return Symbol(-3);
+  return Symbol(-3, Symbol::NonTerminal);
 }
 
 }  // namespace rules
diff --git a/src/compiler/rules/external_token.cc b/src/compiler/rules/external_token.cc
new file mode 100644
index 00000000..d8487b0e
--- /dev/null
+++ b/src/compiler/rules/external_token.cc
@@ -0,0 +1,39 @@
+#include "compiler/rules/external_token.h"
+#include <string>
+#include "compiler/rules/visitor.h"
+
+namespace tree_sitter {
+namespace rules {
+
+using std::string;
+using std::hash;
+
+ExternalToken::ExternalToken(const string &name) : name(name) {}
+
+rule_ptr ExternalToken::build(const string &name) {
+  return std::make_shared<ExternalToken>(name);
+}
+
+bool ExternalToken::operator==(const Rule &rule) const {
+  auto other = rule.as<ExternalToken>();
+  return other && other->name == name;
+}
+
+size_t ExternalToken::hash_code() const {
+  return hash<string>()(name);
+}
+
+rule_ptr ExternalToken::copy() const {
+  return std::make_shared<ExternalToken>(*this);
+}
+
+string ExternalToken::to_string() const {
+  return string("(sym '") + name + "')";
+}
+
+void ExternalToken::accept(Visitor *visitor) const {
+  visitor->visit(this);
+}
+
+}  // namespace rules
+}  // namespace tree_sitter
diff --git a/src/compiler/rules/external_token.h b/src/compiler/rules/external_token.h
new file mode 100644
index 00000000..cec1a847
--- /dev/null
+++ b/src/compiler/rules/external_token.h
@@ -0,0 +1,27 @@
+#ifndef COMPILER_RULES_EXTERNAL_TOKEN_H_
+#define COMPILER_RULES_EXTERNAL_TOKEN_H_
+
+#include <string>
+#include "compiler/rule.h"
+
+namespace tree_sitter {
+namespace rules {
+
+class ExternalToken : public Rule {
+ public:
+  explicit ExternalToken(const std::string &);
+  static rule_ptr build(const std::string &);
+
+  bool operator==(const Rule &other) const;
+  size_t hash_code() const;
+  rule_ptr copy() const;
+  std::string to_string() const;
+  void accept(Visitor *visitor) const;
+
+  std::string name;
+};
+
+}  // namespace rules
+}  // namespace tree_sitter
+
+#endif  // COMPILER_RULES_EXTERNAL_TOKEN_H_
diff --git a/src/compiler/rules/rules.cc b/src/compiler/rules/rules.cc
index fdb0ebdf..73c37284 100644
--- a/src/compiler/rules/rules.cc
+++ b/src/compiler/rules/rules.cc
@@ -13,6 +13,7 @@
 #include "compiler/rules/pattern.h"
 #include "compiler/rules/character_set.h"
 #include "compiler/rules/repeat.h"
+#include "compiler/rules/external_token.h"
 #include "compiler/rules/built_in_symbols.h"
 
 namespace tree_sitter {
@@ -105,4 +106,8 @@ rule_ptr token(const rule_ptr &rule) {
   return metadata(rule, params);
 }
 
+rule_ptr external_token(const string &name) {
+  return rules::ExternalToken::build(name);
+}
+
 }  // namespace tree_sitter
diff --git a/src/compiler/rules/symbol.cc b/src/compiler/rules/symbol.cc
index f85b09c7..478de7cf 100644
--- a/src/compiler/rules/symbol.cc
+++ b/src/compiler/rules/symbol.cc
@@ -11,12 +11,10 @@ using std::string;
 using std::to_string;
 using util::hash_combine;
 
-Symbol::Symbol(Symbol::Index index) : index(index), is_token(false) {}
-
-Symbol::Symbol(Symbol::Index index, bool is_token) : index(index), is_token(is_token) {}
+Symbol::Symbol(Symbol::Index index, Symbol::Type type) : index(index), type(type) {}
 
 bool Symbol::operator==(const Symbol &other) const {
-  return (other.index == index) && (other.is_token == is_token);
+  return (other.index == index) && (other.type == type);
 }
 
 bool Symbol::operator==(const Rule &rule) const {
@@ -27,7 +25,7 @@ bool Symbol::operator==(const Rule &rule) const {
 size_t Symbol::hash_code() const {
   size_t result = 0;
   hash_combine(&result, index);
-  hash_combine(&result, is_token);
+  hash_combine<int>(&result, type);
   return result;
 }
 
@@ -36,14 +34,20 @@ rule_ptr Symbol::copy() const {
 }
 
 string Symbol::to_string() const {
-  string name = is_token ? "token" : "sym";
-  return "(" + name + " " + std::to_string(index) + ")";
+  switch (type) {
+    case Symbol::Terminal:
+      return "(terminal " + std::to_string(index) + ")";
+    case Symbol::NonTerminal:
+      return "(non-terminal " + std::to_string(index) + ")";
+    case Symbol::External:
+      return "(external " + std::to_string(index) + ")";
+  }
 }
 
 bool Symbol::operator<(const Symbol &other) const {
-  if (is_token && !other.is_token)
+  if (type < other.type)
     return true;
-  if (!is_token && other.is_token)
+  if (other.type < type)
     return false;
   return (index < other.index);
 }
@@ -56,6 +60,18 @@ bool Symbol::is_built_in() const {
   return is_built_in(index);
 }
 
+bool Symbol::is_token() const {
+  return type == Symbol::Terminal;
+}
+
+bool Symbol::is_external() const {
+  return type == Symbol::External;
+}
+
+bool Symbol::is_non_terminal() const {
+  return type == Symbol::NonTerminal;
+}
+
 void Symbol::accept(Visitor *visitor) const {
   visitor->visit(this);
 }
diff --git a/src/compiler/rules/symbol.h b/src/compiler/rules/symbol.h
index 4ae9ece3..46272dc5 100644
--- a/src/compiler/rules/symbol.h
+++ b/src/compiler/rules/symbol.h
@@ -11,9 +11,13 @@ class Symbol : public Rule {
  public:
   typedef int Index;
 
+  typedef enum {
+    Terminal,
+    NonTerminal,
+    External,
+  } Type;
 
-  explicit Symbol(Index index);
-  Symbol(Index index, bool is_token);
+  Symbol(Index index, Type type);
 
   bool operator==(const Symbol &other) const;
   bool operator==(const Rule &other) const;
@@ -26,9 +30,12 @@ class Symbol : public Rule {
   bool operator<(const Symbol &other) const;
   static bool is_built_in(Index);
   bool is_built_in() const;
+  bool is_token() const;
+  bool is_external() const;
+  bool is_non_terminal() const;
 
   Index index;
-  bool is_token;
+  Type type;
 };
 
 }  // namespace rules
diff --git a/src/compiler/rules/visitor.h b/src/compiler/rules/visitor.h
index b8301183..c75e31dc 100644
--- a/src/compiler/rules/visitor.h
+++ b/src/compiler/rules/visitor.h
@@ -16,6 +16,7 @@ class String;
 class Symbol;
 class Pattern;
 class Metadata;
+class ExternalToken;
 
 class Visitor {
  public:
@@ -29,6 +30,7 @@ class Visitor {
   virtual void visit(const String *rule) = 0;
   virtual void visit(const NamedSymbol *rule) = 0;
   virtual void visit(const Symbol *rule) = 0;
+  virtual void visit(const ExternalToken *rule) = 0;
   virtual ~Visitor();
 };
 
@@ -86,6 +88,10 @@ class RuleFn : private Visitor {
     return default_apply((const Rule *)rule);
   }
 
+  virtual T apply_to(const ExternalToken *rule) {
+    return default_apply((const Rule *)rule);
+  }
+
   void visit(const Blank *rule) {
     value_ = apply_to(rule);
   }
@@ -126,6 +132,10 @@ class RuleFn : private Visitor {
     value_ = apply_to(rule);
   }
 
+  void visit(const ExternalToken *rule) {
+    value_ = apply_to(rule);
+  }
+
  private:
   T value_;
 };
@@ -170,6 +180,9 @@ class RuleFn<void> : private Visitor {
   virtual void apply_to(const Symbol *rule) {
     return default_apply((const Rule *)rule);
   }
+  virtual void apply_to(const ExternalToken *rule) {
+    return default_apply((const Rule *)rule);
+  }
 
   void visit(const Blank *rule) {
     apply_to(rule);
@@ -201,6 +214,9 @@ class RuleFn<void> : private Visitor {
   void visit(const Symbol *rule) {
     apply_to(rule);
   }
+  void visit(const ExternalToken *rule) {
+    apply_to(rule);
+  }
 };
 
 class IdentityRuleFn : public RuleFn<rule_ptr> {
diff --git a/src/compiler/syntax_grammar.cc b/src/compiler/syntax_grammar.cc
index 706ec828..535ddcda 100644
--- a/src/compiler/syntax_grammar.cc
+++ b/src/compiler/syntax_grammar.cc
@@ -13,8 +13,6 @@ using std::pair;
 using std::vector;
 using std::set;
 
-static const vector<Production> NO_PRODUCTIONS;
-
 SyntaxVariable::SyntaxVariable(const string &name, VariableType type,
                                const vector<Production> &productions)
     : name(name), productions(productions), type(type) {}
@@ -28,13 +26,4 @@ bool ProductionStep::operator==(const ProductionStep &other) const {
          associativity == other.associativity;
 }
 
-const vector<Production> &SyntaxGrammar::productions(
-  const rules::Symbol &symbol) const {
-  if (symbol.is_built_in() || symbol.is_token) {
-    return NO_PRODUCTIONS;
-  } else {
-    return variables[symbol.index].productions;
-  }
-}
-
 }  // namespace tree_sitter
diff --git a/src/compiler/syntax_grammar.h b/src/compiler/syntax_grammar.h
index 89745fa5..e34ddbbe 100644
--- a/src/compiler/syntax_grammar.h
+++ b/src/compiler/syntax_grammar.h
@@ -33,11 +33,10 @@ struct SyntaxVariable {
 typedef std::set<rules::Symbol> ConflictSet;
 
 struct SyntaxGrammar {
-  const std::vector<Production> &productions(const rules::Symbol &) const;
-
   std::vector<SyntaxVariable> variables;
   std::set<rules::Symbol> extra_tokens;
   std::set<ConflictSet> expected_conflicts;
+  std::vector<std::string> external_tokens;
 };
 
 }  // namespace tree_sitter
diff --git a/src/runtime/parser.c b/src/runtime/parser.c
index 2f5879a4..c37b7871 100644
--- a/src/runtime/parser.c
+++ b/src/runtime/parser.c
@@ -161,7 +161,7 @@ static void parser__pop_reusable_node_leaf(ReusableNode *reusable_node) {
 
 static bool parser__can_reuse(Parser *self, TSStateId state, Tree *tree,
                               TableEntry *table_entry) {
-  if (tree->first_leaf.lex_state == self->language->lex_states[state])
+  if (tree->first_leaf.lex_state == self->language->lex_modes[state].lex_state)
     return true;
   if (!table_entry->is_reusable)
     return false;
@@ -209,7 +209,7 @@ static bool parser__condense_stack(Parser *self) {
 }
 
 static Tree *parser__lex(Parser *self, TSStateId parse_state) {
-  TSStateId start_state = self->language->lex_states[parse_state];
+  TSStateId start_state = self->language->lex_modes[parse_state].lex_state;
   TSStateId current_state = start_state;
   Length start_position = self->lexer.current_position;
   LOG("lex state:%d", start_state);
@@ -729,6 +729,9 @@ static void parser__start(Parser *self, TSInput input, Tree *previous_tree) {
     LOG("new_parse");
   }
 
+  if (self->language->external_scanner.create)
+    self->language->external_scanner.create();
+
   ts_lexer_set_input(&self->lexer, input);
   ts_stack_clear(self->stack);
   self->reusable_node = (ReusableNode){ previous_tree, 0 };