Expand regex/string rules as part of grammar preparation

This makes it possible to report regex parsing errors (e.g. an unmatched paren) to the caller as a GrammarError while the grammar is being prepared.
2014-05-19 20:54:59 -07:00 · 2014-05-19 20:54:59 -07:00 · 649f200831
commit 649f200831
parent 5245bc01fe
26 changed files with 883 additions and 651 deletions
--- a/examples/grammars/json.cc
+++ b/examples/grammars/json.cc
@@ -19,7 +19,7 @@ namespace tree_sitter_examples {
            str(":"),
            sym("value") })))) },
        { "array", in_brackets(comma_sep(err(sym("value")))) },
-        { "string", pattern("\"([^\"]|\\\\\")+\"") },
+        { "string", pattern("\"([^\"]|\\\\\")*\"") },
        { "number", pattern("\\d+(\\.\\d+)?") },
        { "null", keyword("null") },
        { "true", keyword("true") },
--- a/examples/parsers/arithmetic.c
+++ b/examples/parsers/arithmetic.c
@@ -34,13 +34,13 @@ SYMBOL_NAMES = {
    [ts_builtin_sym_end] = "end",
    [ts_sym_number] = "number",
    [ts_sym_variable] = "variable",
-    [ts_aux_sym_token0] = "'+'",
-    [ts_aux_sym_token1] = "'-'",
-    [ts_aux_sym_token2] = "'*'",
-    [ts_aux_sym_token3] = "'/'",
-    [ts_aux_sym_token4] = "'^'",
-    [ts_aux_sym_token5] = "'('",
-    [ts_aux_sym_token6] = "')'",
+    [ts_aux_sym_token0] = "",
+    [ts_aux_sym_token1] = "",
+    [ts_aux_sym_token2] = "",
+    [ts_aux_sym_token3] = "",
+    [ts_aux_sym_token4] = "",
+    [ts_aux_sym_token5] = "",
+    [ts_aux_sym_token6] = "",
 };

 UBIQUITOUS_SYMBOLS = {
--- a/examples/parsers/golang.c
+++ b/examples/parsers/golang.c
@@ -109,34 +109,34 @@ SYMBOL_NAMES = {
    [ts_aux_sym__func_signature_repeat2] = "_func_signature_repeat2",
    [ts_aux_sym__func_signature_repeat3] = "_func_signature_repeat3",
    [ts_aux_sym__func_signature_repeat4] = "_func_signature_repeat4",
-    [ts_aux_sym_token0] = "'package'",
-    [ts_aux_sym_token1] = "'import'",
-    [ts_aux_sym_token2] = "'('",
-    [ts_aux_sym_token3] = "')'",
-    [ts_aux_sym_token4] = "'type'",
-    [ts_aux_sym_token5] = "'var'",
-    [ts_aux_sym_token6] = "'='",
-    [ts_aux_sym_token7] = "'func'",
-    [ts_aux_sym_token8] = "'{'",
-    [ts_aux_sym_token9] = "'}'",
-    [ts_aux_sym_token10] = "'*'",
-    [ts_aux_sym_token11] = "'map'",
-    [ts_aux_sym_token12] = "'['",
-    [ts_aux_sym_token13] = "']'",
-    [ts_aux_sym_token14] = "'struct'",
-    [ts_aux_sym_token15] = "'interface'",
-    [ts_aux_sym_token16] = "'/'",
-    [ts_aux_sym_token17] = "'+'",
-    [ts_aux_sym_token18] = "'-'",
-    [ts_aux_sym_token19] = "'||'",
-    [ts_aux_sym_token20] = "'&&'",
-    [ts_aux_sym_token21] = "'=='",
-    [ts_aux_sym_token22] = "'<='",
-    [ts_aux_sym_token23] = "'<'",
-    [ts_aux_sym_token24] = "'>='",
-    [ts_aux_sym_token25] = "'>'",
-    [ts_aux_sym_token26] = "'!'",
-    [ts_aux_sym_token27] = "','",
+    [ts_aux_sym_token0] = "",
+    [ts_aux_sym_token1] = "",
+    [ts_aux_sym_token2] = "",
+    [ts_aux_sym_token3] = "",
+    [ts_aux_sym_token4] = "",
+    [ts_aux_sym_token5] = "",
+    [ts_aux_sym_token6] = "",
+    [ts_aux_sym_token7] = "",
+    [ts_aux_sym_token8] = "",
+    [ts_aux_sym_token9] = "",
+    [ts_aux_sym_token10] = "",
+    [ts_aux_sym_token11] = "",
+    [ts_aux_sym_token12] = "",
+    [ts_aux_sym_token13] = "",
+    [ts_aux_sym_token14] = "",
+    [ts_aux_sym_token15] = "",
+    [ts_aux_sym_token16] = "",
+    [ts_aux_sym_token17] = "",
+    [ts_aux_sym_token18] = "",
+    [ts_aux_sym_token19] = "",
+    [ts_aux_sym_token20] = "",
+    [ts_aux_sym_token21] = "",
+    [ts_aux_sym_token22] = "",
+    [ts_aux_sym_token23] = "",
+    [ts_aux_sym_token24] = "",
+    [ts_aux_sym_token25] = "",
+    [ts_aux_sym_token26] = "",
+    [ts_aux_sym_token27] = "",
 };

 UBIQUITOUS_SYMBOLS = {
--- a/examples/parsers/javascript.c
+++ b/examples/parsers/javascript.c
@@ -143,52 +143,52 @@ SYMBOL_NAMES = {
    [ts_aux_sym_formal_parameters_repeat0] = "formal_parameters_repeat0",
    [ts_aux_sym_object_repeat0] = "object_repeat0",
    [ts_aux_sym_array_repeat0] = "array_repeat0",
-    [ts_aux_sym_token0] = "'{'",
-    [ts_aux_sym_token1] = "'}'",
-    [ts_aux_sym_token2] = "'for'",
-    [ts_aux_sym_token3] = "'('",
-    [ts_aux_sym_token4] = "')'",
-    [ts_aux_sym_token5] = "'if'",
-    [ts_aux_sym_token6] = "'else'",
-    [ts_aux_sym_token7] = "'while'",
-    [ts_aux_sym_token8] = "'try'",
-    [ts_aux_sym_token9] = "'catch'",
-    [ts_aux_sym_token10] = "'switch'",
-    [ts_aux_sym_token11] = "'case'",
-    [ts_aux_sym_token12] = "'default'",
-    [ts_aux_sym_token13] = "':'",
-    [ts_aux_sym_token14] = "'break'",
-    [ts_aux_sym_token15] = "'var'",
-    [ts_aux_sym_token16] = "','",
-    [ts_aux_sym_token17] = "'return'",
-    [ts_aux_sym_token18] = "'delete'",
-    [ts_aux_sym_token19] = "'++'",
-    [ts_aux_sym_token20] = "'--'",
-    [ts_aux_sym_token21] = "'+'",
-    [ts_aux_sym_token22] = "'-'",
-    [ts_aux_sym_token23] = "'*'",
-    [ts_aux_sym_token24] = "'/'",
-    [ts_aux_sym_token25] = "'&'",
-    [ts_aux_sym_token26] = "'|'",
-    [ts_aux_sym_token27] = "'^'",
-    [ts_aux_sym_token28] = "'||'",
-    [ts_aux_sym_token29] = "'&&'",
-    [ts_aux_sym_token30] = "'==='",
-    [ts_aux_sym_token31] = "'=='",
-    [ts_aux_sym_token32] = "'!=='",
-    [ts_aux_sym_token33] = "'!='",
-    [ts_aux_sym_token34] = "'<='",
-    [ts_aux_sym_token35] = "'<'",
-    [ts_aux_sym_token36] = "'>='",
-    [ts_aux_sym_token37] = "'>'",
-    [ts_aux_sym_token38] = "'!'",
-    [ts_aux_sym_token39] = "'?'",
-    [ts_aux_sym_token40] = "'='",
-    [ts_aux_sym_token41] = "'function'",
-    [ts_aux_sym_token42] = "'new'",
-    [ts_aux_sym_token43] = "'.'",
-    [ts_aux_sym_token44] = "'['",
-    [ts_aux_sym_token45] = "']'",
+    [ts_aux_sym_token0] = "",
+    [ts_aux_sym_token1] = "",
+    [ts_aux_sym_token2] = "",
+    [ts_aux_sym_token3] = "",
+    [ts_aux_sym_token4] = "",
+    [ts_aux_sym_token5] = "",
+    [ts_aux_sym_token6] = "",
+    [ts_aux_sym_token7] = "",
+    [ts_aux_sym_token8] = "",
+    [ts_aux_sym_token9] = "",
+    [ts_aux_sym_token10] = "",
+    [ts_aux_sym_token11] = "",
+    [ts_aux_sym_token12] = "",
+    [ts_aux_sym_token13] = "",
+    [ts_aux_sym_token14] = "",
+    [ts_aux_sym_token15] = "",
+    [ts_aux_sym_token16] = "",
+    [ts_aux_sym_token17] = "",
+    [ts_aux_sym_token18] = "",
+    [ts_aux_sym_token19] = "",
+    [ts_aux_sym_token20] = "",
+    [ts_aux_sym_token21] = "",
+    [ts_aux_sym_token22] = "",
+    [ts_aux_sym_token23] = "",
+    [ts_aux_sym_token24] = "",
+    [ts_aux_sym_token25] = "",
+    [ts_aux_sym_token26] = "",
+    [ts_aux_sym_token27] = "",
+    [ts_aux_sym_token28] = "",
+    [ts_aux_sym_token29] = "",
+    [ts_aux_sym_token30] = "",
+    [ts_aux_sym_token31] = "",
+    [ts_aux_sym_token32] = "",
+    [ts_aux_sym_token33] = "",
+    [ts_aux_sym_token34] = "",
+    [ts_aux_sym_token35] = "",
+    [ts_aux_sym_token36] = "",
+    [ts_aux_sym_token37] = "",
+    [ts_aux_sym_token38] = "",
+    [ts_aux_sym_token39] = "",
+    [ts_aux_sym_token40] = "",
+    [ts_aux_sym_token41] = "",
+    [ts_aux_sym_token42] = "",
+    [ts_aux_sym_token43] = "",
+    [ts_aux_sym_token44] = "",
+    [ts_aux_sym_token45] = "",
 };

 UBIQUITOUS_SYMBOLS = {
--- a/examples/parsers/json.c
+++ b/examples/parsers/json.c
@@ -35,12 +35,12 @@ SYMBOL_NAMES = {
    [ts_sym_false] = "false",
    [ts_aux_sym_object_repeat0] = "object_repeat0",
    [ts_aux_sym_array_repeat0] = "array_repeat0",
-    [ts_aux_sym_token0] = "'{'",
-    [ts_aux_sym_token1] = "':'",
-    [ts_aux_sym_token2] = "','",
-    [ts_aux_sym_token3] = "'}'",
-    [ts_aux_sym_token4] = "'['",
-    [ts_aux_sym_token5] = "']'",
+    [ts_aux_sym_token0] = "",
+    [ts_aux_sym_token1] = "",
+    [ts_aux_sym_token2] = "",
+    [ts_aux_sym_token3] = "",
+    [ts_aux_sym_token4] = "",
+    [ts_aux_sym_token5] = "",
 };

 UBIQUITOUS_SYMBOLS = {
@@ -69,90 +69,87 @@ LEX_FN() {
            if (lookahead == '\"')
                ADVANCE(2);
            if ('0' <= lookahead && lookahead <= '9')
-                ADVANCE(7);
+                ADVANCE(6);
            if (lookahead == '[')
-                ADVANCE(10);
+                ADVANCE(9);
            if (lookahead == 'f')
-                ADVANCE(11);
+                ADVANCE(10);
            if (lookahead == 'n')
-                ADVANCE(16);
+                ADVANCE(15);
            if (lookahead == 't')
-                ADVANCE(20);
+                ADVANCE(19);
            if (lookahead == '{')
-                ADVANCE(24);
+                ADVANCE(23);
            LEX_ERROR();
        case 2:
            if (!((lookahead == '\"') ||
                (lookahead == '\\')))
+                ADVANCE(2);
+            if (lookahead == '\"')
                ADVANCE(3);
            if (lookahead == '\\')
-                ADVANCE(5);
+                ADVANCE(4);
            LEX_ERROR();
        case 3:
+            ACCEPT_TOKEN(ts_sym_string);
+        case 4:
            if (!((lookahead == '\"') ||
                (lookahead == '\\')))
-                ADVANCE(3);
+                ADVANCE(2);
            if (lookahead == '\"')
-                ADVANCE(4);
-            if (lookahead == '\\')
                ADVANCE(5);
+            if (lookahead == '\\')
+                ADVANCE(4);
            LEX_ERROR();
-        case 4:
-            ACCEPT_TOKEN(ts_sym_string);
        case 5:
            if (!((lookahead == '\"') ||
                (lookahead == '\\')))
-                ADVANCE(3);
+                ADVANCE(2);
            if (lookahead == '\"')
-                ADVANCE(6);
+                ADVANCE(3);
            if (lookahead == '\\')
-                ADVANCE(5);
-            LEX_ERROR();
-        case 6:
-            if (!((lookahead == '\"') ||
-                (lookahead == '\\')))
-                ADVANCE(3);
-            if (lookahead == '\"')
                ADVANCE(4);
-            if (lookahead == '\\')
-                ADVANCE(5);
            ACCEPT_TOKEN(ts_sym_string);
-        case 7:
+        case 6:
            if (lookahead == '.')
-                ADVANCE(8);
-            if ('0' <= lookahead && lookahead <= '9')
                ADVANCE(7);
+            if ('0' <= lookahead && lookahead <= '9')
+                ADVANCE(6);
            ACCEPT_TOKEN(ts_sym_number);
+        case 7:
+            if ('0' <= lookahead && lookahead <= '9')
+                ADVANCE(8);
+            LEX_ERROR();
        case 8:
            if ('0' <= lookahead && lookahead <= '9')
-                ADVANCE(9);
-            LEX_ERROR();
-        case 9:
-            if ('0' <= lookahead && lookahead <= '9')
-                ADVANCE(9);
+                ADVANCE(8);
            ACCEPT_TOKEN(ts_sym_number);
-        case 10:
+        case 9:
            ACCEPT_TOKEN(ts_aux_sym_token4);
-        case 11:
+        case 10:
            if (lookahead == 'a')
+                ADVANCE(11);
+            LEX_ERROR();
+        case 11:
+            if (lookahead == 'l')
                ADVANCE(12);
            LEX_ERROR();
        case 12:
-            if (lookahead == 'l')
+            if (lookahead == 's')
                ADVANCE(13);
            LEX_ERROR();
        case 13:
-            if (lookahead == 's')
+            if (lookahead == 'e')
                ADVANCE(14);
            LEX_ERROR();
        case 14:
-            if (lookahead == 'e')
-                ADVANCE(15);
-            LEX_ERROR();
-        case 15:
            ACCEPT_TOKEN(ts_sym_false);
-        case 16:
+        case 15:
            if (lookahead == 'u')
+                ADVANCE(16);
+            LEX_ERROR();
+        case 16:
+            if (lookahead == 'l')
                ADVANCE(17);
            LEX_ERROR();
        case 17:
@@ -160,65 +157,71 @@ LEX_FN() {
                ADVANCE(18);
            LEX_ERROR();
        case 18:
-            if (lookahead == 'l')
-                ADVANCE(19);
-            LEX_ERROR();
-        case 19:
            ACCEPT_TOKEN(ts_sym_null);
-        case 20:
+        case 19:
            if (lookahead == 'r')
+                ADVANCE(20);
+            LEX_ERROR();
+        case 20:
+            if (lookahead == 'u')
                ADVANCE(21);
            LEX_ERROR();
        case 21:
-            if (lookahead == 'u')
+            if (lookahead == 'e')
                ADVANCE(22);
            LEX_ERROR();
        case 22:
-            if (lookahead == 'e')
-                ADVANCE(23);
-            LEX_ERROR();
-        case 23:
            ACCEPT_TOKEN(ts_sym_true);
-        case 24:
+        case 23:
            ACCEPT_TOKEN(ts_aux_sym_token0);
-        case 25:
+        case 24:
            START_TOKEN();
            if (lookahead == '\0')
-                ADVANCE(26);
+                ADVANCE(25);
            if ((lookahead == '\t') ||
                (lookahead == '\n') ||
                (lookahead == '\r') ||
                (lookahead == ' '))
-                ADVANCE(25);
+                ADVANCE(24);
            LEX_ERROR();
-        case 26:
+        case 25:
            ACCEPT_TOKEN(ts_builtin_sym_end);
-        case 27:
+        case 26:
            START_TOKEN();
            if (('\t' <= lookahead && lookahead <= '\n') ||
                (lookahead == '\r') ||
                (lookahead == ' '))
-                ADVANCE(27);
+                ADVANCE(26);
            if (lookahead == '\"')
                ADVANCE(2);
            if (lookahead == '}')
-                ADVANCE(28);
+                ADVANCE(27);
            LEX_ERROR();
-        case 28:
+        case 27:
            ACCEPT_TOKEN(ts_aux_sym_token3);
-        case 29:
+        case 28:
            START_TOKEN();
            if (('\t' <= lookahead && lookahead <= '\n') ||
                (lookahead == '\r') ||
                (lookahead == ' '))
-                ADVANCE(29);
+                ADVANCE(28);
            if (lookahead == ',')
+                ADVANCE(29);
+            if (lookahead == '}')
+                ADVANCE(27);
+            LEX_ERROR();
+        case 29:
+            ACCEPT_TOKEN(ts_aux_sym_token2);
+        case 30:
+            START_TOKEN();
+            if ((lookahead == '\t') ||
+                (lookahead == '\n') ||
+                (lookahead == '\r') ||
+                (lookahead == ' '))
                ADVANCE(30);
            if (lookahead == '}')
-                ADVANCE(28);
+                ADVANCE(27);
            LEX_ERROR();
-        case 30:
-            ACCEPT_TOKEN(ts_aux_sym_token2);
        case 31:
            START_TOKEN();
            if ((lookahead == '\t') ||
@@ -226,8 +229,8 @@ LEX_FN() {
                (lookahead == '\r') ||
                (lookahead == ' '))
                ADVANCE(31);
-            if (lookahead == '}')
-                ADVANCE(28);
+            if (lookahead == '\"')
+                ADVANCE(2);
            LEX_ERROR();
        case 32:
            START_TOKEN();
@@ -236,128 +239,118 @@ LEX_FN() {
                (lookahead == '\r') ||
                (lookahead == ' '))
                ADVANCE(32);
-            if (lookahead == '\"')
-                ADVANCE(2);
+            if (lookahead == ':')
+                ADVANCE(33);
            LEX_ERROR();
        case 33:
-            START_TOKEN();
-            if ((lookahead == '\t') ||
-                (lookahead == '\n') ||
-                (lookahead == '\r') ||
-                (lookahead == ' '))
-                ADVANCE(33);
-            if (lookahead == ':')
-                ADVANCE(34);
-            LEX_ERROR();
-        case 34:
            ACCEPT_TOKEN(ts_aux_sym_token1);
-        case 35:
+        case 34:
            START_TOKEN();
            if (('\t' <= lookahead && lookahead <= '\n') ||
                (lookahead == '\r') ||
                (lookahead == ' '))
-                ADVANCE(35);
+                ADVANCE(34);
            if (lookahead == '\"')
                ADVANCE(2);
            if ('0' <= lookahead && lookahead <= '9')
-                ADVANCE(7);
+                ADVANCE(6);
            if (lookahead == '[')
-                ADVANCE(10);
+                ADVANCE(9);
            if (lookahead == ']')
-                ADVANCE(36);
+                ADVANCE(35);
            if (lookahead == 'f')
-                ADVANCE(11);
+                ADVANCE(10);
            if (lookahead == 'n')
-                ADVANCE(16);
+                ADVANCE(15);
            if (lookahead == 't')
-                ADVANCE(20);
+                ADVANCE(19);
            if (lookahead == '{')
-                ADVANCE(24);
+                ADVANCE(23);
            LEX_ERROR();
-        case 36:
+        case 35:
            ACCEPT_TOKEN(ts_aux_sym_token5);
-        case 37:
+        case 36:
            START_TOKEN();
            if (('\t' <= lookahead && lookahead <= '\n') ||
+                (lookahead == '\r') ||
+                (lookahead == ' '))
+                ADVANCE(36);
+            if (lookahead == ',')
+                ADVANCE(29);
+            if (lookahead == ']')
+                ADVANCE(35);
+            LEX_ERROR();
+        case 37:
+            START_TOKEN();
+            if ((lookahead == '\t') ||
+                (lookahead == '\n') ||
                (lookahead == '\r') ||
                (lookahead == ' '))
                ADVANCE(37);
-            if (lookahead == ',')
-                ADVANCE(30);
            if (lookahead == ']')
-                ADVANCE(36);
+                ADVANCE(35);
            LEX_ERROR();
        case 38:
-            START_TOKEN();
-            if ((lookahead == '\t') ||
-                (lookahead == '\n') ||
-                (lookahead == '\r') ||
-                (lookahead == ' '))
-                ADVANCE(38);
-            if (lookahead == ']')
-                ADVANCE(36);
-            LEX_ERROR();
-        case 39:
            START_TOKEN();
            if (lookahead == '\0')
-                ADVANCE(26);
+                ADVANCE(25);
            if (('\t' <= lookahead && lookahead <= '\n') ||
                (lookahead == '\r') ||
                (lookahead == ' '))
-                ADVANCE(39);
+                ADVANCE(38);
            if (lookahead == '\"')
                ADVANCE(2);
            if (lookahead == ',')
-                ADVANCE(30);
+                ADVANCE(29);
            if ('0' <= lookahead && lookahead <= '9')
-                ADVANCE(7);
+                ADVANCE(6);
            if (lookahead == ':')
-                ADVANCE(34);
+                ADVANCE(33);
            if (lookahead == '[')
-                ADVANCE(10);
+                ADVANCE(9);
            if (lookahead == ']')
-                ADVANCE(36);
+                ADVANCE(35);
            if (lookahead == 'f')
-                ADVANCE(11);
+                ADVANCE(10);
            if (lookahead == 'n')
-                ADVANCE(16);
+                ADVANCE(15);
            if (lookahead == 't')
-                ADVANCE(20);
+                ADVANCE(19);
            if (lookahead == '{')
-                ADVANCE(24);
+                ADVANCE(23);
            if (lookahead == '}')
-                ADVANCE(28);
+                ADVANCE(27);
            LEX_ERROR();
        case ts_lex_state_error:
            START_TOKEN();
            if (lookahead == '\0')
-                ADVANCE(26);
+                ADVANCE(25);
            if (('\t' <= lookahead && lookahead <= '\n') ||
                (lookahead == '\r') ||
                (lookahead == ' '))
-                ADVANCE(39);
+                ADVANCE(38);
            if (lookahead == '\"')
                ADVANCE(2);
            if (lookahead == ',')
-                ADVANCE(30);
+                ADVANCE(29);
            if ('0' <= lookahead && lookahead <= '9')
-                ADVANCE(7);
+                ADVANCE(6);
            if (lookahead == ':')
-                ADVANCE(34);
+                ADVANCE(33);
            if (lookahead == '[')
-                ADVANCE(10);
+                ADVANCE(9);
            if (lookahead == ']')
-                ADVANCE(36);
+                ADVANCE(35);
            if (lookahead == 'f')
-                ADVANCE(11);
+                ADVANCE(10);
            if (lookahead == 'n')
-                ADVANCE(16);
+                ADVANCE(15);
            if (lookahead == 't')
-                ADVANCE(20);
+                ADVANCE(19);
            if (lookahead == '{')
-                ADVANCE(24);
+                ADVANCE(23);
            if (lookahead == '}')
-                ADVANCE(28);
+                ADVANCE(27);
            LEX_ERROR();
        default:
            LEX_PANIC();
@@ -366,65 +359,65 @@ LEX_FN() {

 LEX_STATES = {
    [0] = 1,
-    [1] = 25,
-    [2] = 25,
-    [3] = 27,
-    [4] = 29,
-    [5] = 31,
-    [6] = 25,
-    [7] = 32,
-    [8] = 29,
-    [9] = 31,
-    [10] = 33,
+    [1] = 24,
+    [2] = 24,
+    [3] = 26,
+    [4] = 28,
+    [5] = 30,
+    [6] = 24,
+    [7] = 31,
+    [8] = 28,
+    [9] = 30,
+    [10] = 32,
    [11] = 1,
-    [12] = 29,
-    [13] = 31,
-    [14] = 29,
-    [15] = 27,
-    [16] = 29,
-    [17] = 31,
-    [18] = 29,
-    [19] = 33,
+    [12] = 28,
+    [13] = 30,
+    [14] = 28,
+    [15] = 26,
+    [16] = 28,
+    [17] = 30,
+    [18] = 28,
+    [19] = 32,
    [20] = 1,
-    [21] = 29,
-    [22] = 31,
-    [23] = 29,
-    [24] = 35,
-    [25] = 37,
-    [26] = 38,
-    [27] = 29,
+    [21] = 28,
+    [22] = 30,
+    [23] = 28,
+    [24] = 34,
+    [25] = 36,
+    [26] = 37,
+    [27] = 28,
    [28] = 1,
-    [29] = 37,
-    [30] = 38,
-    [31] = 37,
-    [32] = 27,
-    [33] = 29,
-    [34] = 31,
-    [35] = 37,
-    [36] = 33,
+    [29] = 36,
+    [30] = 37,
+    [31] = 36,
+    [32] = 26,
+    [33] = 28,
+    [34] = 30,
+    [35] = 36,
+    [36] = 32,
    [37] = 1,
-    [38] = 29,
-    [39] = 31,
-    [40] = 37,
-    [41] = 37,
-    [42] = 35,
-    [43] = 37,
-    [44] = 38,
-    [45] = 37,
-    [46] = 37,
-    [47] = 29,
-    [48] = 29,
-    [49] = 33,
+    [38] = 28,
+    [39] = 30,
+    [40] = 36,
+    [41] = 36,
+    [42] = 34,
+    [43] = 36,
+    [44] = 37,
+    [45] = 36,
+    [46] = 36,
+    [47] = 28,
+    [48] = 28,
+    [49] = 32,
    [50] = 1,
-    [51] = 29,
-    [52] = 31,
-    [53] = 25,
-    [54] = 25,
-    [55] = 35,
-    [56] = 37,
-    [57] = 38,
-    [58] = 25,
-    [59] = 25,
+    [51] = 28,
+    [52] = 30,
+    [53] = 24,
+    [54] = 24,
+    [55] = 34,
+    [56] = 36,
+    [57] = 37,
+    [58] = 24,
+    [59] = 24,
 };

 #pragma GCC diagnostic push
--- a/include/tree_sitter/compiler.h
+++ b/include/tree_sitter/compiler.h
@@ -55,6 +55,7 @@ namespace tree_sitter {
    class GrammarError {
    public:
        GrammarError(GrammarErrorType type, std::string message);
+        bool operator==(const GrammarError &other) const;
        GrammarErrorType type;
        std::string message;
    };
--- a/spec/compiler/build_tables/item_set_transitions_spec.cc
+++ b/spec/compiler/build_tables/item_set_transitions_spec.cc
@@ -13,8 +13,8 @@ describe("lexical item set transitions", []() {
    describe("when two items in the set have transitions on the same character", [&]() {
        it("merges the transitions by computing the union of the two item sets", [&]() {
            LexItemSet set1({
-                LexItem(Symbol(1), pattern("[a-f]")),
-                LexItem(Symbol(2), pattern("[e-x]")) });
+                LexItem(Symbol(1), character({ {'a', 'f'} })),
+                LexItem(Symbol(2), character({ {'e', 'x'} })) });

            AssertThat(char_transitions(set1, grammar), Equals(map<CharacterSet, LexItemSet>({
                { CharacterSet({ {'a', 'd'} }), LexItemSet({
--- a/spec/compiler/build_tables/rule_transitions_spec.cc
+++ b/spec/compiler/build_tables/rule_transitions_spec.cc
@@ -97,23 +97,6 @@ describe("rule transitions", []() {
            })));
    });

-    it("handles strings", [&]() {
-        AssertThat(
-            char_transitions(str("bad")),
-            Equals(rule_map<CharacterSet>({
-                { CharacterSet({ 'b' }), seq({ character({ 'a' }), character({ 'd' }) }) }
-            })));
-    });
-
-    it("handles patterns", [&]() {
-        AssertThat(
-            char_transitions(pattern("a|b")),
-            Equals(rule_map<CharacterSet>({
-                { CharacterSet({ 'a' }), blank() },
-                { CharacterSet({ 'b' }), blank() }
-            })));
-    });
-
    it("handles choices between overlapping character sets", [&]() {
        AssertThat(
            char_transitions(choice({
@@ -164,7 +147,7 @@ describe("rule transitions", []() {
    });

    it("handles repeats", [&]() {
-        rule_ptr rule = repeat(str("ab"));
+        rule_ptr rule = repeat(seq({ character({ 'a' }), character({ 'b' }) }));
        AssertThat(
            char_transitions(rule),
            Equals(rule_map<CharacterSet>({
@@ -176,7 +159,7 @@ describe("rule transitions", []() {
                    })
                }})));

-        rule = repeat(str("a"));
+        rule = repeat(character({ 'a' }));
        AssertThat(
            char_transitions(rule),
            Equals(rule_map<CharacterSet>({
--- a/spec/compiler/helpers/rule_helpers.cc
+++ b/spec/compiler/helpers/rule_helpers.cc
@@ -5,6 +5,7 @@
 namespace tree_sitter {
    using std::make_shared;
    using std::set;
+    using std::map;

    namespace rules {
        rule_ptr character(const set<CharacterRange> &ranges) {
@@ -33,5 +34,9 @@ namespace tree_sitter {
        rule_ptr i_aux_token(size_t index) {
            return make_shared<rules::Symbol>(index, SymbolOption(SymbolOptionAuxiliary|SymbolOptionToken));
        }
+        
+        rule_ptr metadata(rule_ptr rule, map<MetadataKey, int> values) {
+            return make_shared<Metadata>(rule, values);
+        }
    }
 }
--- a/spec/compiler/helpers/rule_helpers.h
+++ b/spec/compiler/helpers/rule_helpers.h
@@ -3,9 +3,11 @@

 #include "tree_sitter/compiler.h"
 #include "compiler/rules/character_set.h"
+#include "compiler/rules/metadata.h"

 namespace tree_sitter {
    namespace rules {
+        rule_ptr metadata(rule_ptr, std::map<MetadataKey, int>);
        rule_ptr character(const std::set<CharacterRange> &ranges);
        rule_ptr character(const std::set<CharacterRange> &ranges, bool sign);
        rule_ptr i_sym(size_t index);
--- a/spec/compiler/prepare_grammar/expand_tokens_spec.cc
+++ b/spec/compiler/prepare_grammar/expand_tokens_spec.cc
@@ -0,0 +1,63 @@
+#include "compiler_spec_helper.h"
+#include "compiler/prepared_grammar.h"
+#include "compiler/prepare_grammar/expand_tokens.h"
+
+START_TEST
+
+using namespace rules;
+using prepare_grammar::expand_tokens;
+
+describe("expanding token rules", []() {
+    it("replaces regex patterns with their expansion", [&]() {
+        PreparedGrammar grammar({
+            { "rule_A", seq({
+                i_sym(10),
+                pattern("x*"),
+                i_sym(11) }) },
+        }, {});
+        
+        auto result = expand_tokens(grammar);
+        
+        AssertThat(result.second, Equals((const GrammarError *)nullptr));
+        AssertThat(result.first, Equals(PreparedGrammar({
+            { "rule_A", seq({
+                i_sym(10),
+                repeat(character({ 'x' })),
+                i_sym(11) }) },
+        }, {})));
+    });
+    
+    it("replaces string rules with a sequence of characters", [&]() {
+        PreparedGrammar grammar({
+            { "rule_A", seq({
+                i_sym(10),
+                str("xyz"),
+                i_sym(11) }) },
+        }, {});
+        
+        auto result = expand_tokens(grammar);
+        
+        AssertThat(result.second, Equals((const GrammarError *)nullptr));
+        AssertThat(result.first, Equals(PreparedGrammar({
+            { "rule_A", seq({
+                i_sym(10),
+                seq({ character({ 'x' }), character({ 'y' }), character({ 'z' }) }),
+                i_sym(11) }) },
+        }, {})));
+    });
+    
+    it("returns an error when the grammar contains an invalid regex", [&]() {
+        PreparedGrammar grammar({
+            { "rule_A", seq({
+                pattern("("),
+                str("xyz"),
+                pattern("[") }) },
+        }, {});
+        
+        auto result = expand_tokens(grammar);
+
+        AssertThat(result.second, EqualsPointer(new GrammarError(GrammarErrorTypeRegex, "unmatched open paren")));
+    });
+});
+
+END_TEST
--- a/spec/compiler/prepare_grammar/parse_regex_spec.cc
+++ b/spec/compiler/prepare_grammar/parse_regex_spec.cc
@@ -0,0 +1,217 @@
+#include "compiler_spec_helper.h"
+#include "compiler/prepare_grammar/parse_regex.h"
+
+START_TEST
+
+using namespace rules;
+using prepare_grammar::parse_regex;
+
+describe("parsing regex patterns", []() {
+    vector<tuple<string, string, rule_ptr>> valid_inputs = {
+        {
+            "character sets",
+            "[aAeE]",
+            character({ 'a', 'A', 'e', 'E' })
+        },
+        
+        {
+            "'.' characters as wildcards",
+            ".",
+            CharacterSet({'\n'}).complement().copy()
+        },
+        
+        {
+            "character classes",
+            "\\w-\\d",
+            seq({
+                character({ {'a', 'z'}, {'A', 'Z'}, {'0', '9'} }),
+                character({ '-' }),
+                character({ {'0', '9'} }) })
+        },
+        
+        {
+            "choices",
+            "ab|cd|ef",
+            choice({
+                seq({
+                    character({ 'a' }),
+                    character({ 'b' }),
+                }),
+                seq({
+                    character({ 'c' }),
+                    character({ 'd' })
+                }),
+                seq({
+                    character({ 'e' }),
+                    character({ 'f' })
+                })
+            })
+        },
+        
+        {
+            "simple sequences",
+            "abc",
+            seq({
+                character({ 'a' }),
+                character({ 'b' }),
+                character({ 'c' }) })
+        },
+        
+        {
+            "character ranges",
+            "[12a-dA-D3]",
+            character({ {'1', '3'}, {'a', 'd'}, { 'A', 'D' }, })
+        },
+        
+        {
+            "negated characters",
+            "[^a\\d]",
+            character({ {'a'}, {'0', '9'} }, false)
+        },
+        
+        {
+            "backslashes",
+            "\\\\",
+            character({ '\\' })
+        },
+        
+        {
+            "character groups in sequences",
+            "x([^x]|\\\\x)*x",
+            seq({
+                character({ 'x' }),
+                repeat(choice({
+                    character({ 'x' }, false),
+                    seq({ character({ '\\' }), character({ 'x' }) })
+                })),
+                character({ 'x' })
+            })
+        },
+        
+        {
+            "choices in sequences",
+            "(a|b)cd",
+            seq({
+                choice({
+                    character({ 'a' }),
+                    character({ 'b' }),
+                }),
+                character({ 'c' }),
+                character({ 'd' })
+            })
+        },
+        
+        {
+            "escaped parentheses",
+            "a\\(b",
+            seq({
+                character({ 'a' }),
+                character({ '(' }),
+                character({ 'b' })
+            })
+        },
+        
+        {
+            "escaped periods",
+            "a\\.",
+            seq({
+                character({ 'a' }),
+                character({ '.' })
+            })
+        },
+        
+        {
+            "plus repeats",
+            "(ab)+(cd)+",
+            seq({
+                seq({
+                    seq({ character({ 'a' }), character({ 'b' }) }),
+                    repeat(seq({ character({ 'a' }), character({ 'b' }) })),
+                }),
+                seq({
+                    seq({ character({ 'c' }), character({ 'd' }) }),
+                    repeat(seq({ character({ 'c' }), character({ 'd' }) })),
+                }),
+            })
+        },
+        
+        {
+            "asterix repeats",
+            "(ab)*(cd)*",
+            seq({
+                repeat(seq({ character({ 'a' }), character({ 'b' }) })),
+                repeat(seq({ character({ 'c' }), character({ 'd' }) })),
+            })
+        },
+        
+        {
+            "optional rules",
+            "a(bc)?",
+            seq({
+                character({ 'a' }),
+                choice({
+                    seq({ character({ 'b' }), character({ 'c' }) }),
+                    blank()
+                })
+            })
+        }
+    };
+    
+    vector<tuple<string, string, const char *>> invalid_inputs = {
+        {
+            "mismatched open parens",
+            "(a",
+            "unmatched open paren",
+        },
+        {
+            "mismatched nested open parens",
+            "((a) (b)",
+            "unmatched open paren",
+        },
+        {
+            "mismatched close parens",
+            "a)",
+            "unmatched close paren",
+        },
+        {
+            "mismatched nested close parens",
+            "((a) b))",
+            "unmatched close paren",
+        },
+        {
+            "mismatched brackets for character classes",
+            "[a",
+            "unmatched open square bracket",
+        },
+        {
+            "mismatched brackets for character classes",
+            "a]",
+            "unmatched close square bracket",
+        },
+    };
+    
+    for (auto &triple : valid_inputs) {
+        string description = get<0>(triple);
+        string regex = get<1>(triple);
+        rule_ptr rule = get<2>(triple);
+        
+        it(("parses " + description).c_str(), [&]() {
+            auto result = parse_regex(regex);
+            AssertThat(result.first, EqualsPointer(rule));
+        });
+    }
+    
+    for (auto &triple : invalid_inputs) {
+        string description = get<0>(triple);
+        string regex = get<1>(triple);
+        const char *expected_message = get<2>(triple);
+        
+        it(("handles invalid regexes with " + description).c_str(), [&]() {
+            auto result = parse_regex(regex);
+            AssertThat(result.second, !Equals((const GrammarError *)nullptr));
+            AssertThat(result.second->message, Contains(expected_message));
+        });
+    }
+});
+
+END_TEST
--- a/spec/compiler/rules/pattern_spec.cc
+++ b/spec/compiler/rules/pattern_spec.cc
@ -1,177 +0,0 @@
-#include "compiler_spec_helper.h"
-#include "compiler/rules/pattern.h"
-#include "compiler/rules/character_set.h"
-
-using namespace rules;
-
-START_TEST
-
-describe("parsing regex pattern rules", []() {
-    it("parses simple strings", [&]() {
-        Pattern rule("abc");
-        AssertThat(
-            rule.to_rule_tree(),
-            EqualsPointer(seq({
-                character({ 'a' }),
-                character({ 'b' }),
-                character({ 'c' })
-            })));
-    });
-
-    it("parses wildcard '.' characters", [&]() {
-        Pattern rule(".");
-        AssertThat(
-            rule.to_rule_tree(),
-            EqualsPointer(CharacterSet({'\n'}).complement().copy()));
-    });
-
-    it("parses character classes", []() {
-        Pattern rule("\\w-\\d");
-        AssertThat(
-            rule.to_rule_tree(),
-            EqualsPointer(seq({
-                character({ {'a', 'z'}, {'A', 'Z'}, {'0', '9'} }),
-                character({ '-' }),
-                character({ {'0', '9'} })
-            })));
-    });
-
-    it("parses choices", []() {
-        Pattern rule("ab|cd|ef");
-        AssertThat(
-            rule.to_rule_tree(),
-            EqualsPointer(choice({
-                seq({
-                    character({ 'a' }),
-                    character({ 'b' }),
-                }),
-                seq({
-                    character({ 'c' }),
-                    character({ 'd' })
-                }),
-                seq({
-                    character({ 'e' }),
-                    character({ 'f' })
-                })
-            })));
-    });
-
-    it("parses character sets", []() {
-        Pattern rule("[aAeE]");
-        AssertThat(
-            rule.to_rule_tree(),
-            EqualsPointer(character({ 'a', 'A', 'e', 'E' })));
-    });
-
-    it("parses character ranges", []() {
-        Pattern rule("[12a-dA-D3]");
-        AssertThat(
-            rule.to_rule_tree(),
-            EqualsPointer(character({ {'1', '3'}, {'a', 'd'}, { 'A', 'D' }, })));
-    });
-
-    it("parses negated characters", []() {
-        Pattern rule("[^a\\d]");
-        AssertThat(
-            rule.to_rule_tree(),
-            EqualsPointer(character({ {'a'}, {'0', '9'} }, false)));
-    });
-
-    it("parses backslashes", []() {
-        Pattern rule("\\\\");
-        AssertThat(
-            rule.to_rule_tree(),
-            EqualsPointer(character({ '\\' })));
-    });
-
-    it("parses character groups in sequences", []() {
-        Pattern rule("\"([^\"]|\\\\\")*\"");
-        AssertThat(
-            rule.to_rule_tree(),
-            EqualsPointer(seq({
-                character({ '"' }),
-                repeat(choice({
-                    character({ '"' }, false),
-                    seq({ character({ '\\' }), character({ '"' }) })
-                })),
-                character({ '"' })
-            })));
-    });
-
-    it("parses choices in sequences", []() {
-        Pattern rule("(a|b)cd");
-        AssertThat(
-            rule.to_rule_tree(),
-            EqualsPointer(seq({
-                choice({
-                    character({ 'a' }),
-                    character({ 'b' }),
-                }),
-                character({ 'c' }),
-                character({ 'd' })
-            })));
-    });
-
-    it("parses special characters when they are escaped", []() {
-        Pattern rule("a\\(b");
-        AssertThat(
-            rule.to_rule_tree(),
-            EqualsPointer(seq({
-                character({ 'a' }),
-                character({ '(' }),
-                character({ 'b' })
-            })));
-
-        Pattern rule2("a\\.");
-        AssertThat(
-            rule2.to_rule_tree(),
-            EqualsPointer(seq({
-                character({ 'a' }),
-                character({ '.' }),
-            })));
-
-    });
-
-    it("parses repeating rules", []() {
-        Pattern rule("(ab)+(cd)+");
-        AssertThat(
-            rule.to_rule_tree(),
-            EqualsPointer(
-                seq({
-                    seq({
-                        seq({ character({ 'a' }), character({ 'b' }) }),
-                        repeat(seq({ character({ 'a' }), character({ 'b' }) })),
-                    }),
-                    seq({
-                        seq({ character({ 'c' }), character({ 'd' }) }),
-                        repeat(seq({ character({ 'c' }), character({ 'd' }) })),
-                    }),
-                })
-            ));
-
-        Pattern rule2("(ab)*(cd)*");
-        AssertThat(
-            rule2.to_rule_tree(),
-            EqualsPointer(
-                seq({
-                    repeat(seq({ character({ 'a' }), character({ 'b' }) })),
-                    repeat(seq({ character({ 'c' }), character({ 'd' }) })),
-                })
-            ));
-    });
-
-    it("parses optional rules", []() {
-        Pattern rule("a(bc)?");
-        AssertThat(
-            rule.to_rule_tree(),
-            EqualsPointer(seq({
-                character({ 'a' }),
-                choice({
-                    seq({ character({ 'b' }), character({ 'c' }) }),
-                    blank()
-                })
-            })));
-   });
-});
-
-END_TEST
--- a/src/compiler/build_tables/rule_transitions.cc
+++ b/src/compiler/build_tables/rule_transitions.cc
@ -94,20 +94,6 @@ namespace tree_sitter {
                });
                return result;
            }
-
-            map<T, rule_ptr> apply_to(const rules::String *rule) {
-                rule_ptr result = make_shared<rules::Blank>();
-                for (char val : rule->value)
-                    result = rules::Seq::Build({
-                        result,
-                        CharacterSet({ val }).copy()
-                    });
-                return this->apply(result);
-            }
-
-            map<T, rule_ptr> apply_to(const rules::Pattern *rule) {
-                return this->apply(rule->to_rule_tree());
-            }
        };

        map<CharacterSet, rule_ptr> char_transitions(const rule_ptr &rule) {
--- a/src/compiler/generate_code/c_code.cc
+++ b/src/compiler/generate_code/c_code.cc
@ -113,8 +113,7 @@ namespace tree_sitter {
                } else if (symbol.is_token() && symbol.is_auxiliary()) {
                    return token_description(grammar_for_symbol(symbol).rule(symbol));
                } else {
-                    string name = grammar_for_symbol(symbol).rule_name(symbol);
-                    return name;
+                    return grammar_for_symbol(symbol).rule_name(symbol);
                }
            }

--- a/src/compiler/grammar.cc
+++ b/src/compiler/grammar.cc
@ -48,6 +48,10 @@ namespace tree_sitter {
    GrammarError::GrammarError(GrammarErrorType type, std::string message) :
        type(type),
        message(message) {}
+    
+    // Two errors are equal when both their type and their message match.
+    bool GrammarError::operator==(const GrammarError &other) const {
+        if (type != other.type)
+            return false;
+        return message == other.message;
+    }

    ostream& operator<<(ostream &stream, const GrammarError *error) {
        if (error)
--- a/src/compiler/prepare_grammar/expand_tokens.cc
+++ b/src/compiler/prepare_grammar/expand_tokens.cc
@ -0,0 +1,68 @@
+#include "compiler/prepare_grammar/expand_tokens.h"
+#include <vector>
+#include <string>
+#include <utility>
+#include "compiler/prepared_grammar.h"
+#include "compiler/rules/visitor.h"
+#include "compiler/rules/pattern.h"
+#include "compiler/rules/string.h"
+#include "compiler/rules/blank.h"
+#include "compiler/rules/seq.h"
+#include "compiler/rules/character_set.h"
+#include "compiler/prepare_grammar/parse_regex.h"
+
+namespace tree_sitter {
+    using std::string;
+    using std::vector;
+    using std::pair;
+    using std::make_shared;
+    using rules::rule_ptr;
+    using rules::String;
+    using rules::Pattern;
+    
+    namespace prepare_grammar {
+        // Rule visitor that rewrites String and Pattern rules into
+        // character-level rules. All other rule kinds are left to the
+        // apply_to overloads inherited from IdentityRuleFn.
+        class ExpandTokens : public rules::IdentityRuleFn {
+            using rules::IdentityRuleFn::apply_to;
+
+            // A string literal becomes a sequence with one single-character
+            // set per character of the literal.
+            rule_ptr apply_to(const String *rule) {
+                vector<rule_ptr> characters;
+                for (char ch : rule->value) {
+                    characters.push_back(rules::CharacterSet({ ch }).copy());
+                }
+                return rules::Seq::Build(characters);
+            }
+
+            // A pattern is parsed as a regex. Only the first parse error seen
+            // by this visitor is stored in `error`.
+            rule_ptr apply_to(const Pattern *rule) {
+                auto parsed = parse_regex(rule->value);
+                if (error == nullptr) {
+                    error = parsed.second;
+                }
+                return parsed.first;
+            }
+
+        public:
+            // First regex error encountered so far, or nullptr if none.
+            const GrammarError *error;
+            ExpandTokens() : error(nullptr) {}
+        };
+        
+        // Runs ExpandTokens over every rule and aux rule of `grammar`.
+        // Returns the rebuilt grammar with a null error, or a default-constructed
+        // grammar together with the error as soon as one rule fails to expand.
+        pair<PreparedGrammar, const GrammarError *>
+        expand_tokens(const PreparedGrammar &grammar) {
+            vector<pair<string, rule_ptr>> expanded_rules, expanded_aux_rules;
+            ExpandTokens expander;
+
+            for (auto &entry : grammar.rules) {
+                auto expanded = expander.apply(entry.second);
+                if (expander.error != nullptr) {
+                    return { PreparedGrammar(), expander.error };
+                }
+                expanded_rules.push_back({ entry.first, expanded });
+            }
+
+            for (auto &entry : grammar.aux_rules) {
+                auto expanded = expander.apply(entry.second);
+                if (expander.error != nullptr) {
+                    return { PreparedGrammar(), expander.error };
+                }
+                expanded_aux_rules.push_back({ entry.first, expanded });
+            }
+
+            return {
+                PreparedGrammar(expanded_rules, expanded_aux_rules, grammar.options),
+                nullptr
+            };
+        }
+    }
+}
--- a/src/compiler/prepare_grammar/expand_tokens.h
+++ b/src/compiler/prepare_grammar/expand_tokens.h
@ -0,0 +1,16 @@
+#ifndef COMPILER_PREPARE_GRAMMAR_EXPAND_TOKENS_H_
+#define COMPILER_PREPARE_GRAMMAR_EXPAND_TOKENS_H_
+
+#include "tree_sitter/compiler.h"
+
+namespace tree_sitter {
+    class PreparedGrammar;
+    
+    namespace prepare_grammar {
+        // Expands the string and pattern rules of the given grammar into
+        // character-level rules. The second element of the result is nullptr
+        // on success; if a pattern fails to parse it is the error and the
+        // returned grammar is default-constructed.
+        std::pair<PreparedGrammar, const GrammarError *>
+        expand_tokens(const PreparedGrammar &);
+    }
+}
+
+#endif  // COMPILER_PREPARE_GRAMMAR_EXPAND_TOKENS_H_
+
--- a/src/compiler/prepare_grammar/parse_regex.cc
+++ b/src/compiler/prepare_grammar/parse_regex.cc
@ -0,0 +1,210 @@
+#include "compiler/prepare_grammar/parse_regex.h"
+#include <string>
+#include <utility>
+#include "compiler/rules/choice.h"
+#include "compiler/rules/seq.h"
+#include "compiler/rules/repeat.h"
+#include "compiler/rules/character_set.h"
+#include "compiler/rules/blank.h"
+#include "compiler/util/string_helpers.h"
+
+namespace tree_sitter {
+    using std::string;
+    using std::vector;
+    using std::pair;
+    using std::make_shared;
+    using rules::rule_ptr;
+    using rules::CharacterSet;
+    using rules::Seq;
+    using rules::Blank;
+    using rules::Choice;
+    using rules::Repeat;
+    using rules::CharacterRange;
+    using rules::blank;
+    
+    namespace prepare_grammar {
+        // Recursive-descent parser that turns a regex string into a rule tree.
+        //
+        // Shape of the grammar it accepts:
+        //   rule   := term ('|' term)*
+        //   term   := factor*
+        //   factor := atom ('*' | '+' | '?')?
+        //   atom   := '(' rule ')' | '[' char_set ']' | '.' | single_char
+        //
+        // Every parsing method returns a (result, error) pair. The error is
+        // nullptr on success; on failure it points to a GrammarError allocated
+        // with `new` and the result is a placeholder.
+        class PatternParser {
+        public:
+            explicit PatternParser(const string &input) :
+                input(input),
+                length(input.length()),
+                position(0) {}
+
+            // Parses one or more '|'-separated alternatives. `nested` is true
+            // when parsing inside parentheses, in which case a ')' ends the
+            // current term instead of being reported as unmatched.
+            pair<rule_ptr, const GrammarError *> rule(bool nested) {
+                vector<rule_ptr> choices = {};
+                do {
+                    // After the first alternative, continue only on a '|'.
+                    if (!choices.empty()) {
+                        if (peek() == '|')
+                            next();
+                        else
+                            break;
+                    }
+                    auto pair = term(nested);
+                    if (pair.second)
+                        return { blank(), pair.second };
+                    choices.push_back(pair.first);
+                } while (has_more_input());
+                auto rule = (choices.size() > 1) ? make_shared<Choice>(choices) : choices.front();
+                return { rule, nullptr };
+            }
+
+        private:
+            // Parses a run of factors into a sequence, stopping at '|', at a
+            // ')' when nested, or at the end of the input. The loop is guarded
+            // by has_more_input() so that an empty term (empty pattern, text
+            // after a trailing '|', or nothing after '(') yields blank()
+            // instead of consuming a character that is not there.
+            pair<rule_ptr, const GrammarError *> term(bool nested) {
+                rule_ptr result = blank();
+                while (has_more_input()) {
+                    if (peek() == '|')
+                        break;
+                    if (nested && peek() == ')')
+                        break;
+                    auto pair = factor();
+                    if (pair.second)
+                        return { blank(), pair.second };
+                    result = Seq::Build({ result, pair.first });
+                }
+                return { result, nullptr };
+            }
+
+            // Parses an atom followed by an optional '*', '+' or '?' suffix.
+            pair<rule_ptr, const GrammarError *> factor() {
+                auto pair = atom();
+                if (pair.second)
+                    return { blank(), pair.second };
+                rule_ptr result = pair.first;
+                if (has_more_input()) {
+                    switch (peek()) {
+                        case '*':
+                            next();
+                            result = make_shared<Repeat>(result);
+                            break;
+                        case '+':
+                            // One-or-more: the atom followed by a repeat of it.
+                            next();
+                            result = make_shared<Seq>(result, make_shared<Repeat>(result));
+                            break;
+                        case '?':
+                            // Optional: the atom or nothing.
+                            next();
+                            result = Choice::Build({ result, make_shared<Blank>() });
+                            break;
+                    }
+                }
+                return { result, nullptr };
+            }
+
+            // Parses a parenthesized group, a bracketed character class, the
+            // '.' wildcard or a single character. Unbalanced parens and
+            // brackets are reported as errors here.
+            pair<rule_ptr, const GrammarError *> atom() {
+                switch (peek()) {
+                    case '(': {
+                        next();
+                        auto pair = rule(true);
+                        if (pair.second)
+                            return { blank(), pair.second };
+                        if (peek() != ')')
+                            return error("unmatched open paren");
+                        next();
+                        return { pair.first, nullptr };
+                    }
+                    case '[': {
+                        next();
+                        auto pair = char_set();
+                        if (pair.second)
+                            return { blank(), pair.second };
+                        if (peek() != ']')
+                            return error("unmatched open square bracket");
+                        next();
+                        return { pair.first.copy(), nullptr };
+                    }
+                    case ')': {
+                        return error("unmatched close paren");
+                    }
+                    case ']': {
+                        return error("unmatched close square bracket");
+                    }
+                    case '.': {
+                        // Wildcard: every character except newline.
+                        next();
+                        return { CharacterSet({ '\n' }).complement().copy(), nullptr };
+                    }
+                    default: {
+                        auto pair = single_char();
+                        if (pair.second)
+                            return { blank(), pair.second };
+                        return { pair.first.copy(), nullptr };
+                    }
+                }
+            }
+
+            // Parses the contents of a character class up to, but not
+            // including, the closing ']'. A leading '^' negates the set.
+            pair<CharacterSet, const GrammarError *> char_set() {
+                bool is_affirmative = true;
+                if (peek() == '^') {
+                    next();
+                    is_affirmative = false;
+                }
+                CharacterSet result;
+                while (has_more_input() && (peek() != ']')) {
+                    auto pair = single_char();
+                    if (pair.second)
+                        return { CharacterSet(), pair.second };
+                    result.add_set(pair.first);
+                }
+                if (!is_affirmative)
+                    result = result.complement();
+                return { result, nullptr };
+            }
+
+            // Parses a backslash escape, a range such as "a-z", or one literal
+            // character. This may advance `position` past the end of the input
+            // (for example on a trailing backslash or an unfinished range);
+            // peek() is bounds-checked so later reads stay defined.
+            pair<CharacterSet, const GrammarError *> single_char() {
+                CharacterSet value;
+                switch (peek()) {
+                    case '\\':
+                        next();
+                        value = escaped_char(peek());
+                        next();
+                        break;
+                    default:
+                        char first_char = peek();
+                        next();
+                        if (peek() == '-') {
+                            next();
+                            value = CharacterSet({ CharacterRange(first_char, peek()) });
+                            next();
+                        } else {
+                            value = CharacterSet({ first_char });
+                        }
+                }
+                return { value, nullptr };
+            }
+
+            // Maps the character after a backslash to a character set:
+            // 'a' = letters, 'w' = letters and digits, 'd' = digits; any other
+            // character stands for itself.
+            CharacterSet escaped_char(char value) {
+                switch (value) {
+                    case 'a':
+                        return CharacterSet({ {'a', 'z'}, {'A', 'Z'} });
+                    case 'w':
+                        return CharacterSet({ {'a', 'z'}, {'A', 'Z'}, {'0', '9'}});
+                    case 'd':
+                        return CharacterSet({ {'0', '9'} });
+                    default:
+                        return CharacterSet({ value });
+                }
+            }
+
+            void next() {
+                position++;
+            }
+
+            // Returns the current character, or '\0' once the input is
+            // exhausted. The check matters because `position` can end up
+            // greater than `length`, and indexing the string there would be
+            // undefined behaviour.
+            char peek() {
+                return has_more_input() ? input[position] : '\0';
+            }
+
+            bool has_more_input() {
+                return position < length;
+            }
+
+            // Builds a failed result carrying a newly allocated regex error.
+            pair<rule_ptr, const GrammarError *> error(string msg) {
+                return { blank(), new GrammarError(GrammarErrorTypeRegex, msg) };
+            }
+
+            const string input;
+            const size_t length;
+            size_t position;
+        };
+
+        // Parses `input` as a top-level (non-nested) regex and returns the
+        // resulting rule together with an error pointer that is null on success.
+        pair<rule_ptr, const GrammarError *> parse_regex(const std::string &input) {
+            PatternParser parser(input);
+            return parser.rule(false);
+        }
+    }
+}
--- a/src/compiler/prepare_grammar/parse_regex.h
+++ b/src/compiler/prepare_grammar/parse_regex.h
@ -0,0 +1,16 @@
+#ifndef COMPILER_PREPARE_GRAMMAR_PARSE_REGEX_H_
+#define COMPILER_PREPARE_GRAMMAR_PARSE_REGEX_H_
+
+#include "tree_sitter/compiler.h"
+#include <string>
+#include <utility>
+
+namespace tree_sitter {
+    namespace prepare_grammar {
+        // Parses a regex string into a rule tree. The second element of the
+        // result is nullptr on success, or a GrammarError describing the
+        // problem (for example an unmatched paren or square bracket).
+        std::pair<rules::rule_ptr, const GrammarError *>
+        parse_regex(const std::string &);
+    }
+}
+
+
+#endif  // COMPILER_PREPARE_GRAMMAR_PARSE_REGEX_H_
--- a/src/compiler/prepare_grammar/prepare_grammar.cc
+++ b/src/compiler/prepare_grammar/prepare_grammar.cc
@ -2,8 +2,11 @@
 #include "compiler/prepared_grammar.h"
 #include "compiler/prepare_grammar/extract_tokens.h"
 #include "compiler/prepare_grammar/expand_repeats.h"
+#include "compiler/prepare_grammar/expand_tokens.h"
 #include "compiler/prepare_grammar/intern_symbols.h"

+#include "stream_methods.h"
+
 namespace tree_sitter {
    using std::tuple;
    using std::make_tuple;
@ -16,12 +19,17 @@ namespace tree_sitter {
            const GrammarError *error = result.second;

            if (error)
-                return make_tuple(PreparedGrammar({}, {}), PreparedGrammar({}, {}), error);
+                return make_tuple(PreparedGrammar(), PreparedGrammar(), error);

            auto grammars = extract_tokens(grammar);
            const PreparedGrammar &rule_grammar = expand_repeats(grammars.first);
-            const PreparedGrammar &lex_grammar = grammars.second;
-
+            auto expand_tokens_result = expand_tokens(grammars.second);
+            const PreparedGrammar &lex_grammar = expand_tokens_result.first;
+            error = expand_tokens_result.second;
+            
+            if (error)
+                return make_tuple(PreparedGrammar(), PreparedGrammar(), error);
+            
            return make_tuple(rule_grammar, lex_grammar, nullptr);
        }
    }
--- a/src/compiler/prepared_grammar.cc
+++ b/src/compiler/prepared_grammar.cc
@ -10,6 +10,8 @@ namespace tree_sitter {
    using std::ostream;
    using rules::rule_ptr;
    using rules::Symbol;
+    
+    // Default constructor: the Grammar base, aux_rules and options are each
+    // initialized from an empty brace list.
+    PreparedGrammar::PreparedGrammar() : Grammar({}), aux_rules({}), options({}) {}

    PreparedGrammar::PreparedGrammar(const std::vector<std::pair<std::string, rules::rule_ptr>> &rules,
                                     const std::vector<std::pair<std::string, rules::rule_ptr>> &aux_rules) :
--- a/src/compiler/prepared_grammar.h
+++ b/src/compiler/prepared_grammar.h
@ -14,6 +14,7 @@ namespace tree_sitter {

    class PreparedGrammar : public Grammar {
    public:
+        PreparedGrammar();
        PreparedGrammar(const std::vector<std::pair<std::string, rules::rule_ptr>> &rules,
                        const std::vector<std::pair<std::string, rules::rule_ptr>> &aux_rules);
        PreparedGrammar(const std::vector<std::pair<std::string, rules::rule_ptr>> &rules,
--- a/src/compiler/rules/metadata.h
+++ b/src/compiler/rules/metadata.h
@ -11,6 +11,7 @@ namespace tree_sitter  {
            START_TOKEN,
            PRECEDENCE,
            IS_TOKEN,
+            DESCRIPTION,
        } MetadataKey;

        class Metadata : public Rule {
--- a/src/compiler/rules/pattern.cc
+++ b/src/compiler/rules/pattern.cc
@ -1,173 +1,12 @@
 #include "compiler/rules/pattern.h"
-#include <set>
 #include <string>
-#include <vector>
 #include "compiler/rules/visitor.h"
-#include "compiler/rules/choice.h"
-#include "compiler/rules/seq.h"
-#include "compiler/rules/repeat.h"
-#include "compiler/rules/character_set.h"
-#include "compiler/rules/blank.h"
 #include "compiler/util/string_helpers.h"

 namespace tree_sitter {
    namespace rules {
        using std::string;
        using std::hash;
-        using std::make_shared;
-        using std::set;
-        using std::vector;
-
-        class PatternParser {
-        public:
-            explicit PatternParser(const string &input) :
-                input(input),
-                length(input.length()),
-                position(0) {}
-
-            rule_ptr rule() {
-                vector<rule_ptr> choices = { term() };
-                while (has_more_input() && peek() == '|') {
-                    next();
-                    choices.push_back(term());
-                }
-                return (choices.size() > 1) ? Choice::Build(choices) : choices.front();
-            }
-
-        private:
-            rule_ptr term() {
-                rule_ptr result = factor();
-                while (has_more_input() && (peek() != '|') && (peek() != ')'))
-                    result = Seq::Build({ result, factor() });
-                return result;
-            }
-
-            rule_ptr factor() {
-                rule_ptr result = atom();
-                if (has_more_input()) {
-                    switch (peek()) {
-                        case '*':
-                            next();
-                            result = make_shared<Repeat>(result);
-                            break;
-                        case '+':
-                            next();
-                            result = make_shared<Seq>(result, make_shared<Repeat>(result));
-                            break;
-                        case '?':
-                            next();
-                            result = Choice::Build({ result, make_shared<Blank>() });
-                            break;
-                    }
-                }
-                return result;
-            }
-
-            rule_ptr atom() {
-                rule_ptr result;
-                switch (peek()) {
-                    case '(':
-                        next();
-                        result = rule();
-                        if (has_error()) return result;
-                        if (peek() != ')') {
-                            error = "mismatched parens";
-                            return result;
-                        }
-                        next();
-                        break;
-                    case '[':
-                        next();
-                        result = char_set().copy();
-                        if (has_error()) return result;
-                        if (peek() != ']') {
-                            error = "mismatched square brackets";
-                            return result;
-                        }
-                        next();
-                        break;
-                    case ')':
-                        error = "mismatched parens";
-                        break;
-                    case '.':
-                        result = CharacterSet({ '\n' }).complement().copy();
-                        next();
-                        break;
-                    default:
-                        result = single_char().copy();
-                }
-                return result;
-            }
-
-            CharacterSet char_set() {
-                bool is_affirmative = true;
-                if (peek() == '^') {
-                    next();
-                    is_affirmative = false;
-                }
-                CharacterSet result;
-                while (has_more_input() && (peek() != ']'))
-                    result.add_set(single_char());
-                return is_affirmative ? result : result.complement();
-            }
-
-            CharacterSet single_char() {
-                CharacterSet value;
-                switch (peek()) {
-                    case '\\':
-                        next();
-                        value = escaped_char(peek());
-                        if (has_error()) return value;
-                        next();
-                        break;
-                    default:
-                        char first_char = peek();
-                        next();
-                        if (peek() == '-') {
-                            next();
-                            value = CharacterSet({ CharacterRange(first_char, peek()) });
-                            next();
-                        } else {
-                            value = CharacterSet({ first_char });
-                        }
-                }
-                return value;
-            }
-
-            CharacterSet escaped_char(char value) {
-                switch (value) {
-                    case 'a':
-                        return CharacterSet({ {'a', 'z'}, {'A', 'Z'} });
-                    case 'w':
-                        return CharacterSet({ {'a', 'z'}, {'A', 'Z'}, {'0', '9'}});
-                    case 'd':
-                        return CharacterSet({ {'0', '9'} });
-                    default:
-                        return CharacterSet({ value });
-                }
-            }
-
-            void next() {
-                position++;
-            }
-
-            char peek() {
-                return input[position];
-            }
-
-            bool has_more_input() {
-                return position < length;
-            }
-
-            bool has_error() {
-                return error != "";
-            }
-
-            string error;
-            const string input;
-            const size_t length;
-            size_t position;
-        };

        Pattern::Pattern(const string &string) : value(string) {}

@ -191,9 +30,5 @@ namespace tree_sitter {
        void Pattern::accept(Visitor *visitor) const {
            visitor->visit(this);
        }
-
-        rule_ptr Pattern::to_rule_tree() const {
-            return PatternParser(value).rule();
-        }
    }
 }
--- a/src/compiler/rules/pattern.h
+++ b/src/compiler/rules/pattern.h
@ -17,7 +17,6 @@ namespace tree_sitter {
            void accept(Visitor *visitor) const;

            const std::string value;
-            rule_ptr to_rule_tree() const;
        };
    }
 }