Refactor - represent char sets in terms of inclusions and exclusions

2014-08-23 14:25:45 -07:00 · 2014-08-23 14:25:45 -07:00 · 0bb5663f0f
commit 0bb5663f0f
parent 6f374fddff
21 changed files with 1004 additions and 565 deletions
--- a/examples/parsers/arithmetic.c
+++ b/examples/parsers/arithmetic.c
@@ -59,7 +59,8 @@ LEX_FN() {
    switch (lex_state) {
        case 1:
            START_TOKEN();
-            if (('\t' <= lookahead && lookahead <= '\n') ||
+            if ((lookahead == '\t') ||
+                (lookahead == '\n') ||
                (lookahead == '\r') ||
                (lookahead == ' '))
                ADVANCE(1);
@@ -88,7 +89,8 @@ LEX_FN() {
            START_TOKEN();
            if (lookahead == 0)
                ADVANCE(6);
-            if (('\t' <= lookahead && lookahead <= '\n') ||
+            if ((lookahead == '\t') ||
+                (lookahead == '\n') ||
                (lookahead == '\r') ||
                (lookahead == ' '))
                ADVANCE(5);
@@ -117,7 +119,8 @@ LEX_FN() {
            ACCEPT_TOKEN(ts_aux_sym_5);
        case 12:
            START_TOKEN();
-            if (('\t' <= lookahead && lookahead <= '\n') ||
+            if ((lookahead == '\t') ||
+                (lookahead == '\n') ||
                (lookahead == '\r') ||
                (lookahead == ' '))
                ADVANCE(12);
@@ -150,7 +153,8 @@ LEX_FN() {
            START_TOKEN();
            if (lookahead == 0)
                ADVANCE(6);
-            if (('\t' <= lookahead && lookahead <= '\n') ||
+            if ((lookahead == '\t') ||
+                (lookahead == '\n') ||
                (lookahead == '\r') ||
                (lookahead == ' '))
                ADVANCE(15);
@@ -178,7 +182,8 @@ LEX_FN() {
            START_TOKEN();
            if (lookahead == 0)
                ADVANCE(6);
-            if (('\t' <= lookahead && lookahead <= '\n') ||
+            if ((lookahead == '\t') ||
+                (lookahead == '\n') ||
                (lookahead == '\r') ||
                (lookahead == ' '))
                ADVANCE(15);
--- a/examples/parsers/golang.c
+++ b/examples/parsers/golang.c
@@ -442,33 +442,33 @@ LEX_FN() {
                ADVANCE(3);
            LEX_ERROR();
        case 36:
-            if (!((lookahead == '\"') ||
-                (lookahead == '\\')))
-                ADVANCE(36);
            if (lookahead == '\"')
                ADVANCE(37);
            if (lookahead == '\\')
                ADVANCE(38);
+            if (!((lookahead == '\"') ||
+                (lookahead == '\\')))
+                ADVANCE(36);
            LEX_ERROR();
        case 37:
            ACCEPT_TOKEN(ts_sym_string);
        case 38:
-            if (!((lookahead == '\"') ||
-                (lookahead == '\\')))
-                ADVANCE(36);
            if (lookahead == '\"')
                ADVANCE(39);
            if (lookahead == '\\')
                ADVANCE(38);
-            LEX_ERROR();
-        case 39:
            if (!((lookahead == '\"') ||
                (lookahead == '\\')))
                ADVANCE(36);
+            LEX_ERROR();
+        case 39:
            if (lookahead == '\"')
                ADVANCE(37);
            if (lookahead == '\\')
                ADVANCE(38);
+            if (!((lookahead == '\"') ||
+                (lookahead == '\\')))
+                ADVANCE(36);
            ACCEPT_TOKEN(ts_sym_string);
        case 40:
            ACCEPT_TOKEN(ts_aux_sym_1);
@@ -644,7 +644,8 @@ LEX_FN() {
            if (('0' <= lookahead && lookahead <= '9') ||
                ('A' <= lookahead && lookahead <= 'Z') ||
                (lookahead == '_') ||
-                ('a' <= lookahead && lookahead <= 'b') ||
+                (lookahead == 'a') ||
+                (lookahead == 'b') ||
                ('d' <= lookahead && lookahead <= 'z'))
                ADVANCE(33);
            if (lookahead == 'c')
@@ -727,7 +728,8 @@ LEX_FN() {
            if (('0' <= lookahead && lookahead <= '9') ||
                ('A' <= lookahead && lookahead <= 'Z') ||
                (lookahead == '_') ||
-                ('a' <= lookahead && lookahead <= 'b') ||
+                (lookahead == 'a') ||
+                (lookahead == 'b') ||
                ('d' <= lookahead && lookahead <= 'z'))
                ADVANCE(33);
            if (lookahead == 'c')
@@ -940,7 +942,8 @@ LEX_FN() {
                ADVANCE(88);
            if (('A' <= lookahead && lookahead <= 'Z') ||
                ('a' <= lookahead && lookahead <= 'e') ||
-                ('g' <= lookahead && lookahead <= 'h') ||
+                (lookahead == 'g') ||
+                (lookahead == 'h') ||
                ('j' <= lookahead && lookahead <= 'q') ||
                ('s' <= lookahead && lookahead <= 'u') ||
                ('w' <= lookahead && lookahead <= 'z'))
@@ -1310,7 +1313,8 @@ LEX_FN() {
                ADVANCE(88);
            if (('A' <= lookahead && lookahead <= 'Z') ||
                ('a' <= lookahead && lookahead <= 'd') ||
-                ('g' <= lookahead && lookahead <= 'h') ||
+                (lookahead == 'g') ||
+                (lookahead == 'h') ||
                ('j' <= lookahead && lookahead <= 'q') ||
                ('s' <= lookahead && lookahead <= 'u') ||
                ('w' <= lookahead && lookahead <= 'z'))
@@ -1542,7 +1546,8 @@ LEX_FN() {
                ADVANCE(115);
            if (('A' <= lookahead && lookahead <= 'Z') ||
                ('a' <= lookahead && lookahead <= 'e') ||
-                ('g' <= lookahead && lookahead <= 'h') ||
+                (lookahead == 'g') ||
+                (lookahead == 'h') ||
                ('j' <= lookahead && lookahead <= 'q') ||
                ('s' <= lookahead && lookahead <= 'u') ||
                ('w' <= lookahead && lookahead <= 'z'))
@@ -1617,7 +1622,8 @@ LEX_FN() {
                ADVANCE(82);
            if (('A' <= lookahead && lookahead <= 'Z') ||
                ('a' <= lookahead && lookahead <= 'e') ||
-                ('g' <= lookahead && lookahead <= 'h') ||
+                (lookahead == 'g') ||
+                (lookahead == 'h') ||
                ('j' <= lookahead && lookahead <= 'q') ||
                ('s' <= lookahead && lookahead <= 'u') ||
                ('w' <= lookahead && lookahead <= 'z'))
@@ -1703,7 +1709,8 @@ LEX_FN() {
                ADVANCE(145);
            if (('A' <= lookahead && lookahead <= 'Z') ||
                ('a' <= lookahead && lookahead <= 'e') ||
-                ('g' <= lookahead && lookahead <= 'h') ||
+                (lookahead == 'g') ||
+                (lookahead == 'h') ||
                ('j' <= lookahead && lookahead <= 'q') ||
                ('s' <= lookahead && lookahead <= 'u') ||
                ('w' <= lookahead && lookahead <= 'z'))
@@ -1850,9 +1857,11 @@ LEX_FN() {
                ADVANCE(115);
            if (('A' <= lookahead && lookahead <= 'Z') ||
                ('a' <= lookahead && lookahead <= 'd') ||
-                ('g' <= lookahead && lookahead <= 'h') ||
+                (lookahead == 'g') ||
+                (lookahead == 'h') ||
                ('j' <= lookahead && lookahead <= 'l') ||
-                ('n' <= lookahead && lookahead <= 'o') ||
+                (lookahead == 'n') ||
+                (lookahead == 'o') ||
                (lookahead == 'q') ||
                (lookahead == 'u') ||
                ('w' <= lookahead && lookahead <= 'z'))
@@ -1917,7 +1926,8 @@ LEX_FN() {
            if (('0' <= lookahead && lookahead <= '9') ||
                ('A' <= lookahead && lookahead <= 'Z') ||
                (lookahead == '_') ||
-                ('a' <= lookahead && lookahead <= 'b') ||
+                (lookahead == 'a') ||
+                (lookahead == 'b') ||
                ('d' <= lookahead && lookahead <= 'z'))
                ADVANCE(33);
            if (lookahead == 'c')
@@ -2005,7 +2015,8 @@ LEX_FN() {
            if (('0' <= lookahead && lookahead <= '9') ||
                ('A' <= lookahead && lookahead <= 'Z') ||
                (lookahead == '_') ||
-                ('a' <= lookahead && lookahead <= 'b') ||
+                (lookahead == 'a') ||
+                (lookahead == 'b') ||
                ('d' <= lookahead && lookahead <= 'z'))
                ADVANCE(33);
            if (lookahead == 'c')
@@ -2189,9 +2200,11 @@ LEX_FN() {
                ADVANCE(115);
            if (('A' <= lookahead && lookahead <= 'Z') ||
                ('a' <= lookahead && lookahead <= 'd') ||
-                ('g' <= lookahead && lookahead <= 'h') ||
+                (lookahead == 'g') ||
+                (lookahead == 'h') ||
                ('j' <= lookahead && lookahead <= 'l') ||
-                ('n' <= lookahead && lookahead <= 'o') ||
+                (lookahead == 'n') ||
+                (lookahead == 'o') ||
                (lookahead == 'q') ||
                (lookahead == 'u') ||
                ('w' <= lookahead && lookahead <= 'z'))
--- a/examples/parsers/javascript.c
+++ b/examples/parsers/javascript.c
--- a/examples/parsers/json.c
+++ b/examples/parsers/json.c
@@ -60,7 +60,8 @@ LEX_FN() {
    switch (lex_state) {
        case 1:
            START_TOKEN();
-            if (('\t' <= lookahead && lookahead <= '\n') ||
+            if ((lookahead == '\t') ||
+                (lookahead == '\n') ||
                (lookahead == '\r') ||
                (lookahead == ' '))
                ADVANCE(1);
@@ -80,33 +81,33 @@ LEX_FN() {
                ADVANCE(23);
            LEX_ERROR();
        case 2:
-            if (!((lookahead == '\"') ||
-                (lookahead == '\\')))
-                ADVANCE(2);
            if (lookahead == '\"')
                ADVANCE(3);
            if (lookahead == '\\')
                ADVANCE(4);
+            if (!((lookahead == '\"') ||
+                (lookahead == '\\')))
+                ADVANCE(2);
            LEX_ERROR();
        case 3:
            ACCEPT_TOKEN(ts_sym_string);
        case 4:
-            if (!((lookahead == '\"') ||
-                (lookahead == '\\')))
-                ADVANCE(2);
            if (lookahead == '\"')
                ADVANCE(5);
            if (lookahead == '\\')
                ADVANCE(4);
-            LEX_ERROR();
-        case 5:
            if (!((lookahead == '\"') ||
                (lookahead == '\\')))
                ADVANCE(2);
+            LEX_ERROR();
+        case 5:
            if (lookahead == '\"')
                ADVANCE(3);
            if (lookahead == '\\')
                ADVANCE(4);
+            if (!((lookahead == '\"') ||
+                (lookahead == '\\')))
+                ADVANCE(2);
            ACCEPT_TOKEN(ts_sym_string);
        case 6:
            if (lookahead == '.')
@@ -186,7 +187,8 @@ LEX_FN() {
            ACCEPT_TOKEN(ts_builtin_sym_end);
        case 26:
            START_TOKEN();
-            if (('\t' <= lookahead && lookahead <= '\n') ||
+            if ((lookahead == '\t') ||
+                (lookahead == '\n') ||
                (lookahead == '\r') ||
                (lookahead == ' '))
                ADVANCE(26);
@@ -199,7 +201,8 @@ LEX_FN() {
            ACCEPT_TOKEN(ts_aux_sym_4);
        case 28:
            START_TOKEN();
-            if (('\t' <= lookahead && lookahead <= '\n') ||
+            if ((lookahead == '\t') ||
+                (lookahead == '\n') ||
                (lookahead == '\r') ||
                (lookahead == ' '))
                ADVANCE(28);
@@ -224,7 +227,8 @@ LEX_FN() {
            ACCEPT_TOKEN(ts_aux_sym_6);
        case 30:
            START_TOKEN();
-            if (('\t' <= lookahead && lookahead <= '\n') ||
+            if ((lookahead == '\t') ||
+                (lookahead == '\n') ||
                (lookahead == '\r') ||
                (lookahead == ' '))
                ADVANCE(30);
@@ -247,7 +251,8 @@ LEX_FN() {
            LEX_ERROR();
        case 33:
            START_TOKEN();
-            if (('\t' <= lookahead && lookahead <= '\n') ||
+            if ((lookahead == '\t') ||
+                (lookahead == '\n') ||
                (lookahead == '\r') ||
                (lookahead == ' '))
                ADVANCE(33);
@@ -292,7 +297,8 @@ LEX_FN() {
            START_TOKEN();
            if (lookahead == 0)
                ADVANCE(25);
-            if (('\t' <= lookahead && lookahead <= '\n') ||
+            if ((lookahead == '\t') ||
+                (lookahead == '\n') ||
                (lookahead == '\r') ||
                (lookahead == ' '))
                ADVANCE(38);
@@ -320,9 +326,11 @@ LEX_FN() {
                ADVANCE(27);
            LEX_ERROR();
        case ts_lex_state_error:
+            START_TOKEN();
            if (lookahead == 0)
                ADVANCE(25);
-            if (('\t' <= lookahead && lookahead <= '\n') ||
+            if ((lookahead == '\t') ||
+                (lookahead == '\n') ||
                (lookahead == '\r') ||
                (lookahead == ' '))
                ADVANCE(38);
--- a/spec/compiler/build_tables/item_set_transitions_spec.cc
+++ b/spec/compiler/build_tables/item_set_transitions_spec.cc
@@ -1,6 +1,7 @@
 #include "compiler/compiler_spec_helper.h"
 #include "compiler/build_tables/item_set_transitions.h"
 #include "compiler/prepared_grammar.h"
+#include "compiler/helpers/rule_helpers.h"

 using namespace rules;
 using namespace build_tables;
@@ -11,16 +12,16 @@ describe("lexical item set transitions", []() {
  describe("when two items in the set have transitions on the same character", [&]() {
    it("merges the transitions by computing the union of the two item sets", [&]() {
      LexItemSet set1({
-          LexItem(Symbol(1), character({ {'a', 'f'} })),
-          LexItem(Symbol(2), character({ {'e', 'x'} })) });
+          LexItem(Symbol(1), CharacterSet().include('a', 'f').copy()),
+          LexItem(Symbol(2), CharacterSet().include('e', 'x').copy()) });

      AssertThat(char_transitions(set1), Equals(map<CharacterSet, LexItemSet>({
-          { CharacterSet({ {'a', 'd'} }), LexItemSet({
+          { CharacterSet().include('a', 'd'), LexItemSet({
              LexItem(Symbol(1), blank()) }) },
-          { CharacterSet({ {'e', 'f'} }), LexItemSet({
+          { CharacterSet().include('e', 'f'), LexItemSet({
              LexItem(Symbol(1), blank()),
              LexItem(Symbol(2), blank()) }) },
-          { CharacterSet({ {'g', 'x'} }), LexItemSet({
+          { CharacterSet().include('g', 'x'), LexItemSet({
              LexItem(Symbol(2), blank()) }) },
      })));
    });
--- a/spec/compiler/build_tables/merge_transitions_spec.cc
+++ b/spec/compiler/build_tables/merge_transitions_spec.cc
@@ -6,7 +6,7 @@ using namespace build_tables;

 START_TEST

-describe("merging character set transitions", []() {
+describe("merge_char_transitions", []() {
  typedef map<CharacterSet, int> int_map;

  auto do_merge = [&](int_map *left, const pair<CharacterSet, int> &new_pair) {
@@ -18,20 +18,20 @@ describe("merging character set transitions", []() {
  describe("when none of the transitions intersect", [&]() {
    it("returns the union of the two sets of transitions", [&]() {
      int_map map({
-          { CharacterSet({ 'a', 'c' }), 1 },
-          { CharacterSet({ 'x', 'y' }), 2 },
-          { CharacterSet({ '1', '9' }), 4 },
+          { CharacterSet().include('a').include('c'), 1 },
+          { CharacterSet().include('x').include('y'), 2 },
+          { CharacterSet().include('1').include('9'), 4 },
      });

-      do_merge(&map, { CharacterSet({ ' ' }), 8 });
-      do_merge(&map, { CharacterSet({ '\t' }), 16 });
+      do_merge(&map, { CharacterSet().include(' '), 8 });
+      do_merge(&map, { CharacterSet().include('\t'), 16 });

      AssertThat(map, Equals(int_map({
-          { CharacterSet({ 'a', 'c' }), 1 },
-          { CharacterSet({ 'x', 'y' }), 2 },
-          { CharacterSet({ '1', '9' }), 4 },
-          { CharacterSet({ ' ' }), 8 },
-          { CharacterSet({ '\t' }), 16 },
+          { CharacterSet().include('a').include('c'), 1 },
+          { CharacterSet().include('x').include('y'), 2 },
+          { CharacterSet().include('1').include('9'), 4 },
+          { CharacterSet().include(' '), 8 },
+          { CharacterSet().include('\t'), 16 },
      })));
    });
  });
@@ -39,18 +39,33 @@ describe("merging character set transitions", []() {
  describe("when transitions intersect", [&]() {
    it("merges the intersecting transitions using the provided function", [&]() {
      int_map map({
-          { CharacterSet({ {'a', 'f'}, {'A', 'F'} }), 1 },
-          { CharacterSet({ {'0', '9'} }), 2 },
+          { CharacterSet().include('a', 'f').include('A', 'F'), 1 },
+          { CharacterSet().include('0', '9'), 2 },
      });

-      do_merge(&map, { CharacterSet({ 'c' }), 4 });
-      do_merge(&map, { CharacterSet({ '3' }), 8 });
+      do_merge(&map, { CharacterSet().include('c'), 4 });
+      do_merge(&map, { CharacterSet().include('3'), 8 });

      AssertThat(map, Equals(int_map({
-          { CharacterSet({ {'a', 'b'}, {'d', 'f'},  {'A', 'F'} }), 1 },
-          { CharacterSet({ {'c'} }), 5 },
-          { CharacterSet({ {'0', '2'}, {'4', '9'} }), 2 },
-          { CharacterSet({ '3' }), 10 },
+          {
+              CharacterSet()
+                .include('a', 'b')
+                .include('d', 'f')
+                .include('A', 'F'),
+              1
+          },
+          {
+              CharacterSet().include('c'),
+              5
+          },
+          {
+              CharacterSet().include('0', '2').include('4', '9'),
+              2
+          },
+          {
+              CharacterSet().include('3'),
+              10
+          },
      })));
    });
  });
@@ -58,15 +73,15 @@ describe("merging character set transitions", []() {
  describe("when two of the right transitions intersect the same left transition", [&]() {
    it("splits the left-hand transition correctly", [&]() {
      int_map map1({
-          { CharacterSet({ 'a', 'c' }), 1 },
+          { CharacterSet().include('a').include('c'), 1 },
      });

-      do_merge(&map1, { CharacterSet({ 'a' }), 2 });
-      do_merge(&map1, { CharacterSet({ 'c' }), 4 });
+      do_merge(&map1, { CharacterSet().include('a'), 2 });
+      do_merge(&map1, { CharacterSet().include('c'), 4 });

      AssertThat(map1, Equals(int_map({
-          { CharacterSet({ 'a' }), 3 },
-          { CharacterSet({ 'c' }), 5 },
+          { CharacterSet().include('a'), 3 },
+          { CharacterSet().include('c'), 5 },
      })));
    });
  });
--- a/spec/compiler/build_tables/rule_transitions_spec.cc
+++ b/spec/compiler/build_tables/rule_transitions_spec.cc
@@ -8,7 +8,7 @@ using namespace build_tables;

 START_TEST

-describe("rule transitions", []() {
+describe("sym_transitions", []() {
  it("handles symbols", [&]() {
    AssertThat(
        sym_transitions(i_sym(1)),
@@ -74,11 +74,26 @@ describe("rule transitions", []() {
        })));
  });

+  it("preserves metadata", [&]() {
+    map<MetadataKey, int> metadata_value({
+        { PRECEDENCE, 5 }
+    });
+
+    rule_ptr rule = make_shared<Metadata>(seq({ i_sym(1), i_sym(2) }), metadata_value);
+    AssertThat(
+        sym_transitions(rule),
+        Equals(rule_map<Symbol>({
+            { Symbol(1), make_shared<Metadata>(i_sym(2), metadata_value)},
+        })));
+  });
+});
+
+describe("char_transitions", []() {
  it("handles characters", [&]() {
    AssertThat(
        char_transitions(character({ '1' })),
        Equals(rule_map<CharacterSet>({
-            { CharacterSet({ '1' }), blank() }
+            { CharacterSet().include('1'), blank() }
        })));
  });

@@ -92,9 +107,35 @@ describe("rule transitions", []() {
                character({ { 'm', 'z' } }),
                sym("y") }) })),
        Equals(rule_map<CharacterSet>({
-            { CharacterSet({ {'a','l'} }), sym("x") },
-            { CharacterSet({ {'m','s'} }), choice({ sym("x"), sym("y") }) },
-            { CharacterSet({ {'t','z'} }), sym("y") },
+            { CharacterSet().include('a','l'), sym("x") },
+            { CharacterSet().include('m','s'), choice({ sym("x"), sym("y") }) },
+            { CharacterSet().include('t','z'), sym("y") },
+        })));
+  });
+
+  it("handles choices between whitelisted and blacklisted character sets", [&]() {
+    AssertThat(
+        char_transitions(seq({
+            choice({
+                character({ '/' }, false),
+                seq({
+                    character({ '\\' }),
+                    character({ '/' }) }) }),
+            character({ '/' }) })),
+
+        Equals(rule_map<CharacterSet>({
+            { CharacterSet()
+                  .include_all()
+                  .exclude('/')
+                  .exclude('\\'),
+              character({ '/' }) },
+            { CharacterSet()
+                  .include('\\'),
+              seq({
+                  choice({
+                      blank(),
+                      character({ '/' }) }),
+                  character({ '/' }) }) },
        })));
  });

@@ -108,8 +149,8 @@ describe("rule transitions", []() {
                character({ { 'a', 'z' } }),
                sym("y") }) })),
        Equals(rule_map<CharacterSet>({
-            { CharacterSet({ {'a', 'c'} }), choice({ sym("x"), sym("y") }) },
-            { CharacterSet({ {'d', 'z'} }), sym("y") },
+            { CharacterSet().include('a', 'c'), choice({ sym("x"), sym("y") }) },
+            { CharacterSet().include('d', 'z'), sym("y") },
        })));

    AssertThat(
@@ -121,10 +162,9 @@ describe("rule transitions", []() {
                character({ {'a', 'c'} }),
                sym("y") }) })),
        Equals(rule_map<CharacterSet>({
-            { CharacterSet({ {'a', 'c'} }), choice({ sym("x"), sym("y") }) },
-            { CharacterSet({ {'d', 'z'} }), sym("x") },
+            { CharacterSet().include('a', 'c'), choice({ sym("x"), sym("y") }) },
+            { CharacterSet().include('d', 'z'), sym("x") },
        })));
-
  });

  it("handles blanks", [&]() {
@@ -137,7 +177,7 @@ describe("rule transitions", []() {
        char_transitions(rule),
        Equals(rule_map<CharacterSet>({
            {
-                CharacterSet({ 'a' }),
+                CharacterSet().include('a'),
                seq({
                    character({ 'b' }),
                    rule,
@@ -148,41 +188,9 @@ describe("rule transitions", []() {
    AssertThat(
        char_transitions(rule),
        Equals(rule_map<CharacterSet>({
-            { CharacterSet({ 'a' }), rule }
+            { CharacterSet().include('a'), rule }
        })));
  });
-
-  it("preserves metadata", [&]() {
-    map<MetadataKey, int> metadata_value({
-        { PRECEDENCE, 5 }
-    });
-
-    rule_ptr rule = make_shared<Metadata>(seq({ i_sym(1), i_sym(2) }), metadata_value);
-    AssertThat(
-        sym_transitions(rule),
-        Equals(rule_map<Symbol>({
-            { Symbol(1), make_shared<Metadata>(i_sym(2), metadata_value)},
-        })));
-  });
-
-  describe("regression tests (somewhat redundant, should maybe be deleted later)", []() {
-    it("handles sequences that start with repeating characters", [&]() {
-      auto rule = seq({
-          choice({
-              repeat(character({ '"' }, false)),
-              blank(),
-          }),
-          character({ '"' }),
-      });
-
-      AssertThat(char_transitions(rule), Equals(rule_map<CharacterSet>({
-          { CharacterSet({ '"' }).complement(), seq({
-              repeat(character({ '"' }, false)),
-              character({ '"' }), }) },
-          { CharacterSet({ '"' }), blank() },
-      })));
-    });
-  });
 });

 END_TEST
--- a/spec/compiler/helpers/containers.h
+++ b/spec/compiler/helpers/containers.h
@@ -44,7 +44,7 @@ class rule_list : public vector<pair<string, rule_ptr>> {
      return true;
  }

-  rule_list(const initializer_list<pair<string, rule_ptr>> &list) : 
+  rule_list(const initializer_list<pair<string, rule_ptr>> &list) :
      vector<pair<string, rule_ptr>>(list) {}
 };

--- a/spec/compiler/helpers/rule_helpers.cc
+++ b/spec/compiler/helpers/rule_helpers.cc
@@ -9,14 +9,20 @@ namespace tree_sitter {

  namespace rules {
    rule_ptr character(const set<CharacterRange> &ranges) {
-      return make_shared<CharacterSet>(ranges);
+      return character(ranges, true);
    }

    rule_ptr character(const set<CharacterRange> &ranges, bool sign) {
-      if (sign)
-        return character(ranges);
-      else
-        return CharacterSet(ranges).complement().copy();
+      CharacterSet result;
+      if (sign) {
+        for (auto &range : ranges)
+          result.include(range.min, range.max);
+      } else {
+        result.include_all();
+        for (auto &range : ranges)
+          result.exclude(range.min, range.max);
+      }
+      return result.copy();
    }

    rule_ptr i_sym(size_t index) {
--- a/spec/compiler/prepare_grammar/expand_repeats_spec.cc
+++ b/spec/compiler/prepare_grammar/expand_repeats_spec.cc
@@ -71,7 +71,7 @@ describe("expanding repeat rules in a grammar", []() {
    AssertThat(match.rules, Equals(rule_list({
        { "rule0", seq({ i_aux_sym(0), i_aux_sym(1) }) },
    })));
-    
+
    AssertThat(match.aux_rules, Equals(rule_list({
        { "rule0_repeat0", choice({
            seq({
--- a/spec/compiler/prepare_grammar/parse_regex_spec.cc
+++ b/spec/compiler/prepare_grammar/parse_regex_spec.cc
@@ -6,7 +6,7 @@ START_TEST
 using namespace rules;
 using prepare_grammar::parse_regex;

-describe("parsing regex patterns", []() {
+describe("parse_regex", []() {
  struct ValidInputRow {
    string description;
    string pattern;
@@ -23,7 +23,7 @@ describe("parsing regex patterns", []() {
      {
          "'.' characters as wildcards",
          ".",
-          CharacterSet({'\n'}).complement().copy()
+          character({ '\n' }, false)
      },

      {
@@ -170,6 +170,19 @@ describe("parsing regex patterns", []() {
                  blank()
              })
          })
+      },
+
+      {
+          "choices containing negated character classes",
+          "/([^/]|(\\\\/))*/",
+          seq({
+              character({ '/' }),
+              repeat(choice({
+                  character({ '/' }, false),
+                  seq({ character({ '\\' }), character({ '/' }) }),
+              })),
+              character({ '/' }),
+          }),
      }
  };

--- a/spec/compiler/rules/character_set_spec.cc
+++ b/spec/compiler/rules/character_set_spec.cc
@ -5,106 +5,327 @@ using namespace rules;

 START_TEST

-describe("character sets", []() {
-  unsigned char max_char = 255;
+describe("CharacterSet", []() {
+  describe("equality", [&]() {
+    it("returns true for identical character sets", [&]() {
+      CharacterSet set1 = CharacterSet()
+          .include('a', 'd')
+          .include('f', 'm');

-  describe("computing the complement", [&]() {
-    it("works for the set containing only the null character", [&]() {
-      CharacterSet set1({ '\0' });
-      auto set2 = set1.complement();
-      AssertThat(set2, Equals(CharacterSet({
-          { 1, max_char }
-      })));
-      AssertThat(set2.complement(), Equals(set1));
+      CharacterSet set2 = CharacterSet()
+          .include('a', 'd')
+          .include('f', 'm');
+
+      AssertThat(set1, Equals(set2));
    });

-    it("works for single character sets", [&]() {
-      CharacterSet set1({ 'b' });
-      auto set2 = set1.complement();
-      AssertThat(set2, Equals(CharacterSet({
-          { 0, 'a' },
-          { 'c', max_char },
-      })));
-      AssertThat(set2.complement(), Equals(set1));
+    it("returns false for character sets that include different ranges", [&]() {
+      CharacterSet set1 = CharacterSet()
+          .include('a', 'd')
+          .include('f', 'm');
+
+      CharacterSet set2 = CharacterSet()
+          .include('a', 'c')
+          .include('f', 'm');
+
+      AssertThat(set1, !Equals(set2));
+      AssertThat(set2, !Equals(set1));
+    });
+
+    it("returns false for character sets that exclude different ranges", [&]() {
+      CharacterSet set1 = CharacterSet()
+          .include_all()
+          .exclude('a', 'd')
+          .exclude('f', 'm');
+
+      CharacterSet set2 = CharacterSet()
+          .include_all()
+          .exclude('a', 'c')
+          .exclude('f', 'm');
+
+      AssertThat(set1, !Equals(set2));
+      AssertThat(set2, !Equals(set1));
+    });
+
+    it("returns false for character sets with different sign", [&]() {
+      CharacterSet set1 = CharacterSet().include_all();
+      CharacterSet set2 = CharacterSet();
+
+      AssertThat(set1, !Equals(set2));
+      AssertThat(set2, !Equals(set1));
    });
  });

-  describe("computing unions", [&]() {
-    it("works for disjoint sets", [&]() {
-      CharacterSet set({ {'a', 'z'} });
-      set.add_set(CharacterSet({ {'A', 'Z'} }));
-      AssertThat(set, Equals(CharacterSet({ {'a', 'z'}, {'A', 'Z'} })));
+  describe("hashing", [&]() {
+    it("returns the same number for identical character sets", [&]() {
+      CharacterSet set1 = CharacterSet()
+          .include('a', 'd')
+          .include('f', 'm');
+
+      CharacterSet set2 = CharacterSet()
+          .include('a', 'd')
+          .include('f', 'm');
+
+      AssertThat(set1.hash_code(), Equals(set2.hash_code()));
    });

-    it("works for sets with adjacent ranges", [&]() {
-      CharacterSet set({ CharacterRange('a', 'r') });
-      set.add_set(CharacterSet({ CharacterRange('s', 'z') }));
-      AssertThat(set, Equals(CharacterSet({ {'a', 'z'} })));
+    it("returns different numbers for character sets that include different ranges", [&]() {
+      CharacterSet set1 = CharacterSet()
+          .include('a', 'd')
+          .include('f', 'm');
+
+      CharacterSet set2 = CharacterSet()
+          .include('a', 'c')
+          .include('f', 'm');
+
+      AssertThat(set1.hash_code(), !Equals(set2.hash_code()));
+      AssertThat(set2.hash_code(), !Equals(set1.hash_code()));
    });

-    it("becomes the complete set when the complement is added", [&]() {
-      CharacterSet set({ 'c' });
-      auto complement = set.complement();
-      set.add_set(complement);
-      AssertThat(set, Equals(CharacterSet({ {0, max_char} })));
+    it("returns different numbers for character sets that exclude different ranges", [&]() {
+      CharacterSet set1 = CharacterSet()
+          .include_all()
+          .exclude('a', 'd')
+          .exclude('f', 'm');
+
+      CharacterSet set2 = CharacterSet()
+          .include_all()
+          .exclude('a', 'c')
+          .exclude('f', 'm');
+
+      AssertThat(set1.hash_code(), !Equals(set2.hash_code()));
+      AssertThat(set2.hash_code(), !Equals(set1.hash_code()));
    });

-    it("works when the result becomes a continuous range", []() {
-      CharacterSet set({ {'a', 'd'}, {'f', 'z'} });
-      set.add_set(CharacterSet({ {'c', 'g'} }));
-      AssertThat(set, Equals(CharacterSet({ {'a', 'z'} })));
-    });
+    it("returns different numbers for character sets with different sign", [&]() {
+      CharacterSet set1 = CharacterSet().include_all();
+      CharacterSet set2 = CharacterSet();

-    it("does nothing for the set of all characters", [&]() {
-      CharacterSet set({ 'a' });
-      set.add_set(set.complement());
-      AssertThat(set, Equals(CharacterSet({ {'\0', max_char} })));
+      AssertThat(set1.hash_code(), !Equals(set2.hash_code()));
+      AssertThat(set2.hash_code(), !Equals(set1.hash_code()));
    });
  });

-  describe("subtracting sets", []() {
+  describe("::is_empty", [&]() {
+    it("returns true for empty character sets", [&]() {
+      AssertThat(CharacterSet().is_empty(), Equals(true));
+    });
+
+    it("returns false for full character sets", [&]() {
+      AssertThat(CharacterSet().include_all().is_empty(), Equals(false));
+    });
+
+    it("returns false for character sets that include some characters", [&]() {
+      AssertThat(CharacterSet().include('x').is_empty(), Equals(false));
+    });
+  });
+
+  describe("::include", [&]() {
+    describe("when the set has a whitelist of characters", [&]() {
+      it("adds included characters", [&]() {
+        CharacterSet set1 = CharacterSet().include('a', 'd');
+        AssertThat(set1, Equals(CharacterSet()
+            .include('a')
+            .include('b')
+            .include('c')
+            .include('d')));
+      });
+    });
+
+    describe("when the set has a blacklist of characters", [&]() {
+      it("removes excluded characters", [&]() {
+        CharacterSet set1 = CharacterSet()
+            .include_all()
+            .exclude('a', 'g')
+            .include('c', 'e');
+        AssertThat(set1, Equals(CharacterSet()
+            .include_all()
+            .exclude('a')
+            .exclude('b')
+            .exclude('f')
+            .exclude('g')));
+      });
+
+      it("does nothing if the character are already not excluded", [&]() {
+        CharacterSet set1 = CharacterSet()
+            .include_all()
+            .include('a', 'c');
+        AssertThat(set1, Equals(CharacterSet().include_all()));
+      });
+    });
+  });
+
+  describe("::exclude", [&]() {
+    describe("when the set has a whitelist of characters", [&]() {
+      it("removes included characters", [&]() {
+        CharacterSet set1 = CharacterSet()
+            .include('a', 'g')
+            .exclude('c', 'e');
+        AssertThat(set1, Equals(CharacterSet()
+            .include('a')
+            .include('b')
+            .include('f')
+            .include('g')));
+      });
+
+      it("does nothing if the character's are already not included", [&]() {
+        CharacterSet set1 = CharacterSet().exclude('a', 'c');
+        AssertThat(set1, Equals(CharacterSet()));
+      });
+    });
+
+    describe("when the set has a blacklist of characters", [&]() {
+      it("removes excluded characters", [&]() {
+        CharacterSet set1 = CharacterSet()
+            .include_all()
+            .exclude('a', 'd');
+        AssertThat(set1, Equals(CharacterSet()
+            .include_all()
+            .exclude('a')
+            .exclude('b')
+            .exclude('c')
+            .exclude('d')));
+      });
+    });
+  });
+
+  describe("::remove_set", []() {
    CharacterSet intersection;

-    it("works for disjoint sets", [&]() {
-      CharacterSet set1({ {'a', 'z'} });
-      intersection = set1.remove_set(CharacterSet({ {'A', 'Z'} }));
-      AssertThat(set1, Equals(CharacterSet({ {'a', 'z'} })));
-      AssertThat(intersection, Equals(CharacterSet()));
+    describe("for a set with whitelisted characters", [&]() {
+      describe("when the subtractend has whitelisted characters", [&]() {
+        it("removes the included characters that the other set also includes", [&]() {
+          CharacterSet set1 = CharacterSet().include('a', 'z');
+          set1.remove_set(CharacterSet().include('d', 's'));
+          AssertThat(set1, Equals(CharacterSet()
+              .include('a', 'c')
+              .include('t', 'z')));
+        });
+
+        it("returns the characters that were removed", [&]() {
+          CharacterSet set1 = CharacterSet().include('a', 'z');
+          intersection = set1.remove_set(CharacterSet().include('d', 's'));
+          AssertThat(intersection, Equals(CharacterSet()
+              .include('d', 's')));
+        });
+
+        it("returns the empty set when the sets are disjoint", [&]() {
+          CharacterSet set1 = CharacterSet().include('a', 'z');
+          intersection = set1.remove_set(CharacterSet().include('A', 'Z'));
+          AssertThat(set1, Equals(CharacterSet().include('a', 'z')));
+          AssertThat(intersection, Equals(CharacterSet()));
+        });
+      });
+
+      describe("when the subtractend has blacklisted characters", [&]() {
+        it("removes the included characters that are not excluded by the other set", [&]() {
+          CharacterSet set1 = CharacterSet().include('a', 'f');
+
+          intersection = set1.remove_set(CharacterSet()
+              .include_all()
+              .exclude('d', 'z'));
+
+          AssertThat(set1, Equals(CharacterSet()
+              .include('d', 'f')));
+          AssertThat(intersection, Equals(CharacterSet()
+              .include('a', 'c')));
+        });
+      });
    });

-    it("works when one set is a proper subset of the other", [&]() {
-      CharacterSet set1({ {'a','z'} });
-      intersection = set1.remove_set(CharacterSet({ {'d', 's'} }));
-      AssertThat(set1, Equals(CharacterSet({ {'a', 'c'}, {'t', 'z'} })));
-      AssertThat(intersection, Equals(CharacterSet({ {'d', 's'} })));
+    describe("for a set with blacklisted characters", [&]() {
+      describe("when the subtractend has whitelisted characters", [&]() {
+        it("adds the subtractend's inclusions to the receiver's exclusions", [&]() {
+          CharacterSet set1 = CharacterSet()
+              .include_all()
+              .exclude('a', 'f');
+
+          intersection = set1.remove_set(CharacterSet()
+              .include('x', 'z'));
+
+          AssertThat(set1, Equals(CharacterSet()
+              .include_all()
+              .exclude('a', 'f')
+              .exclude('x', 'z')));
+
+          AssertThat(intersection, Equals(CharacterSet().include('x', 'z')));
+        });
+      });
+
+      describe("when the subtractend has blacklisted characters", [&]() {
+        it("includes only the characters excluded by the subtractend but not by the receiver", [&]() {
+          CharacterSet set1 = CharacterSet()
+              .include_all()
+              .exclude('a', 'm');
+
+          set1.remove_set(CharacterSet()
+              .include_all()
+              .exclude('d', 'z'));
+
+          AssertThat(set1, Equals(CharacterSet()
+              .include('n', 'z')));
+        });
+
+        it("returns the characters excluded by neither set", [&]() {
+          CharacterSet set1 = CharacterSet()
+              .include_all()
+              .exclude('a', 'm');
+
+          intersection = set1.remove_set(CharacterSet()
+              .include_all()
+              .exclude('d', 'z'));
+
+          AssertThat(intersection, Equals(CharacterSet()
+              .include_all()
+              .exclude('a', 'z')));
+        });
+
+        it("works when the sets are disjoint", [&]() {
+          CharacterSet set1 = CharacterSet()
+              .include_all()
+              .exclude('a', 'm');
+
+          intersection = set1.remove_set(CharacterSet()
+              .include_all()
+              .exclude('d', 'z'));
+
+          AssertThat(set1, Equals(CharacterSet()
+              .include('n', 'z')));
+
+          AssertThat(intersection, Equals(CharacterSet()
+              .include_all()
+              .exclude('a', 'z')));
+        });
+      });
+    });
+  });
+
+  describe("::included_ranges", [&]() {
+    it("consolidates sequences of 3 or more consecutive characters into ranges", [&]() {
+      CharacterSet set1 = CharacterSet()
+          .include('a', 'c')
+          .include('g')
+          .include('z');
+
+      AssertThat(set1.included_ranges(), Equals(vector<CharacterRange>({
+          CharacterRange('a', 'c'),
+          CharacterRange('g'),
+          CharacterRange('z'),
+      })));
    });

-    it("works for a set that overlaps the right side", [&]() {
-      CharacterSet set1({ {'a','s'} });
-      intersection = set1.remove_set(CharacterSet({ {'m', 'z'} }));
-      AssertThat(set1, Equals(CharacterSet({ {'a', 'l'} })));
-      AssertThat(intersection, Equals(CharacterSet({ {'m', 's'} })));
-    });
+    it("doesn't consolidate sequences of 2 consecutive characters", [&]() {
+      CharacterSet set1 = CharacterSet()
+          .include('a', 'b')
+          .include('g')
+          .include('z');

-    it("works for a set that overlaps the left side", [&]() {
-      CharacterSet set2({ {'m','z'} });
-      intersection = set2.remove_set(CharacterSet({ {'a', 's'} }));
-      AssertThat(set2, Equals(CharacterSet({ {'t', 'z'} })));
-      AssertThat(intersection, Equals(CharacterSet({ {'m', 's'} })));
-    });
-
-    it("works for sets with multiple ranges", [&]() {
-      CharacterSet set1({ {'a', 'd'}, {'m', 'z'} });
-      intersection = set1.remove_set(CharacterSet({ {'c', 'o'}, {'s', 'x'} }));
-      AssertThat(set1, Equals(CharacterSet({ {'a', 'b'}, {'p', 'r'}, {'y', 'z'} })));
-      AssertThat(intersection, Equals(CharacterSet({ {'c', 'd'}, {'m', 'o'}, {'s', 'x'} })));
-    });
-
-    it("works when the result is empty", [&]() {
-      CharacterSet set1({ 'd' });
-      intersection = set1.remove_set(CharacterSet({ 'a', 'd', 'x' }));
-      AssertThat(set1, Equals(CharacterSet()));
-      AssertThat(intersection, Equals(CharacterSet({ 'd' })));
+      AssertThat(set1.included_ranges(), Equals(vector<CharacterRange>({
+          CharacterRange('a'),
+          CharacterRange('b'),
+          CharacterRange('g'),
+          CharacterRange('z'),
+      })));
    });
  });
 });
--- a/src/compiler/build_tables/build_lex_table.cc
+++ b/src/compiler/build_tables/build_lex_table.cc
@ -38,8 +38,8 @@ class LexTableBuilder {
      if (symbol == rules::ERROR())
        continue;
      else if (symbol == rules::END_OF_INPUT())
-        result.insert(
-            LexItem(symbol, after_separators(CharacterSet({ 0 }).copy())));
+        result.insert(LexItem(
+            symbol, after_separators(CharacterSet().include(0).copy())));
      else if (symbol.is_token())
        result.insert(
            LexItem(symbol, after_separators(lex_grammar.rule(symbol))));
@ -52,9 +52,11 @@ class LexTableBuilder {
    if (pair == lex_state_ids.end()) {
      LexStateId state_id = lex_table.add_state();
      lex_state_ids[item_set] = state_id;
+
      add_accept_token_actions(item_set, state_id);
      add_advance_actions(item_set, state_id);
      add_token_start(item_set, state_id);
+
      return state_id;
    } else {
      return pair->second;
@ -100,10 +102,10 @@ class LexTableBuilder {
  }

  CharacterSet separator_set() const {
+    // Builds one CharacterSet that includes every character listed in
+    // lex_grammar.separators, added one at a time via include().
-    set<rules::CharacterRange> ranges;
+    CharacterSet result;
    for (char c : lex_grammar.separators)
-      ranges.insert(c);
-    return CharacterSet(ranges);
+      result.include(c);
+    return result;
  }

  rules::rule_ptr after_separators(rules::rule_ptr rule) {
--- a/src/compiler/build_tables/merge_transitions.h
+++ b/src/compiler/build_tables/merge_transitions.h
@ -68,7 +68,7 @@ void merge_char_transition(std::map<rules::CharacterSet, T> *left,
  left->insert(pairs_to_insert.begin(), pairs_to_insert.end());

  if (!new_char_set.is_empty())
-    left->insert({ new_char_set, new_pair.second });
+    left->insert({ new_char_set, new_value });
 }

 }  // namespace build_tables
--- a/src/compiler/generate_code/c_code.cc
+++ b/src/compiler/generate_code/c_code.cc
@ -245,12 +245,13 @@ class CCodeGenerator {
    }
  }

-  void condition_for_character_set(const rules::CharacterSet &set) {
-    if (set.ranges.size() == 1) {
-      add(condition_for_character_range(*set.ranges.begin()));
+  void condition_for_character_ranges(
+      const vector<rules::CharacterRange> &ranges) {
+    if (ranges.size() == 1) {
+      add(condition_for_character_range(*ranges.begin()));
    } else {
      bool first = true;
-      for (auto &match : set.ranges) {
+      for (auto &match : ranges) {
        string part = "(" + condition_for_character_range(match) + ")";
        if (first) {
          add(part);
@ -263,15 +264,13 @@ class CCodeGenerator {
    }
  }

-  void condition_for_character_rule(const rules::CharacterSet &rule) {
-    pair<rules::CharacterSet, bool> representation =
-        rule.most_compact_representation();
-    if (representation.second) {
-      condition_for_character_set(representation.first);
-    } else {
-      add("!(");
-      condition_for_character_set(rule.complement());
-      add(")");
+  // Emits the C condition that tests whether the lookahead belongs to `rule`.
+  // A set flagged includes_all is written as the negation of its excluded
+  // ranges; any other set is written as the disjunction of its included
+  // ranges.
+  void condition_for_character_set(const rules::CharacterSet &rule) {
+    if (rule.includes_all) {
+      const vector<rules::CharacterRange> excluded = rule.excluded_ranges();
+      if (excluded.empty()) {
+        // Nothing is excluded, so every lookahead matches. Negating an empty
+        // list of ranges would emit "!()", which is not a valid C expression.
+        add("1");
+      } else {
+        add("!(");
+        condition_for_character_ranges(excluded);
+        add(")");
+      }
+    } else {
+      condition_for_character_ranges(rule.included_ranges());
    }
  }

@ -319,7 +318,7 @@ class CCodeGenerator {
      line("START_TOKEN();");
    for (auto pair : lex_state.actions)
      if (!pair.first.is_empty())
-        _if([&]() { condition_for_character_rule(pair.first); },
+        _if([&]() { condition_for_character_set(pair.first); },
            [&]() { code_for_lex_actions(pair.second, expected_inputs); });
    code_for_lex_actions(lex_state.default_action, expected_inputs);
  }
--- a/src/compiler/prepare_grammar/expand_tokens.cc
+++ b/src/compiler/prepare_grammar/expand_tokens.cc
@ -28,7 +28,7 @@ class ExpandTokens : public rules::IdentityRuleFn {
  rule_ptr apply_to(const String *rule) {
+    // Expands a String rule into a sequence with one single-character
+    // CharacterSet per character of rule->value, in order.
    vector<rule_ptr> elements;
    for (char val : rule->value)
-      elements.push_back(rules::CharacterSet({ val }).copy());
+      elements.push_back(rules::CharacterSet().include(val).copy());
    return rules::Seq::Build(elements);
  }

--- a/src/compiler/prepare_grammar/parse_regex.cc
+++ b/src/compiler/prepare_grammar/parse_regex.cc
@ -115,7 +115,7 @@ class PatternParser {
      case ']': { return error("unmatched close square bracket"); }
      case '.': {
        next();
-        return { CharacterSet({ '\n' }).complement().copy(), nullptr };
+        return { CharacterSet().include_all().exclude('\n').copy(), nullptr };
      }
      default: {
        auto pair = single_char();
@ -127,20 +127,24 @@ class PatternParser {
  }

  pair<CharacterSet, const GrammarError *> char_set() {
+    // Parses the members of a character class up to (not including) the
+    // closing ']'. Returns the resulting set with a null error, or an empty
+    // set together with the error reported by single_char().
+    CharacterSet result;
    bool is_affirmative = true;
    if (peek() == '^') {
      next();
      is_affirmative = false;
+      // A negated class starts from "everything"; each member parsed below
+      // is then removed from it.
+      result.include_all();
    }
-    CharacterSet result;
+
    while (has_more_input() && (peek() != ']')) {
      auto pair = single_char();
      if (pair.second)
        return { CharacterSet(), pair.second };
-      result.add_set(pair.first);
+      if (is_affirmative)
+        result.add_set(pair.first);
+      else
+        result.remove_set(pair.first);
    }
-    if (!is_affirmative)
-      result = result.complement();
+
    return { result, nullptr };
  }

@ -157,10 +161,10 @@ class PatternParser {
        next();
        if (peek() == '-') {
          next();
-          value = CharacterSet({ CharacterRange(first_char, peek()) });
+          value = CharacterSet().include(first_char, peek());
          next();
        } else {
-          value = CharacterSet({ first_char });
+          value = CharacterSet().include(first_char);
        }
    }
    return { value, nullptr };
@ -169,19 +173,20 @@ class PatternParser {
  CharacterSet escaped_char(char value) {
+    // Returns the character set denoted by the escape code `value`:
+    //   'a' -> ASCII letters, 'w' -> ASCII letters and digits, 'd' -> digits,
+    //   't' / 'n' / 'r' -> tab / newline / carriage return,
+    //   anything else -> the set containing only `value` itself.
    switch (value) {
      case 'a':
-        return CharacterSet({ { 'a', 'z' }, { 'A', 'Z' } });
+        return CharacterSet().include('a', 'z').include('A', 'Z');
      case 'w':
-        return CharacterSet({ { 'a', 'z' }, { 'A', 'Z' }, { '0', '9' } });
+        return CharacterSet().include('a', 'z').include('A', 'Z').include('0',
+                                                                          '9');
      case 'd':
-        return CharacterSet({ { '0', '9' } });
+        return CharacterSet().include('0', '9');
      case 't':
-        return CharacterSet({ '\t' });
+        return CharacterSet().include('\t');
      case 'n':
-        return CharacterSet({ '\n' });
+        return CharacterSet().include('\n');
      case 'r':
-        return CharacterSet({ '\r' });
+        return CharacterSet().include('\r');
      default:
-        return CharacterSet({ value });
+        return CharacterSet().include(value);
    }
  }

--- a/src/compiler/rules/character_range.cc
+++ b/src/compiler/rules/character_range.cc
@ -5,6 +5,7 @@
 namespace tree_sitter {
 namespace rules {

+using std::ostream;
 using std::string;

 static const unsigned char MAX_CHAR = -1;
@ -53,5 +54,9 @@ string CharacterRange::to_string() const {
    return string() + escape_character(min) + "-" + escape_character(max);
 }

+// Writes the textual form of `range` (as produced by to_string()) to
+// `stream` and returns the same stream so insertions can be chained.
+ostream &operator<<(ostream &stream, const CharacterRange &range) {
+  stream << range.to_string();
+  return stream;
+}
+
 }  // namespace rules
 }  // namespace tree_sitter
--- a/src/compiler/rules/character_range.h
+++ b/src/compiler/rules/character_range.h
@ -20,6 +20,8 @@ struct CharacterRange {
  std::string to_string() const;
 };

+// Streams the to_string() form of a CharacterRange.
+std::ostream &operator<<(std::ostream &stream, const CharacterRange &range);
+
 }  // namespace rules
 }  // namespace tree_sitter

--- a/src/compiler/rules/character_set.cc
+++ b/src/compiler/rules/character_set.cc
@ -1,6 +1,7 @@
 #include "compiler/rules/character_set.h"
 #include <string>
 #include <utility>
+#include <vector>
 #include "compiler/rules/visitor.h"

 namespace tree_sitter {
@ -9,32 +10,87 @@ namespace rules {
 using std::string;
 using std::hash;
 using std::set;
-using std::pair;
-using std::initializer_list;
+using std::vector;

-static const unsigned char MAX_CHAR = -1;
+// Inserts every character from range.min through range.max (inclusive)
+// into *characters.
+static void add_range(set<uint32_t> *characters, CharacterRange range) {
+  uint32_t current = range.min;
+  while (current <= range.max) {
+    characters->insert(current);
+    current++;
+  }
+}

-CharacterSet::CharacterSet() : ranges({}) {}
-CharacterSet::CharacterSet(const set<CharacterRange> &ranges)
-    : ranges(ranges) {}
-CharacterSet::CharacterSet(const initializer_list<CharacterRange> &ranges)
-    : ranges(ranges) {}
+// Erases every character from range.min through range.max (inclusive)
+// from *characters; characters that are absent are simply skipped.
+static void remove_range(set<uint32_t> *characters, CharacterRange range) {
+  uint32_t current = range.min;
+  while (current <= range.max) {
+    characters->erase(current);
+    current++;
+  }
+}
+
+// Erases from *left every character that also appears in `right`, and
+// returns the characters that were actually erased (the overlap).
+static set<uint32_t> remove_chars(set<uint32_t> *left,
+                                  const set<uint32_t> &right) {
+  set<uint32_t> removed;
+  for (auto it = right.begin(); it != right.end(); ++it) {
+    const uint32_t ch = *it;
+    const bool was_present = left->erase(ch) != 0;
+    if (was_present)
+      removed.insert(ch);
+  }
+  return removed;
+}
+
+// Inserts every character of `right` into *left, and returns the characters
+// that were newly inserted (those *left did not already contain).
+static set<uint32_t> add_chars(set<uint32_t> *left,
+                               const set<uint32_t> &right) {
+  set<uint32_t> added;
+  for (auto it = right.begin(); it != right.end(); ++it) {
+    const uint32_t ch = *it;
+    const bool was_new = left->insert(ch).second;
+    if (was_new)
+      added.insert(ch);
+  }
+  return added;
+}
+
+// Converts a set of characters into a list of CharacterRanges, visiting the
+// characters in ascending order (std::set iteration order). A run of
+// consecutive characters is collapsed into a single range only once it is at
+// least three characters long; shorter runs stay as one entry per character.
+static vector<CharacterRange> consolidate_ranges(const set<uint32_t> &chars) {
+  vector<CharacterRange> result;
+  for (uint32_t c : chars) {
+    size_t size = result.size();
+    // The entry two back ends at c - 2. Because the characters are distinct
+    // and ascending, the last entry can only be c - 1, so c completes a run
+    // of three: fold the last two entries and c into one range.
+    if (size >= 2 && result[size - 2].max == (c - 2)) {
+      result.pop_back();
+      result.back().max = c;
+    } else if (size >= 1) {
+      CharacterRange &last = result.back();
+      // Only an entry that already spans several characters (min < max) is
+      // extended; a lone character followed by its successor stays separate.
+      if (last.min < last.max && last.max == (c - 1))
+        last.max = c;
+      else
+        result.push_back(c);
+    } else {
+      // First character: start with a single-character entry.
+      result.push_back(c);
+    }
+  }
+  return result;
+}
+
+// Constructs the empty set: the includes_all flag is off and both the
+// inclusion and exclusion lists start out empty.
+CharacterSet::CharacterSet()
+    : includes_all(false), included_chars(), excluded_chars() {}

 bool CharacterSet::operator==(const Rule &rule) const {
   const CharacterSet *other = dynamic_cast<const CharacterSet *>(&rule);
-  return other && (ranges == other->ranges);
+  // A rule of any other type is never equal; otherwise the flag and both
+  // character lists must all match.
+  if (!other)
+    return false;
+  if (includes_all != other->includes_all)
+    return false;
+  return included_chars == other->included_chars &&
+         excluded_chars == other->excluded_chars;
 }

 bool CharacterSet::operator<(const CharacterSet &other) const {
-  return ranges < other.ranges;
+  // Lexicographic order over (includes_all, included_chars, excluded_chars):
+  // sets without the includes_all flag sort before sets that have it.
+  if (includes_all != other.includes_all)
+    return other.includes_all;
+  if (included_chars != other.included_chars)
+    return included_chars < other.included_chars;
+  return excluded_chars < other.excluded_chars;
 }

 size_t CharacterSet::hash_code() const {
-  size_t result = std::hash<size_t>()(ranges.size());
-  for (auto &range : ranges) {
-    result ^= std::hash<unsigned char>()(range.min);
-    result ^= std::hash<unsigned char>()(range.max);
-  }
+  // XOR together the hash of the flag, the sizes of both character lists,
+  // and the hash of every listed character.
+  size_t result = hash<bool>()(includes_all) ^
+                  hash<size_t>()(included_chars.size()) ^
+                  hash<size_t>()(excluded_chars.size());
+  for (uint32_t included : included_chars)
+    result ^= hash<uint32_t>()(included);
+  for (uint32_t excluded : excluded_chars)
+    result ^= hash<uint32_t>()(excluded);
   return result;
 }

@ -44,97 +100,88 @@ rule_ptr CharacterSet::copy() const {

 string CharacterSet::to_string() const {
   string result("(char");
-  for (auto &range : ranges)
-    result += " " + range.to_string();
+  // Appends `label`, then each range preceded by a space, then ")".
+  auto append_group = [&result](const char *label,
+                                const vector<CharacterRange> &group) {
+    result += label;
+    for (const CharacterRange &range : group)
+      result += " " + range.to_string();
+    result += ")";
+  };
+
+  if (includes_all)
+    result += " include_all";
+  if (!included_chars.empty())
+    append_group(" (include", included_ranges());
+  if (!excluded_chars.empty())
+    append_group(" (exclude", excluded_ranges());
   return result + ")";
 }

-CharacterSet CharacterSet::complement() const {
-  CharacterSet result({ { 0, MAX_CHAR } });
-  result.remove_set(*this);
-  return result;
+// Turns this set into "every character" and returns *this for chaining.
+// Both character lists are cleared: entries left over from the previous
+// state would otherwise make this set compare unequal to (and hash
+// differently from) a freshly built include_all() set.
+CharacterSet &CharacterSet::include_all() {
+  includes_all = true;
+  included_chars.clear();
+  excluded_chars.clear();
+  return *this;
 }

-std::pair<CharacterSet, bool> CharacterSet::most_compact_representation()
-    const {
-  auto first_range = *ranges.begin();
-  if (first_range.min == 0 && first_range.max > 0) {
-    return { this->complement(), false };
-  } else {
-    return { *this, true };
-  }
+// Adds the characters min..max (inclusive) to this set and returns *this.
+// When includes_all is set that means dropping them from the exclusion
+// list; otherwise they are added to the inclusion list.
+CharacterSet &CharacterSet::include(uint32_t min, uint32_t max) {
+  const CharacterRange span(min, max);
+  if (!includes_all)
+    add_range(&included_chars, span);
+  else
+    remove_range(&excluded_chars, span);
+  return *this;
 }

-void add_range(CharacterSet *self, CharacterRange addition) {
-  set<CharacterRange> new_ranges;
-  for (auto range : self->ranges) {
-    bool is_adjacent = false;
-    if (range.min < addition.min && range.max >= addition.min - 1) {
-      is_adjacent = true;
-      addition.min = range.min;
-    }
-    if (range.max > addition.max && range.min <= addition.max + 1) {
-      is_adjacent = true;
-      addition.max = range.max;
-    }
-    if (!is_adjacent) {
-      new_ranges.insert(range);
-    }
-  }
-  new_ranges.insert(addition);
-  self->ranges = new_ranges;
+// Removes the characters min..max (inclusive) from this set and returns
+// *this. When includes_all is set that means adding them to the exclusion
+// list; otherwise they are dropped from the inclusion list.
+CharacterSet &CharacterSet::exclude(uint32_t min, uint32_t max) {
+  const CharacterRange span(min, max);
+  if (!includes_all)
+    remove_range(&included_chars, span);
+  else
+    add_range(&excluded_chars, span);
+  return *this;
 }

-CharacterSet remove_range(CharacterSet *self, CharacterRange range_to_remove) {
-  CharacterSet removed_set;
-  set<CharacterRange> new_ranges;
-  for (auto range : self->ranges) {
-    if (range_to_remove.min <= range.min) {
-      if (range_to_remove.max < range.min) {
-        new_ranges.insert(range);
-      } else if (range_to_remove.max < range.max) {
-        new_ranges.insert(CharacterRange(range_to_remove.max + 1, range.max));
-        add_range(&removed_set, CharacterRange(range.min, range_to_remove.max));
-      } else {
-        add_range(&removed_set, range);
-      }
-    } else if (range_to_remove.min <= range.max) {
-      if (range_to_remove.max < range.max) {
-        new_ranges.insert(CharacterRange(range.min, range_to_remove.min - 1));
-        new_ranges.insert(CharacterRange(range_to_remove.max + 1, range.max));
-        add_range(&removed_set, range_to_remove);
-      } else {
-        new_ranges.insert(CharacterRange(range.min, range_to_remove.min - 1));
-        add_range(&removed_set, CharacterRange(range_to_remove.min, range.max));
-      }
-    } else {
-      new_ranges.insert(range);
-    }
-  }
-  self->ranges = new_ranges;
-  return removed_set;
-}
+// Single-character convenience overload: same as include(c, c).
+CharacterSet &CharacterSet::include(uint32_t c) { return include(c, c); }

-bool CharacterSet::is_empty() const { return ranges.empty(); }
+// Single-character convenience overload: same as exclude(c, c).
+CharacterSet &CharacterSet::exclude(uint32_t c) { return exclude(c, c); }
+
+// True only when includes_all is off and no character is included. A set
+// with includes_all on is always reported as non-empty, whatever it excludes.
+bool CharacterSet::is_empty() const {
+  return !includes_all && included_chars.empty();
+}

 void CharacterSet::add_set(const CharacterSet &other) {
-  for (auto &other_range : other.ranges) {
-    add_range(this, other_range);
-  }
+  // In-place union with `other`. Either side may be a whitelist (includes_all
+  // off, members in included_chars) or a blacklist (includes_all on, holes in
+  // excluded_chars), so all four combinations are handled.
+  if (includes_all) {
+    if (other.includes_all) {
+      // (all - E1) | (all - E2) == all - (E1 & E2): a character stays
+      // excluded only if both sets exclude it.
+      set<uint32_t> still_excluded;
+      for (uint32_t c : excluded_chars)
+        if (other.excluded_chars.count(c))
+          still_excluded.insert(c);
+      excluded_chars = still_excluded;
+    } else {
+      // (all - E1) | I2 == all - (E1 - I2).
+      for (uint32_t c : other.included_chars)
+        excluded_chars.erase(c);
+    }
+  } else if (other.includes_all) {
+    // I1 | (all - E2) == all - (E2 - I1): this set becomes a blacklist.
+    set<uint32_t> still_excluded;
+    for (uint32_t c : other.excluded_chars)
+      if (!included_chars.count(c))
+        still_excluded.insert(c);
+    includes_all = true;
+    included_chars.clear();
+    excluded_chars = still_excluded;
+  } else {
+    // Two whitelists: plain union of the inclusion lists.
+    for (uint32_t c : other.included_chars)
+      included_chars.insert(c);
+  }
 }

 CharacterSet CharacterSet::remove_set(const CharacterSet &other) {
   CharacterSet result;
-  for (auto &other_range : other.ranges) {
-    auto removed_set = remove_range(this, other_range);
-    result.add_set(removed_set);
+  // Subtracts `other` from this set in place and returns the characters that
+  // were removed, i.e. the intersection of the two sets. The four branches
+  // cover each combination of includes_all on this set and on `other`.
+  if (includes_all) {
+    if (other.includes_all) {
+      // Both are blacklists. The intersection excludes everything either
+      // side excludes. What remains here is exactly the characters `other`
+      // excludes that this set did not, so this set turns into a whitelist.
+      result.includes_all = true;
+      result.excluded_chars = excluded_chars;
+      included_chars = add_chars(&result.excluded_chars, other.excluded_chars);
+      excluded_chars = {};
+      includes_all = false;
+    } else {
+      // Blacklist minus whitelist: exclude other's characters too. Those not
+      // already excluded are the ones actually removed.
+      result.included_chars = add_chars(&excluded_chars, other.included_chars);
+    }
+  } else {
+    if (other.includes_all) {
+      // Whitelist minus blacklist: only characters that `other` excludes
+      // survive here; the rest of the inclusion list is the intersection.
+      result.included_chars = included_chars;
+      included_chars =
+          remove_chars(&result.included_chars, other.excluded_chars);
+    } else {
+      // Two whitelists: erase the shared characters and return them.
+      result.included_chars =
+          remove_chars(&included_chars, other.included_chars);
+    }
   }
   return result;
 }

-CharacterSet CharacterSet::intersect(const CharacterSet &set) const {
-  CharacterSet copy = *this;
-  return copy.remove_set(set);
+// Returns included_chars as a list of ranges, as built by
+// consolidate_ranges().
+vector<CharacterRange> CharacterSet::included_ranges() const {
+  return consolidate_ranges(included_chars);
+}
+
+// Returns excluded_chars as a list of ranges, as built by
+// consolidate_ranges().
+vector<CharacterRange> CharacterSet::excluded_ranges() const {
+  return consolidate_ranges(excluded_chars);
 }

 void CharacterSet::accept(Visitor *visitor) const { visitor->visit(this); }
--- a/src/compiler/rules/character_set.h
+++ b/src/compiler/rules/character_set.h
@ -1,10 +1,11 @@
 #ifndef COMPILER_RULES_CHARACTER_SET_H_
 #define COMPILER_RULES_CHARACTER_SET_H_

-#include <initializer_list>
 #include <set>
+#include <stdint.h>
 #include <string>
 #include <utility>
+#include <vector>
 #include "compiler/rules/rule.h"
 #include "compiler/rules/character_range.h"

@ -14,8 +15,12 @@ namespace rules {
 class CharacterSet : public Rule {
 public:
  CharacterSet();
-  explicit CharacterSet(const std::set<CharacterRange> &ranges);
-  explicit CharacterSet(const std::initializer_list<CharacterRange> &ranges);
+
+  CharacterSet &include_all();
+  CharacterSet &include(uint32_t c);
+  CharacterSet &include(uint32_t min, uint32_t max);
+  CharacterSet &exclude(uint32_t c);
+  CharacterSet &exclude(uint32_t min, uint32_t max);

  bool operator==(const Rule &other) const;
  bool operator<(const CharacterSet &) const;
@ -26,12 +31,14 @@ class CharacterSet : public Rule {

  void add_set(const CharacterSet &other);
  CharacterSet remove_set(const CharacterSet &other);
-  CharacterSet complement() const;
-  CharacterSet intersect(const CharacterSet &) const;
-  std::pair<CharacterSet, bool> most_compact_representation() const;
  bool is_empty() const;

-  std::set<CharacterRange> ranges;
+  std::vector<CharacterRange> included_ranges() const;
+  std::vector<CharacterRange> excluded_ranges() const;
+
+  bool includes_all;
+  std::set<uint32_t> included_chars;
+  std::set<uint32_t> excluded_chars;
 };

 }  // namespace rules