Implement character set difference

2014-02-07 12:57:35 -08:00 · 2014-02-07 12:57:35 -08:00 · df3397f02c
commit df3397f02c
parent b94fa3ed35
7 changed files with 200 additions and 139 deletions
--- a/character_set_spec.cpp
+++ b/character_set_spec.cpp
@ -30,33 +30,63 @@ describe("character sets", []() {
    describe("computing unions", []() {
        it("works for disjoint sets", []() {
            CharacterSet set({ {'a', 'z'} }, true);
-            set.union_with(CharacterSet({ {'A', 'Z'} }, true));
+            set.add_set(CharacterSet({ {'A', 'Z'} }, true));
            AssertThat(set, Equals(CharacterSet({ {'a', 'z'}, {'A', 'Z'}, })));
        });
        
        it("works for sets with adjacent ranges", []() {
            CharacterSet set({ {'a', 'r'} }, true);
-            set.union_with(CharacterSet({ {'s', 'z'} }, true));
+            set.add_set(CharacterSet({ {'s', 'z'} }, true));
            AssertThat(set, Equals(CharacterSet({ {'a', 'z'} }, true)));

            set = CharacterSet({ 'c' });
            auto c = set.complement();
-            set.union_with(c);
+            set.add_set(c);
            AssertThat(set, Equals(CharacterSet({ {0, -1} }, true)));
        });
        
        it("works when the result becomes a continuous range", []() {
            CharacterSet set({ {'a', 'd'}, {'f', 'z'} }, true);
-            set.union_with(CharacterSet({ {'c', 'g'} }, true));
+            set.add_set(CharacterSet({ {'c', 'g'} }, true));
            AssertThat(set, Equals(CharacterSet({ {'a', 'z'} }, true)));
        });
        
        it("does nothing for the set of all characters", []() {
            CharacterSet set({ 'a' });
-            set.union_with(set.complement());
+            set.add_set(set.complement());
            AssertThat(set, Equals(CharacterSet({ {'\0', '\xff'} }, true)));
        });
    });
+    
+    // Specs for CharacterSet::remove_set.
+    describe("computing differences", []() {
+        // The removed set shares no characters with the original.
+        it("works for disjoint sets", []() {
+            CharacterSet lowercase({ {'a','z'} }, true);
+            CharacterSet uppercase({ {'A','Z'} }, true);
+            lowercase.remove_set(uppercase);
+            AssertThat(lowercase, Equals(CharacterSet({ {'a', 'z'} }, true)));
+        });
+        
+        // The removed range lies strictly inside the original range.
+        it("works when one set spans the other", []() {
+            CharacterSet outer({ {'a','z'} }, true);
+            CharacterSet inner({ {'d','s'} }, true);
+            outer.remove_set(inner);
+            AssertThat(outer, Equals(CharacterSet({ {'a', 'c'}, {'t', 'z'} })));
+        });
+        
+        // Partial overlap, checked from both sides.
+        it("works for sets that overlap", []() {
+            CharacterSet lower_half({ {'a','s'} }, true);
+            CharacterSet upper_half({ {'m','z'} }, true);
+            lower_half.remove_set(upper_half);
+            AssertThat(lower_half, Equals(CharacterSet({ {'a', 'l'} }, true)));
+
+            CharacterSet tail({ {'m','z'} }, true);
+            CharacterSet head({ {'a','s'} }, true);
+            tail.remove_set(head);
+            AssertThat(tail, Equals(CharacterSet({ {'t', 'z'} }, true)));
+        });
+        
+        // Both operands hold more than one range.
+        it("works for sets with multiple ranges", []() {
+            CharacterSet minuend({ {'a','d'}, {'m', 'z'} });
+            CharacterSet subtrahend({ {'c','o'}, {'s','x'} });
+            minuend.remove_set(subtrahend);
+            AssertThat(minuend, Equals(CharacterSet({ {'a', 'b'}, {'p','r'}, {'y','z'} })));
+        });
+    });
 });

 END_TEST
--- a/spec/fixtures/parsers/arithmetic.c
+++ b/spec/fixtures/parsers/arithmetic.c
@ -5,12 +5,12 @@ enum ts_symbol {
    ts_symbol_factor,
    ts_aux_token1,
    ts_symbol_plus,
+    ts_aux_token2,
    ts_symbol_number,
    ts_symbol_times,
-    ts_symbol_expression,
-    ts_symbol_variable,
-    ts_aux_token2,
    ts_symbol_term,
+    ts_symbol_variable,
+    ts_symbol_expression,
    ts_symbol___END__,
 };

@ -18,12 +18,12 @@ static const char *ts_symbol_names[] = {
    "factor",
    "token1",
    "plus",
+    "token2",
    "number",
    "times",
-    "expression",
-    "variable",
-    "token2",
    "term",
+    "variable",
+    "expression",
    "__END__",
 };

@ -73,11 +73,11 @@ static void ts_lex(TSParser *parser) {
                ADVANCE(8);
            LEX_ERROR(2, EXPECT({")", "+"}));
        case 10:
-            if ((LOOKAHEAD_CHAR() == '('))
-                ADVANCE(12);
            if (('A' <= LOOKAHEAD_CHAR() && LOOKAHEAD_CHAR() <= 'Z') ||
                ('a' <= LOOKAHEAD_CHAR() && LOOKAHEAD_CHAR() <= 'z'))
                ADVANCE(13);
+            if ((LOOKAHEAD_CHAR() == '('))
+                ADVANCE(12);
            if (('0' <= LOOKAHEAD_CHAR() && LOOKAHEAD_CHAR() <= '9'))
                ADVANCE(11);
            LEX_ERROR(4, EXPECT({"(", "0-9", "A-Z", "a-z"}));
@ -124,10 +124,10 @@ static TSParseResult ts_parse(const char *input) {
                    SHIFT(42);
                case ts_symbol_number:
                    SHIFT(41);
-                case ts_symbol_variable:
-                    SHIFT(41);
                case ts_symbol_term:
                    SHIFT(2);
+                case ts_symbol_variable:
+                    SHIFT(41);
                case ts_symbol_expression:
                    SHIFT(1);
                default:
@ -190,12 +190,12 @@ static TSParseResult ts_parse(const char *input) {
            switch (LOOKAHEAD_SYM()) {
                case ts_symbol_factor:
                    SHIFT(16);
-                case ts_symbol_expression:
-                    SHIFT(32);
                case ts_aux_token1:
                    SHIFT(13);
                case ts_symbol_number:
                    SHIFT(12);
+                case ts_symbol_expression:
+                    SHIFT(32);
                case ts_symbol_variable:
                    SHIFT(12);
                case ts_symbol_term:
@ -252,12 +252,12 @@ static TSParseResult ts_parse(const char *input) {
            switch (LOOKAHEAD_SYM()) {
                case ts_symbol_factor:
                    SHIFT(16);
-                case ts_symbol_expression:
-                    SHIFT(23);
                case ts_aux_token1:
                    SHIFT(13);
                case ts_symbol_number:
                    SHIFT(12);
+                case ts_symbol_expression:
+                    SHIFT(23);
                case ts_symbol_variable:
                    SHIFT(12);
                case ts_symbol_term:
@ -282,12 +282,12 @@ static TSParseResult ts_parse(const char *input) {
            switch (LOOKAHEAD_SYM()) {
                case ts_symbol_factor:
                    SHIFT(16);
-                case ts_symbol_expression:
-                    SHIFT(14);
                case ts_aux_token1:
                    SHIFT(13);
                case ts_symbol_number:
                    SHIFT(12);
+                case ts_symbol_expression:
+                    SHIFT(14);
                case ts_symbol_variable:
                    SHIFT(12);
                case ts_symbol_term:
@ -356,12 +356,12 @@ static TSParseResult ts_parse(const char *input) {
            switch (LOOKAHEAD_SYM()) {
                case ts_symbol_factor:
                    SHIFT(16);
-                case ts_symbol_expression:
-                    SHIFT(20);
                case ts_aux_token1:
                    SHIFT(13);
                case ts_symbol_number:
                    SHIFT(12);
+                case ts_symbol_expression:
+                    SHIFT(20);
                case ts_symbol_variable:
                    SHIFT(12);
                case ts_symbol_term:
@ -452,12 +452,12 @@ static TSParseResult ts_parse(const char *input) {
            switch (LOOKAHEAD_SYM()) {
                case ts_symbol_factor:
                    SHIFT(16);
-                case ts_symbol_expression:
-                    SHIFT(29);
                case ts_aux_token1:
                    SHIFT(13);
                case ts_symbol_number:
                    SHIFT(12);
+                case ts_symbol_expression:
+                    SHIFT(29);
                case ts_symbol_variable:
                    SHIFT(12);
                case ts_symbol_term:
@ -544,12 +544,12 @@ static TSParseResult ts_parse(const char *input) {
            switch (LOOKAHEAD_SYM()) {
                case ts_symbol_factor:
                    SHIFT(16);
-                case ts_symbol_expression:
-                    SHIFT(38);
                case ts_aux_token1:
                    SHIFT(13);
                case ts_symbol_number:
                    SHIFT(12);
+                case ts_symbol_expression:
+                    SHIFT(38);
                case ts_symbol_variable:
                    SHIFT(12);
                case ts_symbol_term:
@ -598,12 +598,12 @@ static TSParseResult ts_parse(const char *input) {
            switch (LOOKAHEAD_SYM()) {
                case ts_symbol_factor:
                    SHIFT(16);
-                case ts_symbol_expression:
-                    SHIFT(43);
                case ts_aux_token1:
                    SHIFT(13);
                case ts_symbol_number:
                    SHIFT(12);
+                case ts_symbol_expression:
+                    SHIFT(43);
                case ts_symbol_variable:
                    SHIFT(12);
                case ts_symbol_term:
@ -634,10 +634,10 @@ static TSParseResult ts_parse(const char *input) {
        case 45:
            SET_LEX_STATE(15);
            switch (LOOKAHEAD_SYM()) {
-                case ts_symbol_plus:
-                    REDUCE(ts_symbol_term, 1, COLLAPSE({0}));
                case ts_symbol___END__:
                    REDUCE(ts_symbol_term, 1, COLLAPSE({0}));
+                case ts_symbol_plus:
+                    REDUCE(ts_symbol_term, 1, COLLAPSE({0}));
                case ts_symbol_times:
                    SHIFT(46);
                default:
@ -672,12 +672,12 @@ static TSParseResult ts_parse(const char *input) {
            switch (LOOKAHEAD_SYM()) {
                case ts_symbol_factor:
                    SHIFT(16);
-                case ts_symbol_expression:
-                    SHIFT(49);
                case ts_aux_token1:
                    SHIFT(13);
                case ts_symbol_number:
                    SHIFT(12);
+                case ts_symbol_expression:
+                    SHIFT(49);
                case ts_symbol_variable:
                    SHIFT(12);
                case ts_symbol_term:
--- a/spec/fixtures/parsers/json.c
+++ b/spec/fixtures/parsers/json.c
@ -4,36 +4,36 @@
 enum ts_symbol {
    ts_aux_token6,
    ts_symbol_number,
-    ts_symbol_string,
-    ts_symbol_array,
-    ts_symbol_value,
-    ts_aux_token5,
-    ts_aux_repeat_helper1,
-    ts_aux_repeat_helper2,
-    ts_aux_token4,
-    ts_aux_token3,
-    ts_aux_token1,
    ts_symbol_object,
-    ts_aux_token2,
+    ts_aux_token5,
    ts_aux_token7,
+    ts_aux_token4,
+    ts_aux_repeat_helper2,
+    ts_aux_token1,
+    ts_aux_token3,
+    ts_symbol_value,
+    ts_symbol_string,
+    ts_aux_token2,
+    ts_symbol_array,
+    ts_aux_repeat_helper1,
    ts_symbol___END__,
 };

 static const char *ts_symbol_names[] = {
    "token6",
    "number",
-    "string",
-    "array",
-    "value",
-    "token5",
-    "repeat_helper1",
-    "repeat_helper2",
-    "token4",
-    "token3",
-    "token1",
    "object",
-    "token2",
+    "token5",
    "token7",
+    "token4",
+    "repeat_helper2",
+    "token1",
+    "token3",
+    "value",
+    "string",
+    "token2",
+    "array",
+    "repeat_helper1",
    "__END__",
 };

@ -79,10 +79,10 @@ static void ts_lex(TSParser *parser) {
        case 10:
            if ((LOOKAHEAD_CHAR() == '{'))
                ADVANCE(16);
-            if ((LOOKAHEAD_CHAR() == '\"'))
-                ADVANCE(12);
            if ((LOOKAHEAD_CHAR() == '['))
                ADVANCE(15);
+            if ((LOOKAHEAD_CHAR() == '\"'))
+                ADVANCE(12);
            if (('0' <= LOOKAHEAD_CHAR() && LOOKAHEAD_CHAR() <= '9'))
                ADVANCE(11);
            LEX_ERROR(4, EXPECT({"\"", "0-9", "[", "{"}));
@ -128,14 +128,14 @@ static TSParseResult ts_parse(const char *input) {
        case 0:
            SET_LEX_STATE(10);
            switch (LOOKAHEAD_SYM()) {
-                case ts_symbol_number:
-                    SHIFT(53);
                case ts_symbol_string:
                    SHIFT(53);
                case ts_symbol_array:
                    SHIFT(53);
                case ts_symbol_object:
                    SHIFT(53);
+                case ts_symbol_number:
+                    SHIFT(53);
                case ts_aux_token5:
                    SHIFT(47);
                case ts_aux_token1:
@ -156,14 +156,14 @@ static TSParseResult ts_parse(const char *input) {
        case 2:
            SET_LEX_STATE(10);
            switch (LOOKAHEAD_SYM()) {
-                case ts_symbol_number:
-                    SHIFT(25);
                case ts_symbol_string:
                    SHIFT(25);
                case ts_symbol_array:
                    SHIFT(25);
                case ts_symbol_object:
                    SHIFT(25);
+                case ts_symbol_number:
+                    SHIFT(25);
                case ts_aux_token5:
                    SHIFT(12);
                case ts_symbol_value:
@ -176,14 +176,14 @@ static TSParseResult ts_parse(const char *input) {
        case 3:
            SET_LEX_STATE(10);
            switch (LOOKAHEAD_SYM()) {
-                case ts_symbol_number:
-                    SHIFT(25);
                case ts_symbol_string:
                    SHIFT(25);
                case ts_symbol_array:
                    SHIFT(25);
                case ts_symbol_object:
                    SHIFT(25);
+                case ts_symbol_number:
+                    SHIFT(25);
                case ts_aux_token5:
                    SHIFT(12);
                case ts_symbol_value:
@ -226,18 +226,18 @@ static TSParseResult ts_parse(const char *input) {
        case 7:
            SET_LEX_STATE(10);
            switch (LOOKAHEAD_SYM()) {
-                case ts_symbol_number:
-                    SHIFT(43);
                case ts_symbol_string:
                    SHIFT(43);
-                case ts_aux_token5:
-                    SHIFT(35);
-                case ts_symbol_object:
-                    SHIFT(43);
                case ts_symbol_array:
                    SHIFT(43);
                case ts_symbol_value:
                    SHIFT(41);
+                case ts_symbol_object:
+                    SHIFT(43);
+                case ts_symbol_number:
+                    SHIFT(43);
+                case ts_aux_token5:
+                    SHIFT(35);
                case ts_aux_token1:
                    SHIFT(8);
                default:
@ -246,14 +246,14 @@ static TSParseResult ts_parse(const char *input) {
        case 8:
            SET_LEX_STATE(10);
            switch (LOOKAHEAD_SYM()) {
-                case ts_symbol_number:
-                    SHIFT(25);
                case ts_symbol_string:
                    SHIFT(25);
                case ts_symbol_array:
                    SHIFT(25);
                case ts_symbol_object:
                    SHIFT(25);
+                case ts_symbol_number:
+                    SHIFT(25);
                case ts_aux_token5:
                    SHIFT(12);
                case ts_symbol_value:
@ -312,18 +312,18 @@ static TSParseResult ts_parse(const char *input) {
        case 14:
            SET_LEX_STATE(10);
            switch (LOOKAHEAD_SYM()) {
-                case ts_symbol_number:
-                    SHIFT(25);
                case ts_symbol_string:
                    SHIFT(25);
-                case ts_aux_token5:
-                    SHIFT(12);
-                case ts_symbol_object:
-                    SHIFT(25);
                case ts_symbol_array:
                    SHIFT(25);
                case ts_symbol_value:
                    SHIFT(15);
+                case ts_symbol_object:
+                    SHIFT(25);
+                case ts_symbol_number:
+                    SHIFT(25);
+                case ts_aux_token5:
+                    SHIFT(12);
                case ts_aux_token1:
                    SHIFT(3);
                default:
@ -378,18 +378,18 @@ static TSParseResult ts_parse(const char *input) {
        case 20:
            SET_LEX_STATE(10);
            switch (LOOKAHEAD_SYM()) {
-                case ts_symbol_number:
-                    SHIFT(34);
                case ts_symbol_string:
                    SHIFT(34);
-                case ts_aux_token5:
-                    SHIFT(26);
-                case ts_symbol_object:
-                    SHIFT(34);
                case ts_symbol_array:
                    SHIFT(34);
                case ts_symbol_value:
                    SHIFT(32);
+                case ts_symbol_object:
+                    SHIFT(34);
+                case ts_symbol_number:
+                    SHIFT(34);
+                case ts_aux_token5:
+                    SHIFT(26);
                case ts_aux_token1:
                    SHIFT(21);
                default:
@ -398,14 +398,14 @@ static TSParseResult ts_parse(const char *input) {
        case 21:
            SET_LEX_STATE(10);
            switch (LOOKAHEAD_SYM()) {
-                case ts_symbol_number:
-                    SHIFT(25);
                case ts_symbol_string:
                    SHIFT(25);
                case ts_symbol_array:
                    SHIFT(25);
                case ts_symbol_object:
                    SHIFT(25);
+                case ts_symbol_number:
+                    SHIFT(25);
                case ts_aux_token5:
                    SHIFT(12);
                case ts_symbol_value:
@ -474,18 +474,18 @@ static TSParseResult ts_parse(const char *input) {
        case 28:
            SET_LEX_STATE(10);
            switch (LOOKAHEAD_SYM()) {
-                case ts_symbol_number:
-                    SHIFT(25);
                case ts_symbol_string:
                    SHIFT(25);
-                case ts_aux_token5:
-                    SHIFT(12);
-                case ts_symbol_object:
-                    SHIFT(25);
                case ts_symbol_array:
                    SHIFT(25);
                case ts_symbol_value:
                    SHIFT(29);
+                case ts_symbol_object:
+                    SHIFT(25);
+                case ts_symbol_number:
+                    SHIFT(25);
+                case ts_aux_token5:
+                    SHIFT(12);
                case ts_aux_token1:
                    SHIFT(3);
                default:
@ -524,10 +524,10 @@ static TSParseResult ts_parse(const char *input) {
        case 32:
            SET_LEX_STATE(9);
            switch (LOOKAHEAD_SYM()) {
-                case ts_aux_token7:
-                    REDUCE(ts_aux_repeat_helper1, 4, COLLAPSE({1, 0, 1, 0}));
                case ts_aux_token2:
                    SHIFT(18);
+                case ts_aux_token7:
+                    REDUCE(ts_aux_repeat_helper1, 4, COLLAPSE({1, 0, 1, 0}));
                case ts_aux_repeat_helper1:
                    SHIFT(33);
                default:
@ -570,18 +570,18 @@ static TSParseResult ts_parse(const char *input) {
        case 37:
            SET_LEX_STATE(10);
            switch (LOOKAHEAD_SYM()) {
-                case ts_symbol_number:
-                    SHIFT(25);
                case ts_symbol_string:
                    SHIFT(25);
-                case ts_aux_token5:
-                    SHIFT(12);
-                case ts_symbol_object:
-                    SHIFT(25);
                case ts_symbol_array:
                    SHIFT(25);
                case ts_symbol_value:
                    SHIFT(38);
+                case ts_symbol_object:
+                    SHIFT(25);
+                case ts_symbol_number:
+                    SHIFT(25);
+                case ts_aux_token5:
+                    SHIFT(12);
                case ts_aux_token1:
                    SHIFT(3);
                default:
@ -620,10 +620,10 @@ static TSParseResult ts_parse(const char *input) {
        case 41:
            SET_LEX_STATE(6);
            switch (LOOKAHEAD_SYM()) {
-                case ts_aux_token2:
-                    SHIFT(7);
                case ts_aux_token4:
                    REDUCE(ts_aux_repeat_helper2, 2, COLLAPSE({1, 0}));
+                case ts_aux_token2:
+                    SHIFT(7);
                case ts_aux_repeat_helper2:
                    SHIFT(42);
                default:
@ -694,18 +694,18 @@ static TSParseResult ts_parse(const char *input) {
        case 49:
            SET_LEX_STATE(10);
            switch (LOOKAHEAD_SYM()) {
-                case ts_symbol_number:
-                    SHIFT(25);
                case ts_symbol_string:
                    SHIFT(25);
-                case ts_aux_token5:
-                    SHIFT(12);
-                case ts_symbol_object:
-                    SHIFT(25);
                case ts_symbol_array:
                    SHIFT(25);
                case ts_symbol_value:
                    SHIFT(50);
+                case ts_symbol_object:
+                    SHIFT(25);
+                case ts_symbol_number:
+                    SHIFT(25);
+                case ts_aux_token5:
+                    SHIFT(12);
                case ts_aux_token1:
                    SHIFT(3);
                default:
--- a/src/compiler/generate_code/c_code.cpp
+++ b/src/compiler/generate_code/c_code.cpp
@ -162,14 +162,14 @@ namespace tree_sitter {
            string lex_error_call(const unordered_set<rules::CharacterSet> &expected_inputs) {
                rules::CharacterSet expected_set;
                for (auto &rule : expected_inputs)
-                    expected_set.union_with(rule);
+                    expected_set.add_set(rule);
                
                string result = "LEX_ERROR(" + to_string(expected_set.ranges.size()) + ", EXPECT({";
                bool started = false;
-                for (auto &ranges : expected_set.ranges) {
+                for (auto &range : expected_set.ranges) {
                    if (started) result += ", ";
                    started = true;
-                    result += "\"" + escape_string(ranges.to_string()) + "\"";
+                    result += "\"" + escape_string(range.to_string()) + "\"";
                }
                result += "}));";
                return result;
--- a/src/compiler/rules/character_set.cpp
+++ b/src/compiler/rules/character_set.cpp
@ -3,6 +3,7 @@
 using std::string;
 using std::hash;
 using std::set;
+using std::pair;

 namespace tree_sitter  {
    namespace rules {
@ -36,23 +37,12 @@ namespace tree_sitter  {
            }
        }
        
-        int CharacterRange::max_int() const {
-            return max == MAX_CHAR ? 255 : (int)max;
+        int max_int(const CharacterRange &range) {
+            return range.max == MAX_CHAR ? 255 : (int)range.max;
        }
        
-        int CharacterRange::min_int() const {
-            return (int)min;
-        }
-        
-        bool CharacterRange::is_adjacent(const CharacterRange &other) const {
-            return
-            (min_int() <= other.min_int() && max_int() >= (other.min_int() - 1)) || 
-            (min_int() <= (other.max_int() + 1) && max_int() >= other.max_int());
-        }
-        
-        void CharacterRange::add_range(const CharacterRange &other) {
-            if (other.min < min) min = other.min;
-            if (other.max_int() > max_int()) max = other.max;
+        int min_int(const CharacterRange &range) {
+            return (int)range.min;
        }
        
        string CharacterRange::to_string() const {
@ -101,7 +91,7 @@ namespace tree_sitter  {
                result.insert(CharacterRange(current_char, MAX_CHAR));
            return CharacterSet(result);
        }
-        
+                
        std::pair<CharacterSet, bool> CharacterSet::most_compact_representation() const {
            auto first_range = *ranges.begin();
            if (first_range.min == 0 && first_range.max > 0) {
@ -113,10 +103,26 @@ namespace tree_sitter  {
        
        void add_range(CharacterSet *self, CharacterRange new_range) {
            set<CharacterRange> new_ranges;
+
            for (auto range : self->ranges) {
-                if (range.is_adjacent(new_range)) {
-                    new_range.add_range(range);
-                } else {
+                auto new_min = min_int(new_range);
+                auto new_max = max_int(new_range);
+                bool is_adjacent = false;
+
+                if (min_int(range) < new_min) {
+                    if (max_int(range) >= new_min - 1) {
+                        is_adjacent = true;
+                        new_range.min = range.min;
+                    }
+                }
+                if (max_int(range) > new_max) {
+                    if (min_int(range) <= new_max + 1) {
+                        is_adjacent = true;
+                        new_range.max = range.max;
+                    }
+                }
+                
+                if (!is_adjacent) {
                    new_ranges.insert(range);
                }
            }
@ -124,12 +130,44 @@ namespace tree_sitter  {
            self->ranges = new_ranges;
        }
        
-        void CharacterSet::union_with(const CharacterSet &other) {
+        // Removes every character in `new_range` from `self`.
+        //
+        // Each existing range is compared with the removed range using the
+        // int bounds from min_int/max_int, and is then kept whole, trimmed on
+        // one side, split in two, or dropped. The rebuilt set of ranges
+        // replaces self->ranges at the end.
+        void remove_range(CharacterSet *self, CharacterRange new_range) {
+            set<CharacterRange> new_ranges;
+            auto new_min = min_int(new_range);
+            auto new_max = max_int(new_range);
+
+            for (auto range : self->ranges) {
+                if (new_min <= min_int(range)) {
+                    // The removed range starts at or before this range.
+                    if (new_max < min_int(range)) {
+                        // It also ends before this range: no overlap.
+                        new_ranges.insert(range);
+                    } else if (new_max < max_int(range)) {
+                        // It cuts off the front; keep the tail.
+                        // The comparison is strict: when new_max equals the
+                        // range's max the whole range is removed, and building
+                        // (new_max + 1, range.max) would give a range whose
+                        // min exceeds its max (or a value that does not fit
+                        // in a char when new_max is 255).
+                        new_ranges.insert(CharacterRange(new_max + 1, range.max));
+                    }
+                    // Otherwise this range is fully covered and is dropped.
+                } else if (new_min <= max_int(range)) {
+                    // The removed range starts inside this range.
+                    if (new_max < max_int(range)) {
+                        // It also ends inside: keep the pieces on both sides.
+                        new_ranges.insert(CharacterRange(range.min, new_min - 1));
+                        new_ranges.insert(CharacterRange(new_max + 1, range.max));
+                    } else {
+                        // It runs to or past the end: keep only the head.
+                        new_ranges.insert(CharacterRange(range.min, new_min - 1));
+                    }
+                } else {
+                    // The removed range starts after this range: no overlap.
+                    new_ranges.insert(range);
+                }
+            }
+            self->ranges = new_ranges;
+        }
+        
+        // Adds every range of `other` to this set by calling add_range once
+        // per range.
+        void CharacterSet::add_set(const CharacterSet &other) {
             for (auto &other_range : other.ranges) {
                 add_range(this, other_range);
             }
         }
        
+        // Removes every range of `other` from this set by calling
+        // remove_range once per range.
+        void CharacterSet::remove_set(const CharacterSet &other) {
+            for (auto it = other.ranges.begin(); it != other.ranges.end(); ++it)
+                remove_range(this, *it);
+        }
+        
        void CharacterSet::accept(Visitor &visitor) const {
            visitor.visit(this);
        }
--- a/src/compiler/rules/character_set.h
+++ b/src/compiler/rules/character_set.h
@ -9,19 +9,10 @@ namespace tree_sitter  {
        struct CharacterRange {
            char min;
            char max;
-            
            CharacterRange(char);
            CharacterRange(char, char);
-            
-            int max_int() const;
-            int min_int() const;
-
            bool operator==(const CharacterRange &) const;
            bool operator<(const CharacterRange &) const;
-            bool is_adjacent(const CharacterRange &) const;
-            
-            void add_range(const CharacterRange &);
-            
            std::string to_string() const;
        };
    }
@ -45,8 +36,10 @@ namespace tree_sitter  {
            CharacterSet(const std::set<CharacterRange> &ranges, bool);
            
            CharacterSet complement() const;
-            void union_with(const CharacterSet &other);
            std::pair<CharacterSet, bool> most_compact_representation() const;
+
+            void add_set(const CharacterSet &other);
+            void remove_set(const CharacterSet &other);
            
            bool operator==(const Rule& other) const;
            size_t hash_code() const;
--- a/src/compiler/rules/pattern.cpp
+++ b/src/compiler/rules/pattern.cpp
@ -75,7 +75,7 @@ namespace tree_sitter {
                }
                CharacterSet result;
                while (has_more_input() && (peek() != ']'))
-                    result.union_with(single_char());
+                    result.add_set(single_char());
                return is_affirmative ? result : result.complement();
            }