diff --git a/src/compiler/generate_code/c_code.cc b/src/compiler/generate_code/c_code.cc index b2359233..1d5ff690 100644 --- a/src/compiler/generate_code/c_code.cc +++ b/src/compiler/generate_code/c_code.cc @@ -401,52 +401,106 @@ class CCodeGenerator { add_accept_token_action(lex_state.accept_action); } + set ruled_out_characters; for (const auto &pair : lex_state.advance_actions) { - if (!pair.first.is_empty()) { - _if([&]() { add_character_set_condition(pair.first); }, - [&]() { add_advance_action(pair.second); }); + if (pair.first.is_empty()) continue; + + size_t current_length = buffer.size(); + + line("if ("); + if (add_character_set_condition(pair.first, ruled_out_characters)) { + add(")"); + indent([&]() { add_advance_action(pair.second); }); + ruled_out_characters.insert(pair.first.included_chars.begin(), pair.first.included_chars.end()); + } else { + buffer.resize(current_length); + add_advance_action(pair.second); } } line("END_STATE();"); } - void add_character_set_condition(const rules::CharacterSet &rule) { + bool add_character_set_condition(const rules::CharacterSet &rule, const set &ruled_out_characters) { if (rule.includes_all) { - add("!("); - add_character_range_conditions(rule.excluded_ranges()); - add(")"); + return add_character_range_conditions(rule.excluded_ranges(), ruled_out_characters, true); } else { - add_character_range_conditions(rule.included_ranges()); + return add_character_range_conditions(rule.included_ranges(), ruled_out_characters, false); } } - void add_character_range_conditions(const vector &ranges) { - if (ranges.size() == 1) { - add_character_range_condition(*ranges.begin()); - } else { - bool first = true; - for (const auto &range : ranges) { - if (!first) { - add(" ||"); - line(" "); + bool add_character_range_conditions(const vector &ranges, + const set &ruled_out_characters, + bool is_negated) { + bool first = true; + for (auto iter = ranges.begin(), end = ranges.end(); iter != end;) { + auto range = *iter; + + bool range_is_ruled_out = true; + for (uint32_t c = range.min; c <= range.max; c++) { + if (!ruled_out_characters.count(c)) { + range_is_ruled_out = false; + break; + } + } + + if (range_is_ruled_out) { + ++iter; + continue; + } + + auto next_iter = iter + 1; + while (next_iter != end) { + bool can_join_ranges = true; + for (uint32_t character = range.max + 1; character < next_iter->min; character++) { + if (!ruled_out_characters.count(character)) { + can_join_ranges = false; + break; + } } - add("("); - add_character_range_condition(range); - add(")"); - - first = false; + if (can_join_ranges) { + range.max = next_iter->max; + ++next_iter; + } else { + break; + } } + + if (!first) { + add(is_negated ? " &&" : " ||"); + line(" "); + } + + add_character_range_condition(range, is_negated); + first = false; + iter = next_iter; } + + return !first; } - void add_character_range_condition(const rules::CharacterRange &range) { - if (range.min == range.max) { - add("lookahead == " + escape_char(range.min)); + void add_character_range_condition(const rules::CharacterRange &range, bool is_negated) { + auto min = escape_char(range.min); + auto max = escape_char(range.max); + if (is_negated) { + if (range.max == range.min) { + add("lookahead != " + min); + } else if (range.max == range.min + 1) { + add("lookahead != " + min + " &&"); + line(" lookahead != " + max); + } else { + add("(lookahead < " + min + " || lookahead > " + max + ")"); + } } else { - add(escape_char(range.min) + string(" <= lookahead && lookahead <= ") + - escape_char(range.max)); + if (range.max == range.min) { + add("lookahead == " + min); + } else if (range.max == range.min + 1) { + add("lookahead == " + min + " ||"); + line(" lookahead == " + max); + } else { + add("(" + min + " <= lookahead && lookahead <= " + max + ")"); + } } } @@ -599,13 +653,6 @@ class CCodeGenerator { indent(body); } - void _if(function condition, function body) { - line("if ("); - indent(condition); - add(")"); - indent(body); - } - string sanitize_name_for_string(string name) { util::str_replace(&name, "\\", "\\\\"); util::str_replace(&name, "\n", "\\n"); diff --git a/src/compiler/rules/character_set.cc b/src/compiler/rules/character_set.cc index 5b0c3464..c199368d 100644 --- a/src/compiler/rules/character_set.cc +++ b/src/compiler/rules/character_set.cc @@ -38,20 +38,11 @@ static set add_chars(set *left, const set &right) return result; } -static vector consolidate_ranges(const set &chars) { +static vector consolidate_ranges(const set &characters) { vector result; - for (uint32_t c : chars) { - auto size = result.size(); - if (size >= 2 && result[size - 2].max == (c - 2)) { - result.pop_back(); + for (uint32_t c : characters) { + if (!result.empty() && result.back().max == c - 1) { result.back().max = c; - } else if (size >= 1) { - CharacterRange &last = result.back(); - if (last.min < last.max && last.max == (c - 1)) { - last.max = c; - } else { - result.push_back(CharacterRange(c)); - } } else { result.push_back(CharacterRange(c)); } @@ -70,15 +61,17 @@ bool CharacterSet::operator==(const CharacterSet &other) const { } bool CharacterSet::operator<(const CharacterSet &other) const { - if (!includes_all && other.includes_all) - return true; - if (includes_all && !other.includes_all) - return false; - if (included_chars < other.included_chars) - return true; - if (other.included_chars < included_chars) - return false; - return excluded_chars < other.excluded_chars; + if (!includes_all && other.includes_all) return true; + if (includes_all && !other.includes_all) return false; + if (includes_all) { + if (excluded_chars.size() > other.excluded_chars.size()) return true; + if (excluded_chars.size() < other.excluded_chars.size()) return false; + return excluded_chars < other.excluded_chars; + } else { + if (included_chars.size() < other.included_chars.size()) return true; + if (included_chars.size() > other.included_chars.size()) return false; + return included_chars < other.included_chars; + } } CharacterSet &CharacterSet::include_all() { @@ -131,8 +124,7 @@ void CharacterSet::add_set(const CharacterSet &other) { excluded_chars.insert(c); included_chars.clear(); } else { - for (uint32_t c : other.included_chars) - included_chars.insert(c); + included_chars.insert(other.included_chars.begin(), other.included_chars.end()); } } } diff --git a/test/compiler/rules/character_set_test.cc b/test/compiler/rules/character_set_test.cc index f7c2e632..dfe67604 100644 --- a/test/compiler/rules/character_set_test.cc +++ b/test/compiler/rules/character_set_test.cc @@ -305,29 +305,17 @@ describe("CharacterSet", []() { }); describe("::included_ranges", [&]() { - it("consolidates sequences of 3 or more consecutive characters into ranges", [&]() { + it("consolidates consecutive sequences of characters into ranges", [&]() { CharacterSet set1 = CharacterSet() .include('a', 'c') - .include('g') + .include('e', 'j') + .include('m') .include('z'); AssertThat(set1.included_ranges(), Equals(vector({ CharacterRange{'a', 'c'}, - CharacterRange('g'), - CharacterRange('z'), - }))); - }); - - it("doesn't consolidate sequences of 2 consecutive characters", [&]() { - CharacterSet set1 = CharacterSet() - .include('a', 'b') - .include('g') - .include('z'); - - AssertThat(set1.included_ranges(), Equals(vector({ - CharacterRange('a'), - CharacterRange('b'), - CharacterRange('g'), + CharacterRange{'e', 'j'}, + CharacterRange('m'), CharacterRange('z'), }))); });