Avoid redundant character comparisons in generated lex function

This commit is contained in:
Max Brunsfeld 2017-07-10 14:09:31 -07:00
parent 2755b07222
commit 59236d2ed1
3 changed files with 101 additions and 74 deletions

View file

@ -401,52 +401,106 @@ class CCodeGenerator {
add_accept_token_action(lex_state.accept_action);
}
set<uint32_t> ruled_out_characters;
for (const auto &pair : lex_state.advance_actions) {
if (!pair.first.is_empty()) {
_if([&]() { add_character_set_condition(pair.first); },
[&]() { add_advance_action(pair.second); });
if (pair.first.is_empty()) continue;
size_t current_length = buffer.size();
line("if (");
if (add_character_set_condition(pair.first, ruled_out_characters)) {
add(")");
indent([&]() { add_advance_action(pair.second); });
ruled_out_characters.insert(pair.first.included_chars.begin(), pair.first.included_chars.end());
} else {
buffer.resize(current_length);
add_advance_action(pair.second);
}
}
line("END_STATE();");
}
// Writes the condition that tests `lookahead` against the character set `rule`.
// NOTE(review): two signatures and two sets of body statements for this function
// appear back to back here (a void one-argument form and a bool two-argument
// form); presumably these are the before/after lines of the change shown
// without diff markers -- confirm against the real file.
void add_character_set_condition(const rules::CharacterSet &rule) {
// Two-argument form: forwards the bool returned by
// add_character_range_conditions to the caller.
bool add_character_set_condition(const rules::CharacterSet &rule, const set<uint32_t> &ruled_out_characters) {
if (rule.includes_all) {
// One-argument form: wraps the excluded-range test in `!( ... )`.
add("!(");
add_character_range_conditions(rule.excluded_ranges());
add(")");
// Two-argument form: passes the excluded ranges with is_negated = true instead
// of wrapping the output in `!( ... )`.
return add_character_range_conditions(rule.excluded_ranges(), ruled_out_characters, true);
} else {
// Set lists its members explicitly: test the included ranges directly.
add_character_range_conditions(rule.included_ranges());
return add_character_range_conditions(rule.included_ranges(), ruled_out_characters, false);
}
}
// Writes the conditions for a list of character ranges.
// NOTE(review): statements from a one-argument void form and a three-argument
// bool form are interleaved below; presumably the before/after lines of the
// change shown without diff markers -- confirm against the real file.
//
// One-argument form: a single range is written bare; several ranges are each
// wrapped in parentheses and joined with " ||".
void add_character_range_conditions(const vector<rules::CharacterRange> &ranges) {
if (ranges.size() == 1) {
add_character_range_condition(*ranges.begin());
} else {
bool first = true;
for (const auto &range : ranges) {
if (!first) {
add(" ||");
line(" ");
// Three-argument form.
//   ranges               - the ranges to test, visited in order
//   ruled_out_characters - characters consulted when skipping and merging ranges
//   is_negated           - selects " &&" instead of " ||" as the joiner and is
//                          forwarded to add_character_range_condition
// Returns true when at least one range condition was written.
bool add_character_range_conditions(const vector<rules::CharacterRange> &ranges,
const set<uint32_t> &ruled_out_characters,
bool is_negated) {
bool first = true;
// The iterator is advanced inside the body because one pass may consume
// several input ranges.
for (auto iter = ranges.begin(), end = ranges.end(); iter != end;) {
// Local copy: `range.max` may be extended below when ranges are merged.
auto range = *iter;
// A range is skipped only when every one of its characters is ruled out.
bool range_is_ruled_out = true;
for (uint32_t c = range.min; c <= range.max; c++) {
if (!ruled_out_characters.count(c)) {
range_is_ruled_out = false;
break;
}
}
if (range_is_ruled_out) {
++iter;
continue;
}
// Absorb following ranges while every character in the gap between the
// current `range.max` and the next range's `min` is ruled out.
auto next_iter = iter + 1;
while (next_iter != end) {
bool can_join_ranges = true;
for (uint32_t character = range.max + 1; character < next_iter->min; character++) {
if (!ruled_out_characters.count(character)) {
can_join_ranges = false;
break;
}
}
// The next four statements use the one-argument call and appear to belong to
// the one-argument form's loop body rather than to this merge loop.
add("(");
add_character_range_condition(range);
add(")");
first = false;
if (can_join_ranges) {
range.max = next_iter->max;
++next_iter;
} else {
break;
}
}
// Joiner goes before every condition except the first one written.
if (!first) {
add(is_negated ? " &&" : " ||");
line(" ");
}
add_character_range_condition(range, is_negated);
first = false;
// Resume after the last range that was merged into `range`.
iter = next_iter;
}
// `first` is still true only if nothing was written.
return !first;
}
// Writes the comparison of `lookahead` against a single character range.
// NOTE(review): a one-argument form and a two-argument form are interleaved
// below; presumably the before/after lines of the change shown without diff
// markers -- confirm against the real file.
//
// One-argument form: equality for a single character, otherwise
// `min <= lookahead && lookahead <= max`.
void add_character_range_condition(const rules::CharacterRange &range) {
if (range.min == range.max) {
add("lookahead == " + escape_char(range.min));
// Two-argument form: `is_negated` selects the "not in range" spelling.
void add_character_range_condition(const rules::CharacterRange &range, bool is_negated) {
auto min = escape_char(range.min);
auto max = escape_char(range.max);
if (is_negated) {
if (range.max == range.min) {
// Single character.
add("lookahead != " + min);
} else if (range.max == range.min + 1) {
// Exactly two adjacent characters: two inequality tests joined by &&.
add("lookahead != " + min + " &&");
line(" lookahead != " + max);
} else {
// Wider range: parenthesized "outside the bounds" test.
add("(lookahead < " + min + " || lookahead > " + max + ")");
}
} else {
// One-argument form's range test (continues the `add(` call on two lines).
add(escape_char(range.min) + string(" <= lookahead && lookahead <= ") +
escape_char(range.max));
if (range.max == range.min) {
// Single character.
add("lookahead == " + min);
} else if (range.max == range.min + 1) {
// Exactly two adjacent characters: two equality tests joined by ||.
add("lookahead == " + min + " ||");
line(" lookahead == " + max);
} else {
// Wider range: parenthesized inclusive bounds test.
add("(" + min + " <= lookahead && lookahead <= " + max + ")");
}
}
}
@ -599,13 +653,6 @@ class CCodeGenerator {
indent(body);
}
// Writes an `if` statement from two callbacks: starts a line with "if (",
// runs `condition` under indent(), appends ")", then runs `body` under indent().
// (`line`, `add` and `indent` are helpers defined elsewhere in this class.)
void _if(function<void()> condition, function<void()> body) {
line("if (");
indent(condition);
add(")");
indent(body);
}
string sanitize_name_for_string(string name) {
util::str_replace(&name, "\\", "\\\\");
util::str_replace(&name, "\n", "\\n");

View file

@ -38,20 +38,11 @@ static set<uint32_t> add_chars(set<uint32_t> *left, const set<uint32_t> &right)
return result;
}
static vector<CharacterRange> consolidate_ranges(const set<uint32_t> &chars) {
static vector<CharacterRange> consolidate_ranges(const set<uint32_t> &characters) {
vector<CharacterRange> result;
for (uint32_t c : chars) {
auto size = result.size();
if (size >= 2 && result[size - 2].max == (c - 2)) {
result.pop_back();
for (uint32_t c : characters) {
if (!result.empty() && result.back().max == c - 1) {
result.back().max = c;
} else if (size >= 1) {
CharacterRange &last = result.back();
if (last.min < last.max && last.max == (c - 1)) {
last.max = c;
} else {
result.push_back(CharacterRange(c));
}
} else {
result.push_back(CharacterRange(c));
}
@ -70,15 +61,17 @@ bool CharacterSet::operator==(const CharacterSet &other) const {
}
// Ordering for CharacterSet. Sets without `includes_all` always order before
// sets with it.
// NOTE(review): two complete bodies follow one another below; presumably the
// before/after lines of the change shown without diff markers -- confirm
// against the real file.
bool CharacterSet::operator<(const CharacterSet &other) const {
// First body: after the includes_all check, compare included_chars, then
// fall back to excluded_chars.
if (!includes_all && other.includes_all)
return true;
if (includes_all && !other.includes_all)
return false;
if (included_chars < other.included_chars)
return true;
if (other.included_chars < included_chars)
return false;
return excluded_chars < other.excluded_chars;
// Second body: after the includes_all check, compare by element count before
// comparing contents.
if (!includes_all && other.includes_all) return true;
if (includes_all && !other.includes_all) return false;
if (includes_all) {
// Both include everything: the set with MORE excluded characters orders first;
// equal counts fall back to comparing excluded_chars.
if (excluded_chars.size() > other.excluded_chars.size()) return true;
if (excluded_chars.size() < other.excluded_chars.size()) return false;
return excluded_chars < other.excluded_chars;
} else {
// Neither includes everything: the set with FEWER included characters orders
// first; equal counts fall back to comparing included_chars.
if (included_chars.size() < other.included_chars.size()) return true;
if (included_chars.size() > other.included_chars.size()) return false;
return included_chars < other.included_chars;
}
}
CharacterSet &CharacterSet::include_all() {
@ -131,8 +124,7 @@ void CharacterSet::add_set(const CharacterSet &other) {
excluded_chars.insert(c);
included_chars.clear();
} else {
for (uint32_t c : other.included_chars)
included_chars.insert(c);
included_chars.insert(other.included_chars.begin(), other.included_chars.end());
}
}
}

View file

@ -305,29 +305,17 @@ describe("CharacterSet", []() {
});
describe("::included_ranges", [&]() {
it("consolidates sequences of 3 or more consecutive characters into ranges", [&]() {
it("consolidates consecutive sequences of characters into ranges", [&]() {
CharacterSet set1 = CharacterSet()
.include('a', 'c')
.include('g')
.include('e', 'j')
.include('m')
.include('z');
AssertThat(set1.included_ranges(), Equals(vector<CharacterRange>({
CharacterRange{'a', 'c'},
CharacterRange('g'),
CharacterRange('z'),
})));
});
it("doesn't consolidate sequences of 2 consecutive characters", [&]() {
CharacterSet set1 = CharacterSet()
.include('a', 'b')
.include('g')
.include('z');
AssertThat(set1.included_ranges(), Equals(vector<CharacterRange>({
CharacterRange('a'),
CharacterRange('b'),
CharacterRange('g'),
CharacterRange{'e', 'j'},
CharacterRange('m'),
CharacterRange('z'),
})));
});