Avoid redundant character comparisons in generated lex function
This commit is contained in:
parent
2755b07222
commit
59236d2ed1
3 changed files with 101 additions and 74 deletions
|
|
@@ -401,52 +401,106 @@ class CCodeGenerator {
|
|||
add_accept_token_action(lex_state.accept_action);
|
||||
}
|
||||
|
||||
set<uint32_t> ruled_out_characters;
|
||||
for (const auto &pair : lex_state.advance_actions) {
|
||||
if (!pair.first.is_empty()) {
|
||||
_if([&]() { add_character_set_condition(pair.first); },
|
||||
[&]() { add_advance_action(pair.second); });
|
||||
if (pair.first.is_empty()) continue;
|
||||
|
||||
size_t current_length = buffer.size();
|
||||
|
||||
line("if (");
|
||||
if (add_character_set_condition(pair.first, ruled_out_characters)) {
|
||||
add(")");
|
||||
indent([&]() { add_advance_action(pair.second); });
|
||||
ruled_out_characters.insert(pair.first.included_chars.begin(), pair.first.included_chars.end());
|
||||
} else {
|
||||
buffer.resize(current_length);
|
||||
add_advance_action(pair.second);
|
||||
}
|
||||
}
|
||||
|
||||
line("END_STATE();");
|
||||
}
|
||||
|
||||
void add_character_set_condition(const rules::CharacterSet &rule) {
|
||||
bool add_character_set_condition(const rules::CharacterSet &rule, const set<uint32_t> &ruled_out_characters) {
|
||||
if (rule.includes_all) {
|
||||
add("!(");
|
||||
add_character_range_conditions(rule.excluded_ranges());
|
||||
add(")");
|
||||
return add_character_range_conditions(rule.excluded_ranges(), ruled_out_characters, true);
|
||||
} else {
|
||||
add_character_range_conditions(rule.included_ranges());
|
||||
return add_character_range_conditions(rule.included_ranges(), ruled_out_characters, false);
|
||||
}
|
||||
}
|
||||
|
||||
void add_character_range_conditions(const vector<rules::CharacterRange> &ranges) {
|
||||
if (ranges.size() == 1) {
|
||||
add_character_range_condition(*ranges.begin());
|
||||
} else {
|
||||
bool first = true;
|
||||
for (const auto &range : ranges) {
|
||||
if (!first) {
|
||||
add(" ||");
|
||||
line(" ");
|
||||
bool add_character_range_conditions(const vector<rules::CharacterRange> &ranges,
|
||||
const set<uint32_t> &ruled_out_characters,
|
||||
bool is_negated) {
|
||||
bool first = true;
|
||||
for (auto iter = ranges.begin(), end = ranges.end(); iter != end;) {
|
||||
auto range = *iter;
|
||||
|
||||
bool range_is_ruled_out = true;
|
||||
for (uint32_t c = range.min; c <= range.max; c++) {
|
||||
if (!ruled_out_characters.count(c)) {
|
||||
range_is_ruled_out = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (range_is_ruled_out) {
|
||||
++iter;
|
||||
continue;
|
||||
}
|
||||
|
||||
auto next_iter = iter + 1;
|
||||
while (next_iter != end) {
|
||||
bool can_join_ranges = true;
|
||||
for (uint32_t character = range.max + 1; character < next_iter->min; character++) {
|
||||
if (!ruled_out_characters.count(character)) {
|
||||
can_join_ranges = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
add("(");
|
||||
add_character_range_condition(range);
|
||||
add(")");
|
||||
|
||||
first = false;
|
||||
if (can_join_ranges) {
|
||||
range.max = next_iter->max;
|
||||
++next_iter;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!first) {
|
||||
add(is_negated ? " &&" : " ||");
|
||||
line(" ");
|
||||
}
|
||||
|
||||
add_character_range_condition(range, is_negated);
|
||||
first = false;
|
||||
iter = next_iter;
|
||||
}
|
||||
|
||||
return !first;
|
||||
}
|
||||
|
||||
void add_character_range_condition(const rules::CharacterRange &range) {
|
||||
if (range.min == range.max) {
|
||||
add("lookahead == " + escape_char(range.min));
|
||||
void add_character_range_condition(const rules::CharacterRange &range, bool is_negated) {
|
||||
auto min = escape_char(range.min);
|
||||
auto max = escape_char(range.max);
|
||||
if (is_negated) {
|
||||
if (range.max == range.min) {
|
||||
add("lookahead != " + min);
|
||||
} else if (range.max == range.min + 1) {
|
||||
add("lookahead != " + min + " &&");
|
||||
line(" lookahead != " + max);
|
||||
} else {
|
||||
add("(lookahead < " + min + " || lookahead > " + max + ")");
|
||||
}
|
||||
} else {
|
||||
add(escape_char(range.min) + string(" <= lookahead && lookahead <= ") +
|
||||
escape_char(range.max));
|
||||
if (range.max == range.min) {
|
||||
add("lookahead == " + min);
|
||||
} else if (range.max == range.min + 1) {
|
||||
add("lookahead == " + min + " ||");
|
||||
line(" lookahead == " + max);
|
||||
} else {
|
||||
add("(" + min + " <= lookahead && lookahead <= " + max + ")");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@@ -599,13 +653,6 @@ class CCodeGenerator {
|
|||
indent(body);
|
||||
}
|
||||
|
||||
void _if(function<void()> condition, function<void()> body) {
|
||||
line("if (");
|
||||
indent(condition);
|
||||
add(")");
|
||||
indent(body);
|
||||
}
|
||||
|
||||
string sanitize_name_for_string(string name) {
|
||||
util::str_replace(&name, "\\", "\\\\");
|
||||
util::str_replace(&name, "\n", "\\n");
|
||||
|
|
|
|||
|
|
@@ -38,20 +38,11 @@ static set<uint32_t> add_chars(set<uint32_t> *left, const set<uint32_t> &right)
|
|||
return result;
|
||||
}
|
||||
|
||||
static vector<CharacterRange> consolidate_ranges(const set<uint32_t> &chars) {
|
||||
static vector<CharacterRange> consolidate_ranges(const set<uint32_t> &characters) {
|
||||
vector<CharacterRange> result;
|
||||
for (uint32_t c : chars) {
|
||||
auto size = result.size();
|
||||
if (size >= 2 && result[size - 2].max == (c - 2)) {
|
||||
result.pop_back();
|
||||
for (uint32_t c : characters) {
|
||||
if (!result.empty() && result.back().max == c - 1) {
|
||||
result.back().max = c;
|
||||
} else if (size >= 1) {
|
||||
CharacterRange &last = result.back();
|
||||
if (last.min < last.max && last.max == (c - 1)) {
|
||||
last.max = c;
|
||||
} else {
|
||||
result.push_back(CharacterRange(c));
|
||||
}
|
||||
} else {
|
||||
result.push_back(CharacterRange(c));
|
||||
}
|
||||
|
|
@@ -70,15 +61,17 @@ bool CharacterSet::operator==(const CharacterSet &other) const {
|
|||
}
|
||||
|
||||
bool CharacterSet::operator<(const CharacterSet &other) const {
|
||||
if (!includes_all && other.includes_all)
|
||||
return true;
|
||||
if (includes_all && !other.includes_all)
|
||||
return false;
|
||||
if (included_chars < other.included_chars)
|
||||
return true;
|
||||
if (other.included_chars < included_chars)
|
||||
return false;
|
||||
return excluded_chars < other.excluded_chars;
|
||||
if (!includes_all && other.includes_all) return true;
|
||||
if (includes_all && !other.includes_all) return false;
|
||||
if (includes_all) {
|
||||
if (excluded_chars.size() > other.excluded_chars.size()) return true;
|
||||
if (excluded_chars.size() < other.excluded_chars.size()) return false;
|
||||
return excluded_chars < other.excluded_chars;
|
||||
} else {
|
||||
if (included_chars.size() < other.included_chars.size()) return true;
|
||||
if (included_chars.size() > other.included_chars.size()) return false;
|
||||
return included_chars < other.included_chars;
|
||||
}
|
||||
}
|
||||
|
||||
CharacterSet &CharacterSet::include_all() {
|
||||
|
|
@@ -131,8 +124,7 @@ void CharacterSet::add_set(const CharacterSet &other) {
|
|||
excluded_chars.insert(c);
|
||||
included_chars.clear();
|
||||
} else {
|
||||
for (uint32_t c : other.included_chars)
|
||||
included_chars.insert(c);
|
||||
included_chars.insert(other.included_chars.begin(), other.included_chars.end());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@@ -305,29 +305,17 @@ describe("CharacterSet", []() {
|
|||
});
|
||||
|
||||
describe("::included_ranges", [&]() {
|
||||
it("consolidates sequences of 3 or more consecutive characters into ranges", [&]() {
|
||||
it("consolidates consecutive sequences of characters into ranges", [&]() {
|
||||
CharacterSet set1 = CharacterSet()
|
||||
.include('a', 'c')
|
||||
.include('g')
|
||||
.include('e', 'j')
|
||||
.include('m')
|
||||
.include('z');
|
||||
|
||||
AssertThat(set1.included_ranges(), Equals(vector<CharacterRange>({
|
||||
CharacterRange{'a', 'c'},
|
||||
CharacterRange('g'),
|
||||
CharacterRange('z'),
|
||||
})));
|
||||
});
|
||||
|
||||
it("doesn't consolidate sequences of 2 consecutive characters", [&]() {
|
||||
CharacterSet set1 = CharacterSet()
|
||||
.include('a', 'b')
|
||||
.include('g')
|
||||
.include('z');
|
||||
|
||||
AssertThat(set1.included_ranges(), Equals(vector<CharacterRange>({
|
||||
CharacterRange('a'),
|
||||
CharacterRange('b'),
|
||||
CharacterRange('g'),
|
||||
CharacterRange{'e', 'j'},
|
||||
CharacterRange('m'),
|
||||
CharacterRange('z'),
|
||||
})));
|
||||
});
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue