Fix some regex parsing bugs

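* Allow escape sequences to be used as the endpoints of character-class ranges (e.g. `[\0-\n]`)
* Don't give special meaning to dashes outside of character classes (e.g. `a-b` now parses as the three literal characters `a`, `-`, `b`)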
This commit is contained in:
Max Brunsfeld 2018-04-06 12:46:06 -07:00
parent 345e344377
commit 1ca261c79b
2 changed files with 64 additions and 27 deletions

View file

@ -136,10 +136,7 @@ class PatternParser {
}
default: {
auto pair = single_char();
if (pair.second.type)
return { Blank{}, pair.second };
return {pair.first, CompileError::none()};
return {single_char(), CompileError::none()};
}
}
}
@ -154,38 +151,46 @@ class PatternParser {
}
while (has_more_input() && (peek() != ']')) {
auto pair = single_char();
if (pair.second.type)
return { CharacterSet(), pair.second };
auto characters = single_char();
if (peek() == '-') {
next();
if (!characters.includes_all && characters.included_chars.size() == 1 && peek() != ']') {
auto next_characters = single_char();
if (!next_characters.includes_all && next_characters.included_chars.size() == 1) {
characters.include(
*characters.included_chars.begin(),
*next_characters.included_chars.begin()
);
} else {
characters.include('-');
characters.add_set(next_characters);
}
} else {
characters.include('-');
}
}
if (is_affirmative)
result.add_set(pair.first);
result.add_set(characters);
else
result.remove_set(pair.first);
result.remove_set(characters);
}
return { result, CompileError::none() };
}
pair<CharacterSet, CompileError> single_char() {
CharacterSet single_char() {
CharacterSet value;
switch (peek()) {
case '\\':
next();
value = escaped_char(peek());
next();
break;
default:
uint32_t first_char = peek();
next();
if (peek() == '-') {
next();
value = CharacterSet().include(first_char, peek());
next();
} else {
value = CharacterSet().include(first_char);
}
if (peek() == '\\') {
next();
value = escaped_char(peek());
next();
} else {
value = CharacterSet().include(peek());
next();
}
return { value, CompileError::none() };
return value;
}
CharacterSet escaped_char(uint32_t value) {
@ -220,6 +225,8 @@ class PatternParser {
.exclude('\t')
.exclude('\n')
.exclude('\r');
case '0':
return CharacterSet().include('\0');
case 't':
return CharacterSet().include('\t');
case 'n':

View file

@ -110,6 +110,30 @@ describe("parse_regex", []() {
CharacterSet{{'\\'}}
},
{
"dashes",
"a-b",
Rule::seq({
CharacterSet{{'a'}},
CharacterSet{{'-'}},
CharacterSet{{'b'}}
})
},
{
"literal dashes in character classes",
"[a-][\\d-a][\\S-a]",
Rule::seq({
CharacterSet{{'a', '-'}},
CharacterSet().include('0', '9').include('-').include('a'),
CharacterSet().include_all()
.exclude(' ')
.exclude('\t')
.exclude('\r')
.exclude('\n')
})
},
{
"character groups in sequences",
"x([^x]|\\\\x)*x",
@ -171,6 +195,12 @@ describe("parse_regex", []() {
CharacterSet{{'[', '\\', ']'}}
},
{
"escaped characters in ranges",
"[\\0-\\n]",
CharacterSet().include(0, '\n')
},
{
"escaped periods",
"a\\.",