Support {} quantifier syntax in regexes
This commit is contained in:
parent
16376c43f5
commit
e88dd223b2
2 changed files with 141 additions and 0 deletions
|
|
@ -1,6 +1,7 @@
|
|||
#include "compiler/prepare_grammar/parse_regex.h"
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <cwctype>
|
||||
#include <vector>
|
||||
#include "compiler/rule.h"
|
||||
#include "compiler/util/string_helpers.h"
|
||||
|
|
@ -12,6 +13,7 @@ namespace prepare_grammar {
|
|||
using std::string;
|
||||
using std::vector;
|
||||
using std::pair;
|
||||
using std::iswdigit;
|
||||
using rules::CharacterSet;
|
||||
using rules::Blank;
|
||||
using rules::Rule;
|
||||
|
|
@ -85,6 +87,56 @@ class PatternParser {
|
|||
next();
|
||||
result = Rule::choice({result, Blank{}});
|
||||
break;
|
||||
// Brace quantifiers: `{min}`, `{min,}`, `{,max}`, `{min,max}` and `{,}`.
// Anything else starting with `{` is not a quantifier; in that case the
// parser position is restored and the brace is left unconsumed.
case '{': {
  // Taken before consuming `{` so a failed match can be undone completely.
  Checkpoint checkpoint = get_checkpoint();
  next();

  // Largest count accepted. This is half the unsigned range, i.e. the
  // `int` range on the usual targets, which is the bound the previous
  // std::stoi-based conversion enforced (by throwing std::out_of_range).
  const unsigned max_repeat_count = static_cast<unsigned>(-1) / 2;
  bool overflow = false;

  // Consumes a run of ASCII digits and accumulates their decimal value
  // into `count`. Returns true if at least one digit was consumed.
  // Only ASCII digits are accepted so the digit value is always 0-9.
  // On overflow the digits are still consumed (so the closing `}` can be
  // found) and `overflow` is set instead of wrapping around.
  auto read_count = [&](unsigned &count) -> bool {
    bool any_digit = false;
    while (peek() >= '0' && peek() <= '9') {
      const unsigned digit = peek() - '0';
      if (count > (max_repeat_count - digit) / 10) {
        overflow = true;
      } else {
        count = count * 10 + digit;
      }
      any_digit = true;
      next();
    }
    return any_digit;
  };

  unsigned min_count = 0;
  unsigned max_count = 0;
  const bool has_min = read_count(min_count);
  bool has_comma = false;
  bool has_max = false;
  if (peek() == ',') {
    next();
    has_comma = true;
    has_max = read_count(max_count);
  }

  // A quantifier needs a closing brace and at least a number or a comma;
  // `{}` alone does not qualify.
  if (peek() == '}' && (has_min || has_comma)) {
    next();

    // Report oversized counts as a parse error rather than letting an
    // exception escape from the number conversion.
    if (overflow) {
      return error("number too large in {} quantifier");
    }

    if (has_min) {
      // `min_count` mandatory copies of the preceding rule...
      vector<Rule> entries(min_count, result);
      if (has_max) {
        if (max_count < min_count) {
          return error("numbers out of order in {} quantifier");
        }
        // ...followed by `max_count - min_count` optional copies.
        entries.insert(entries.end(), max_count - min_count, Rule::choice({result, Blank{}}));
      } else if (has_comma) {
        // `{min,}`: the mandatory copies followed by a repeat.
        entries.push_back(Rule::repeat(result));
      }
      result = Rule::seq(entries);
    } else if (has_max) {
      // `{,max}`: `max_count` optional copies.
      vector<Rule> optional_entries(max_count, Rule::choice({result, Blank{}}));
      result = Rule::seq(optional_entries);
    } else {
      // `{,}`: a plain repeat of the preceding rule.
      result = Rule::repeat(result);
    }
  } else {
    revert(checkpoint);
  }

  break;
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -245,6 +297,20 @@ class PatternParser {
|
|||
iter += lookahead_size;
|
||||
}
|
||||
|
||||
// A saved parser position: the input pointer together with the lookahead
// value that was current at that position.
struct Checkpoint {
  const uint8_t *iter;  // saved input pointer
  int32_t lookahead;    // saved lookahead value
};
|
||||
|
||||
// Records the parser's current `iter` and `lookahead` in a Checkpoint so
// the position can be restored later.
Checkpoint get_checkpoint() {
  Checkpoint snapshot;
  snapshot.iter = iter;
  snapshot.lookahead = lookahead;
  return snapshot;
}
|
||||
|
||||
// Rewinds the parser to the position stored in `checkpoint` by overwriting
// both `lookahead` and `iter` with the saved values.
void revert(Checkpoint checkpoint) {
  lookahead = checkpoint.lookahead;
  iter = checkpoint.iter;
}
|
||||
|
||||
uint32_t peek() {
|
||||
return lookahead;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -274,6 +274,76 @@ describe("parse_regex", []() {
|
|||
CharacterSet{{'/'}},
|
||||
}),
|
||||
},
|
||||
|
||||
{
|
||||
"characters with quantifiers",
|
||||
"a{3}",
|
||||
Rule::seq({
|
||||
CharacterSet{{'a'}},
|
||||
CharacterSet{{'a'}},
|
||||
CharacterSet{{'a'}},
|
||||
}),
|
||||
},
|
||||
|
||||
{
|
||||
"character classes with quantifiers",
|
||||
"[a-f]{3}",
|
||||
Rule::seq({
|
||||
CharacterSet().include('a', 'f'),
|
||||
CharacterSet().include('a', 'f'),
|
||||
CharacterSet().include('a', 'f'),
|
||||
}),
|
||||
},
|
||||
|
||||
{
|
||||
"characters with open range quantifiers",
|
||||
"a{,} b{1,} c{,2}",
|
||||
Rule::seq({
|
||||
Rule::seq({
|
||||
Repeat{CharacterSet{{'a'}}},
|
||||
}),
|
||||
CharacterSet{{' '}},
|
||||
Rule::seq({
|
||||
CharacterSet{{'b'}},
|
||||
Repeat{CharacterSet{{'b'}}},
|
||||
}),
|
||||
CharacterSet{{' '}},
|
||||
Rule::seq({
|
||||
Rule::choice({CharacterSet{{'c'}}, Blank{}}),
|
||||
Rule::choice({CharacterSet{{'c'}}, Blank{}}),
|
||||
}),
|
||||
}),
|
||||
},
|
||||
|
||||
{
|
||||
"characters with closed range quantifiers",
|
||||
"a{2,4}",
|
||||
Rule::seq({
|
||||
CharacterSet{{'a'}},
|
||||
CharacterSet{{'a'}},
|
||||
Rule::choice({CharacterSet{{'a'}}, Blank{}}),
|
||||
Rule::choice({CharacterSet{{'a'}}, Blank{}}),
|
||||
}),
|
||||
},
|
||||
|
||||
{
|
||||
"curly braces that aren't quantifiers",
|
||||
"a{1b} c{2,d}",
|
||||
Rule::seq({
|
||||
CharacterSet{{'a'}},
|
||||
CharacterSet{{'{'}},
|
||||
CharacterSet{{'1'}},
|
||||
CharacterSet{{'b'}},
|
||||
CharacterSet{{'}'}},
|
||||
CharacterSet{{' '}},
|
||||
CharacterSet{{'c'}},
|
||||
CharacterSet{{'{'}},
|
||||
CharacterSet{{'2'}},
|
||||
CharacterSet{{','}},
|
||||
CharacterSet{{'d'}},
|
||||
CharacterSet{{'}'}},
|
||||
}),
|
||||
}
|
||||
};
|
||||
|
||||
struct InvalidInputRow {
|
||||
|
|
@ -313,6 +383,11 @@ describe("parse_regex", []() {
|
|||
"a]",
|
||||
"unmatched close square bracket",
|
||||
},
|
||||
{
|
||||
"numbers out of order in range quantifiers",
|
||||
"a{3,1}",
|
||||
"numbers out of order in {} quantifier",
|
||||
},
|
||||
};
|
||||
|
||||
for (auto &row : valid_inputs) {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue