diff --git a/src/compiler/prepare_grammar/parse_regex.cc b/src/compiler/prepare_grammar/parse_regex.cc
index 4117af0a..8ab84fb2 100644
--- a/src/compiler/prepare_grammar/parse_regex.cc
+++ b/src/compiler/prepare_grammar/parse_regex.cc
@@ -1,6 +1,7 @@
 #include "compiler/prepare_grammar/parse_regex.h"
 #include <string>
 #include <utility>
+#include <cwctype>
 #include <vector>
 #include "compiler/rule.h"
 #include "compiler/util/string_helpers.h"
@@ -12,6 +13,7 @@ namespace prepare_grammar {
 using std::string;
 using std::vector;
 using std::pair;
+using std::iswdigit;
 using rules::CharacterSet;
 using rules::Blank;
 using rules::Rule;
@@ -85,6 +87,74 @@ class PatternParser {
           next();
           result = Rule::choice({result, Blank{}});
           break;
+        case '{': {
+          // Counted repetition: `{n}`, `{n,}`, `{,m}`, `{n,m}` or `{,}`. Anything
+          // else starting with `{` is not a quantifier, so remember the current
+          // position in order to hand the brace back as a literal character.
+          Checkpoint checkpoint = get_checkpoint();
+          next();
+
+          string min_repeat_string;
+          while (iswdigit(peek())) {
+            min_repeat_string += (char)peek();
+            next();
+          }
+
+          bool has_comma = false;
+          string max_repeat_string;
+          if (peek() == ',') {
+            next();
+            has_comma = true;
+            while (iswdigit(peek())) {
+              max_repeat_string += (char)peek();
+              next();
+            }
+          }
+
+          if (peek() == '}' && (!min_repeat_string.empty() || has_comma)) {
+            next();
+
+            // Convert both bounds before building anything: every repetition
+            // below copies the operand, so an oversized count has to become a
+            // parse error here instead of an exception or a huge allocation.
+            unsigned min_count = 0;
+            unsigned max_count = 0;
+            if (!parse_repeat_count(min_repeat_string, &min_count) ||
+                !parse_repeat_count(max_repeat_string, &max_count)) {
+              return error("number too large in {} quantifier");
+            }
+
+            if (min_repeat_string.size()) {
+              // `min_count` mandatory copies of the operand come first.
+              vector<Rule> entries(min_count, result);
+              if (max_repeat_string.size()) {
+                if (max_count < min_count) {
+                  return error("numbers out of order in {} quantifier");
+                }
+                // `{n,m}`: the remaining `m - n` copies are each optional.
+                vector<Rule> optional_entries(max_count - min_count, Rule::choice({result, Blank{}}));
+                entries.insert(entries.end(), optional_entries.begin(), optional_entries.end());
+              } else if (has_comma) {
+                // `{n,}`: no upper bound.
+                // NOTE(review): this assumes Rule::repeat also matches zero
+                // occurrences; confirm, otherwise `{n,}` requires n + 1.
+                entries.push_back(Rule::repeat(result));
+              }
+              result = Rule::seq(entries);
+            } else if (max_repeat_string.size()) {
+              // `{,m}`: `max_count` optional copies and nothing mandatory.
+              vector<Rule> optional_entries(max_count, Rule::choice({result, Blank{}}));
+              result = Rule::seq(optional_entries);
+            } else {
+              // `{,}`: no bounds at all.
+              result = Rule::repeat(result);
+            }
+          } else {
+            revert(checkpoint);
+          }
+
+          break;
+        }
       }
     }
 
@@ -245,6 +315,40 @@ class PatternParser {
     iter += lookahead_size;
   }
 
+  // Snapshot of the parser's input position and lookahead character, used
+  // to back out of a speculative parse.
+  struct Checkpoint {
+    const uint8_t *iter;
+    int32_t lookahead;
+  };
+
+  // Captures the current input position and lookahead character.
+  Checkpoint get_checkpoint() {
+    return Checkpoint{iter, lookahead};
+  }
+
+  // Restores a position previously captured by get_checkpoint().
+  void revert(Checkpoint checkpoint) {
+    iter = checkpoint.iter;
+    lookahead = checkpoint.lookahead;
+  }
+
+  // Converts the digits of a quantifier bound into `count`, yielding zero
+  // for an empty string. Returns false, without touching `count`, if the
+  // value exceeds the largest bound the parser accepts.
+  static bool parse_repeat_count(const string &digits, unsigned *count) {
+    // Same ceiling as PCRE. Checking after every digit keeps `value` far
+    // away from overflowing no matter how many digits were written.
+    const unsigned max_repeat_count = 65535;
+    unsigned value = 0;
+    for (char digit : digits) {
+      value = value * 10 + static_cast<unsigned>(digit - '0');
+      if (value > max_repeat_count) return false;
+    }
+    *count = value;
+    return true;
+  }
+
   uint32_t peek() {
     return lookahead;
   }
diff --git a/test/compiler/prepare_grammar/parse_regex_test.cc b/test/compiler/prepare_grammar/parse_regex_test.cc
index 8a3ab54e..fe189975 100644
--- a/test/compiler/prepare_grammar/parse_regex_test.cc
+++ b/test/compiler/prepare_grammar/parse_regex_test.cc
@@ -274,6 +274,76 @@ describe("parse_regex", []() {
         CharacterSet{{'/'}},
       }),
     },
+
+    {
+      "characters with quantifiers",
+      "a{3}",
+      Rule::seq({
+        CharacterSet{{'a'}},
+        CharacterSet{{'a'}},
+        CharacterSet{{'a'}},
+      }),
+    },
+
+    {
+      "character classes with quantifiers",
+      "[a-f]{3}",
+      Rule::seq({
+        CharacterSet().include('a', 'f'),
+        CharacterSet().include('a', 'f'),
+        CharacterSet().include('a', 'f'),
+      }),
+    },
+
+    {
+      "characters with open range quantifiers",
+      "a{,} b{1,} c{,2}",
+      Rule::seq({
+        Rule::seq({
+          Repeat{CharacterSet{{'a'}}},
+        }),
+        CharacterSet{{' '}},
+        Rule::seq({
+          CharacterSet{{'b'}},
+          Repeat{CharacterSet{{'b'}}},
+        }),
+        CharacterSet{{' '}},
+        Rule::seq({
+          Rule::choice({CharacterSet{{'c'}}, Blank{}}),
+          Rule::choice({CharacterSet{{'c'}}, Blank{}}),
+        }),
+      }),
+    },
+
+    {
+      "characters with closed range quantifiers",
+      "a{2,4}",
+      Rule::seq({
+        CharacterSet{{'a'}},
+        CharacterSet{{'a'}},
+        Rule::choice({CharacterSet{{'a'}}, Blank{}}),
+        Rule::choice({CharacterSet{{'a'}}, Blank{}}),
+      }),
+    },
+
+    {
+      "curly braces that aren't quantifiers",
+      "a{1b} c{2,d}",
+      Rule::seq({
+        CharacterSet{{'a'}},
+        CharacterSet{{'{'}},
+        CharacterSet{{'1'}},
+        CharacterSet{{'b'}},
+        CharacterSet{{'}'}},
+        CharacterSet{{' '}},
+        CharacterSet{{'c'}},
+        CharacterSet{{'{'}},
+        CharacterSet{{'2'}},
+        CharacterSet{{','}},
+        CharacterSet{{'d'}},
+        CharacterSet{{'}'}},
+      }),
+    }
   };
 
   struct InvalidInputRow {
@@ -313,6 +383,16 @@ describe("parse_regex", []() {
       "a]",
       "unmatched close square bracket",
     },
+    {
+      "numbers out of order in range quantifiers",
+      "a{3,1}",
+      "numbers out of order in {} quantifier",
+    },
+    {
+      "numbers too large in range quantifiers",
+      "a{99999999999}",
+      "number too large in {} quantifier",
+    },
   };
 
   for (auto &row : valid_inputs) {