Merge pull request #2474 from amaanq/case-insensitive
feat!: support the case-insensitive regex flag
This commit is contained in:
commit
dee98e06e9
7 changed files with 166 additions and 85 deletions
|
|
@ -390,12 +390,12 @@ mod tests {
|
|||
Variable {
|
||||
name: "token_0".to_string(),
|
||||
kind: VariableType::Named,
|
||||
rule: Rule::pattern("[a-f]1|0x\\d"),
|
||||
rule: Rule::pattern("[a-f]1|0x\\d", ""),
|
||||
},
|
||||
Variable {
|
||||
name: "token_1".to_string(),
|
||||
kind: VariableType::Named,
|
||||
rule: Rule::pattern("d*ef"),
|
||||
rule: Rule::pattern("d*ef", ""),
|
||||
},
|
||||
],
|
||||
})
|
||||
|
|
@ -426,7 +426,7 @@ mod tests {
|
|||
Variable {
|
||||
name: "identifier".to_string(),
|
||||
kind: VariableType::Named,
|
||||
rule: Rule::pattern("\\w+"),
|
||||
rule: Rule::pattern("\\w+", ""),
|
||||
},
|
||||
Variable {
|
||||
name: "instanceof".to_string(),
|
||||
|
|
@ -471,7 +471,7 @@ mod tests {
|
|||
#[test]
|
||||
fn test_token_conflicts_with_separators() {
|
||||
let grammar = expand_tokens(ExtractedLexicalGrammar {
|
||||
separators: vec![Rule::pattern("\\s")],
|
||||
separators: vec![Rule::pattern("\\s", "")],
|
||||
variables: vec![
|
||||
Variable {
|
||||
name: "x".to_string(),
|
||||
|
|
@ -498,7 +498,7 @@ mod tests {
|
|||
#[test]
|
||||
fn test_token_conflicts_with_open_ended_tokens() {
|
||||
let grammar = expand_tokens(ExtractedLexicalGrammar {
|
||||
separators: vec![Rule::pattern("\\s")],
|
||||
separators: vec![Rule::pattern("\\s", "")],
|
||||
variables: vec![
|
||||
Variable {
|
||||
name: "x".to_string(),
|
||||
|
|
@ -508,7 +508,7 @@ mod tests {
|
|||
Variable {
|
||||
name: "anything".to_string(),
|
||||
kind: VariableType::Named,
|
||||
rule: Rule::prec(Precedence::Integer(-1), Rule::pattern(".*")),
|
||||
rule: Rule::prec(Precedence::Integer(-1), Rule::pattern(".*", "")),
|
||||
},
|
||||
],
|
||||
})
|
||||
|
|
|
|||
|
|
@ -183,7 +183,8 @@ function normalize(value) {
|
|||
case RegExp:
|
||||
return {
|
||||
type: 'PATTERN',
|
||||
value: value.source
|
||||
value: value.source,
|
||||
flags: value.flags
|
||||
};
|
||||
case ReferenceError:
|
||||
throw value
|
||||
|
|
|
|||
|
|
@ -1172,12 +1172,12 @@ mod tests {
|
|||
Variable {
|
||||
name: "identifier".to_string(),
|
||||
kind: VariableType::Named,
|
||||
rule: Rule::pattern("\\w+"),
|
||||
rule: Rule::pattern("\\w+", ""),
|
||||
},
|
||||
Variable {
|
||||
name: "foo_identifier".to_string(),
|
||||
kind: VariableType::Named,
|
||||
rule: Rule::pattern("[\\w-]+"),
|
||||
rule: Rule::pattern("[\\w-]+", ""),
|
||||
},
|
||||
],
|
||||
..Default::default()
|
||||
|
|
@ -1275,8 +1275,8 @@ mod tests {
|
|||
name: "script".to_string(),
|
||||
kind: VariableType::Named,
|
||||
rule: Rule::seq(vec![
|
||||
Rule::field("a".to_string(), Rule::pattern("hi")),
|
||||
Rule::field("b".to_string(), Rule::pattern("bye")),
|
||||
Rule::field("a".to_string(), Rule::pattern("hi", "")),
|
||||
Rule::field("b".to_string(), Rule::pattern("bye", "")),
|
||||
]),
|
||||
}],
|
||||
..Default::default()
|
||||
|
|
|
|||
|
|
@ -19,6 +19,7 @@ enum RuleJSON {
|
|||
},
|
||||
PATTERN {
|
||||
value: String,
|
||||
flags: Option<String>,
|
||||
},
|
||||
SYMBOL {
|
||||
name: String,
|
||||
|
|
@ -143,7 +144,21 @@ fn parse_rule(json: RuleJSON) -> Rule {
|
|||
} => Rule::alias(parse_rule(*content), value, named),
|
||||
RuleJSON::BLANK => Rule::Blank,
|
||||
RuleJSON::STRING { value } => Rule::String(value),
|
||||
RuleJSON::PATTERN { value } => Rule::Pattern(value),
|
||||
RuleJSON::PATTERN { value, flags } => Rule::Pattern(
|
||||
value,
|
||||
flags.map_or(String::new(), |f| {
|
||||
f.chars()
|
||||
.filter(|c| {
|
||||
if *c != 'i' {
|
||||
eprintln!("Warning: unsupported flag {}", c);
|
||||
false
|
||||
} else {
|
||||
true
|
||||
}
|
||||
})
|
||||
.collect()
|
||||
}),
|
||||
),
|
||||
RuleJSON::SYMBOL { name } => Rule::NamedSymbol(name),
|
||||
RuleJSON::CHOICE { members } => Rule::choice(members.into_iter().map(parse_rule).collect()),
|
||||
RuleJSON::FIELD { content, name } => Rule::field(name, parse_rule(*content)),
|
||||
|
|
|
|||
|
|
@ -139,10 +139,10 @@ pub(crate) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result<Lexi
|
|||
impl NfaBuilder {
|
||||
fn expand_rule(&mut self, rule: &Rule, mut next_state_id: u32) -> Result<bool> {
|
||||
match rule {
|
||||
Rule::Pattern(s) => {
|
||||
Rule::Pattern(s, f) => {
|
||||
let s = preprocess_regex(s);
|
||||
let ast = parse::Parser::new().parse(&s)?;
|
||||
self.expand_regex(&ast, next_state_id)
|
||||
self.expand_regex(&ast, next_state_id, f.contains('i'))
|
||||
}
|
||||
Rule::String(s) => {
|
||||
for c in s.chars().rev() {
|
||||
|
|
@ -210,12 +210,42 @@ impl NfaBuilder {
|
|||
}
|
||||
}
|
||||
|
||||
fn expand_regex(&mut self, ast: &Ast, mut next_state_id: u32) -> Result<bool> {
|
||||
fn expand_regex(
|
||||
&mut self,
|
||||
ast: &Ast,
|
||||
mut next_state_id: u32,
|
||||
case_insensitive: bool,
|
||||
) -> Result<bool> {
|
||||
fn inverse_char(c: char) -> char {
|
||||
match c {
|
||||
'a'..='z' => (c as u8 - b'a' + b'A') as char,
|
||||
'A'..='Z' => (c as u8 - b'A' + b'a') as char,
|
||||
c => c,
|
||||
}
|
||||
}
|
||||
|
||||
fn with_inverse_char(mut chars: CharacterSet) -> CharacterSet {
|
||||
for char in chars.clone().chars() {
|
||||
let inverted = inverse_char(char);
|
||||
if char != inverted {
|
||||
chars = chars.add_char(inverted);
|
||||
}
|
||||
}
|
||||
chars
|
||||
}
|
||||
|
||||
match ast {
|
||||
Ast::Empty(_) => Ok(false),
|
||||
Ast::Flags(_) => Err(anyhow!("Regex error: Flags are not supported")),
|
||||
Ast::Literal(literal) => {
|
||||
self.push_advance(CharacterSet::from_char(literal.c), next_state_id);
|
||||
let mut char_set = CharacterSet::from_char(literal.c);
|
||||
if case_insensitive {
|
||||
let inverted = inverse_char(literal.c);
|
||||
if literal.c != inverted {
|
||||
char_set = char_set.add_char(inverted);
|
||||
}
|
||||
}
|
||||
self.push_advance(char_set, next_state_id);
|
||||
Ok(true)
|
||||
}
|
||||
Ast::Dot(_) => {
|
||||
|
|
@ -229,6 +259,9 @@ impl NfaBuilder {
|
|||
if class.negated {
|
||||
chars = chars.negate();
|
||||
}
|
||||
if case_insensitive {
|
||||
chars = with_inverse_char(chars);
|
||||
}
|
||||
self.push_advance(chars, next_state_id);
|
||||
Ok(true)
|
||||
}
|
||||
|
|
@ -237,6 +270,9 @@ impl NfaBuilder {
|
|||
if class.negated {
|
||||
chars = chars.negate();
|
||||
}
|
||||
if case_insensitive {
|
||||
chars = with_inverse_char(chars);
|
||||
}
|
||||
self.push_advance(chars, next_state_id);
|
||||
Ok(true)
|
||||
}
|
||||
|
|
@ -245,48 +281,56 @@ impl NfaBuilder {
|
|||
if class.negated {
|
||||
chars = chars.negate();
|
||||
}
|
||||
if case_insensitive {
|
||||
chars = with_inverse_char(chars);
|
||||
}
|
||||
self.push_advance(chars, next_state_id);
|
||||
Ok(true)
|
||||
}
|
||||
},
|
||||
Ast::Repetition(repetition) => match repetition.op.kind {
|
||||
RepetitionKind::ZeroOrOne => {
|
||||
self.expand_zero_or_one(&repetition.ast, next_state_id)
|
||||
self.expand_zero_or_one(&repetition.ast, next_state_id, case_insensitive)
|
||||
}
|
||||
RepetitionKind::OneOrMore => {
|
||||
self.expand_one_or_more(&repetition.ast, next_state_id)
|
||||
self.expand_one_or_more(&repetition.ast, next_state_id, case_insensitive)
|
||||
}
|
||||
RepetitionKind::ZeroOrMore => {
|
||||
self.expand_zero_or_more(&repetition.ast, next_state_id)
|
||||
self.expand_zero_or_more(&repetition.ast, next_state_id, case_insensitive)
|
||||
}
|
||||
RepetitionKind::Range(RepetitionRange::Exactly(count)) => {
|
||||
self.expand_count(&repetition.ast, count, next_state_id)
|
||||
self.expand_count(&repetition.ast, count, next_state_id, case_insensitive)
|
||||
}
|
||||
RepetitionKind::Range(RepetitionRange::AtLeast(min)) => {
|
||||
if self.expand_zero_or_more(&repetition.ast, next_state_id)? {
|
||||
self.expand_count(&repetition.ast, min, next_state_id)
|
||||
if self.expand_zero_or_more(&repetition.ast, next_state_id, case_insensitive)? {
|
||||
self.expand_count(&repetition.ast, min, next_state_id, case_insensitive)
|
||||
} else {
|
||||
Ok(false)
|
||||
}
|
||||
}
|
||||
RepetitionKind::Range(RepetitionRange::Bounded(min, max)) => {
|
||||
let mut result = self.expand_count(&repetition.ast, min, next_state_id)?;
|
||||
let mut result =
|
||||
self.expand_count(&repetition.ast, min, next_state_id, case_insensitive)?;
|
||||
for _ in min..max {
|
||||
if result {
|
||||
next_state_id = self.nfa.last_state_id();
|
||||
}
|
||||
if self.expand_zero_or_one(&repetition.ast, next_state_id)? {
|
||||
if self.expand_zero_or_one(
|
||||
&repetition.ast,
|
||||
next_state_id,
|
||||
case_insensitive,
|
||||
)? {
|
||||
result = true;
|
||||
}
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
},
|
||||
Ast::Group(group) => self.expand_regex(&group.ast, next_state_id),
|
||||
Ast::Group(group) => self.expand_regex(&group.ast, next_state_id, case_insensitive),
|
||||
Ast::Alternation(alternation) => {
|
||||
let mut alternative_state_ids = Vec::new();
|
||||
for ast in alternation.asts.iter() {
|
||||
if self.expand_regex(&ast, next_state_id)? {
|
||||
if self.expand_regex(&ast, next_state_id, case_insensitive)? {
|
||||
alternative_state_ids.push(self.nfa.last_state_id());
|
||||
} else {
|
||||
alternative_state_ids.push(next_state_id);
|
||||
|
|
@ -304,7 +348,7 @@ impl NfaBuilder {
|
|||
Ast::Concat(concat) => {
|
||||
let mut result = false;
|
||||
for ast in concat.asts.iter().rev() {
|
||||
if self.expand_regex(&ast, next_state_id)? {
|
||||
if self.expand_regex(&ast, next_state_id, case_insensitive)? {
|
||||
result = true;
|
||||
next_state_id = self.nfa.last_state_id();
|
||||
}
|
||||
|
|
@ -335,13 +379,18 @@ impl NfaBuilder {
|
|||
}
|
||||
}
|
||||
|
||||
fn expand_one_or_more(&mut self, ast: &Ast, next_state_id: u32) -> Result<bool> {
|
||||
fn expand_one_or_more(
|
||||
&mut self,
|
||||
ast: &Ast,
|
||||
next_state_id: u32,
|
||||
case_insensitive: bool,
|
||||
) -> Result<bool> {
|
||||
self.nfa.states.push(NfaState::Accept {
|
||||
variable_index: 0,
|
||||
precedence: 0,
|
||||
}); // Placeholder for split
|
||||
let split_state_id = self.nfa.last_state_id();
|
||||
if self.expand_regex(&ast, split_state_id)? {
|
||||
if self.expand_regex(&ast, split_state_id, case_insensitive)? {
|
||||
self.nfa.states[split_state_id as usize] =
|
||||
NfaState::Split(self.nfa.last_state_id(), next_state_id);
|
||||
Ok(true)
|
||||
|
|
@ -351,8 +400,13 @@ impl NfaBuilder {
|
|||
}
|
||||
}
|
||||
|
||||
fn expand_zero_or_one(&mut self, ast: &Ast, next_state_id: u32) -> Result<bool> {
|
||||
if self.expand_regex(ast, next_state_id)? {
|
||||
fn expand_zero_or_one(
|
||||
&mut self,
|
||||
ast: &Ast,
|
||||
next_state_id: u32,
|
||||
case_insensitive: bool,
|
||||
) -> Result<bool> {
|
||||
if self.expand_regex(ast, next_state_id, case_insensitive)? {
|
||||
self.push_split(next_state_id);
|
||||
Ok(true)
|
||||
} else {
|
||||
|
|
@ -360,8 +414,13 @@ impl NfaBuilder {
|
|||
}
|
||||
}
|
||||
|
||||
fn expand_zero_or_more(&mut self, ast: &Ast, next_state_id: u32) -> Result<bool> {
|
||||
if self.expand_one_or_more(&ast, next_state_id)? {
|
||||
fn expand_zero_or_more(
|
||||
&mut self,
|
||||
ast: &Ast,
|
||||
next_state_id: u32,
|
||||
case_insensitive: bool,
|
||||
) -> Result<bool> {
|
||||
if self.expand_one_or_more(&ast, next_state_id, case_insensitive)? {
|
||||
self.push_split(next_state_id);
|
||||
Ok(true)
|
||||
} else {
|
||||
|
|
@ -369,10 +428,16 @@ impl NfaBuilder {
|
|||
}
|
||||
}
|
||||
|
||||
fn expand_count(&mut self, ast: &Ast, count: u32, mut next_state_id: u32) -> Result<bool> {
|
||||
fn expand_count(
|
||||
&mut self,
|
||||
ast: &Ast,
|
||||
count: u32,
|
||||
mut next_state_id: u32,
|
||||
case_insensitive: bool,
|
||||
) -> Result<bool> {
|
||||
let mut result = false;
|
||||
for _ in 0..count {
|
||||
if self.expand_regex(ast, next_state_id)? {
|
||||
if self.expand_regex(ast, next_state_id, case_insensitive)? {
|
||||
result = true;
|
||||
next_state_id = self.nfa.last_state_id();
|
||||
}
|
||||
|
|
@ -565,7 +630,7 @@ mod tests {
|
|||
let table = [
|
||||
// regex with sequences and alternatives
|
||||
Row {
|
||||
rules: vec![Rule::pattern("(a|b|c)d(e|f|g)h?")],
|
||||
rules: vec![Rule::pattern("(a|b|c)d(e|f|g)h?", "")],
|
||||
separators: vec![],
|
||||
examples: vec![
|
||||
("ade1", Some((0, "ade"))),
|
||||
|
|
@ -576,13 +641,13 @@ mod tests {
|
|||
},
|
||||
// regex with repeats
|
||||
Row {
|
||||
rules: vec![Rule::pattern("a*")],
|
||||
rules: vec![Rule::pattern("a*", "")],
|
||||
separators: vec![],
|
||||
examples: vec![("aaa1", Some((0, "aaa"))), ("b", Some((0, "")))],
|
||||
},
|
||||
// regex with repeats in sequences
|
||||
Row {
|
||||
rules: vec![Rule::pattern("a((bc)+|(de)*)f")],
|
||||
rules: vec![Rule::pattern("a((bc)+|(de)*)f", "")],
|
||||
separators: vec![],
|
||||
examples: vec![
|
||||
("af1", Some((0, "af"))),
|
||||
|
|
@ -593,13 +658,13 @@ mod tests {
|
|||
},
|
||||
// regex with character ranges
|
||||
Row {
|
||||
rules: vec![Rule::pattern("[a-fA-F0-9]+")],
|
||||
rules: vec![Rule::pattern("[a-fA-F0-9]+", "")],
|
||||
separators: vec![],
|
||||
examples: vec![("A1ff0.", Some((0, "A1ff0")))],
|
||||
},
|
||||
// regex with perl character classes
|
||||
Row {
|
||||
rules: vec![Rule::pattern("\\w\\d\\s")],
|
||||
rules: vec![Rule::pattern("\\w\\d\\s", "")],
|
||||
separators: vec![],
|
||||
examples: vec![("_0 ", Some((0, "_0 ")))],
|
||||
},
|
||||
|
|
@ -613,7 +678,7 @@ mod tests {
|
|||
Row {
|
||||
rules: vec![Rule::repeat(Rule::seq(vec![
|
||||
Rule::string("{"),
|
||||
Rule::pattern("[a-f]+"),
|
||||
Rule::pattern("[a-f]+", ""),
|
||||
Rule::string("}"),
|
||||
]))],
|
||||
separators: vec![],
|
||||
|
|
@ -626,9 +691,9 @@ mod tests {
|
|||
// longest match rule
|
||||
Row {
|
||||
rules: vec![
|
||||
Rule::pattern("a|bc"),
|
||||
Rule::pattern("aa"),
|
||||
Rule::pattern("bcd"),
|
||||
Rule::pattern("a|bc", ""),
|
||||
Rule::pattern("aa", ""),
|
||||
Rule::pattern("bcd", ""),
|
||||
],
|
||||
separators: vec![],
|
||||
examples: vec![
|
||||
|
|
@ -642,7 +707,7 @@ mod tests {
|
|||
},
|
||||
// regex with an alternative including the empty string
|
||||
Row {
|
||||
rules: vec![Rule::pattern("a(b|)+c")],
|
||||
rules: vec![Rule::pattern("a(b|)+c", "")],
|
||||
separators: vec![],
|
||||
examples: vec![
|
||||
("ac.", Some((0, "ac"))),
|
||||
|
|
@ -652,8 +717,8 @@ mod tests {
|
|||
},
|
||||
// separators
|
||||
Row {
|
||||
rules: vec![Rule::pattern("[a-f]+")],
|
||||
separators: vec![Rule::string("\\\n"), Rule::pattern("\\s")],
|
||||
rules: vec![Rule::pattern("[a-f]+", "")],
|
||||
separators: vec![Rule::string("\\\n"), Rule::pattern("\\s", "")],
|
||||
examples: vec![
|
||||
(" a", Some((0, "a"))),
|
||||
(" \nb", Some((0, "b"))),
|
||||
|
|
@ -664,11 +729,11 @@ mod tests {
|
|||
// shorter tokens with higher precedence
|
||||
Row {
|
||||
rules: vec![
|
||||
Rule::prec(Precedence::Integer(2), Rule::pattern("abc")),
|
||||
Rule::prec(Precedence::Integer(1), Rule::pattern("ab[cd]e")),
|
||||
Rule::pattern("[a-e]+"),
|
||||
Rule::prec(Precedence::Integer(2), Rule::pattern("abc", "")),
|
||||
Rule::prec(Precedence::Integer(1), Rule::pattern("ab[cd]e", "")),
|
||||
Rule::pattern("[a-e]+", ""),
|
||||
],
|
||||
separators: vec![Rule::string("\\\n"), Rule::pattern("\\s")],
|
||||
separators: vec![Rule::string("\\\n"), Rule::pattern("\\s", "")],
|
||||
examples: vec![
|
||||
("abceef", Some((0, "abc"))),
|
||||
("abdeef", Some((1, "abde"))),
|
||||
|
|
@ -678,13 +743,13 @@ mod tests {
|
|||
// immediate tokens with higher precedence
|
||||
Row {
|
||||
rules: vec![
|
||||
Rule::prec(Precedence::Integer(1), Rule::pattern("[^a]+")),
|
||||
Rule::prec(Precedence::Integer(1), Rule::pattern("[^a]+", "")),
|
||||
Rule::immediate_token(Rule::prec(
|
||||
Precedence::Integer(2),
|
||||
Rule::pattern("[^ab]+"),
|
||||
Rule::pattern("[^ab]+", ""),
|
||||
)),
|
||||
],
|
||||
separators: vec![Rule::pattern("\\s")],
|
||||
separators: vec![Rule::pattern("\\s", "")],
|
||||
examples: vec![("cccb", Some((1, "ccc")))],
|
||||
},
|
||||
Row {
|
||||
|
|
@ -706,7 +771,7 @@ mod tests {
|
|||
// nested choices within sequences
|
||||
Row {
|
||||
rules: vec![Rule::seq(vec![
|
||||
Rule::pattern("[0-9]+"),
|
||||
Rule::pattern("[0-9]+", ""),
|
||||
Rule::choice(vec![
|
||||
Rule::Blank,
|
||||
Rule::choice(vec![Rule::seq(vec![
|
||||
|
|
@ -715,7 +780,7 @@ mod tests {
|
|||
Rule::Blank,
|
||||
Rule::choice(vec![Rule::string("+"), Rule::string("-")]),
|
||||
]),
|
||||
Rule::pattern("[0-9]+"),
|
||||
Rule::pattern("[0-9]+", ""),
|
||||
])]),
|
||||
]),
|
||||
])],
|
||||
|
|
@ -732,7 +797,7 @@ mod tests {
|
|||
},
|
||||
// nested groups
|
||||
Row {
|
||||
rules: vec![Rule::seq(vec![Rule::pattern(r#"([^x\\]|\\(.|\n))+"#)])],
|
||||
rules: vec![Rule::seq(vec![Rule::pattern(r#"([^x\\]|\\(.|\n))+"#, "")])],
|
||||
separators: vec![],
|
||||
examples: vec![("abcx", Some((0, "abc"))), ("abc\\0x", Some((0, "abc\\0")))],
|
||||
},
|
||||
|
|
@ -740,11 +805,11 @@ mod tests {
|
|||
Row {
|
||||
rules: vec![
|
||||
// Escaped forward slash (used in JS because '/' is the regex delimiter)
|
||||
Rule::pattern(r#"\/"#),
|
||||
Rule::pattern(r#"\/"#, ""),
|
||||
// Escaped quotes
|
||||
Rule::pattern(r#"\"\'"#),
|
||||
Rule::pattern(r#"\"\'"#, ""),
|
||||
// Quote preceded by a literal backslash
|
||||
Rule::pattern(r#"[\\']+"#),
|
||||
Rule::pattern(r#"[\\']+"#, ""),
|
||||
],
|
||||
separators: vec![],
|
||||
examples: vec![
|
||||
|
|
@ -756,8 +821,8 @@ mod tests {
|
|||
// unicode property escapes
|
||||
Row {
|
||||
rules: vec![
|
||||
Rule::pattern(r#"\p{L}+\P{L}+"#),
|
||||
Rule::pattern(r#"\p{White_Space}+\P{White_Space}+[\p{White_Space}]*"#),
|
||||
Rule::pattern(r#"\p{L}+\P{L}+"#, ""),
|
||||
Rule::pattern(r#"\p{White_Space}+\P{White_Space}+[\p{White_Space}]*"#, ""),
|
||||
],
|
||||
separators: vec![],
|
||||
examples: vec![
|
||||
|
|
@ -767,17 +832,17 @@ mod tests {
|
|||
},
|
||||
// unicode property escapes in bracketed sets
|
||||
Row {
|
||||
rules: vec![Rule::pattern(r#"[\p{L}\p{Nd}]+"#)],
|
||||
rules: vec![Rule::pattern(r#"[\p{L}\p{Nd}]+"#, "")],
|
||||
separators: vec![],
|
||||
examples: vec![("abΨ12٣٣, ok", Some((0, "abΨ12٣٣")))],
|
||||
},
|
||||
// unicode character escapes
|
||||
Row {
|
||||
rules: vec![
|
||||
Rule::pattern(r#"\u{00dc}"#),
|
||||
Rule::pattern(r#"\U{000000dd}"#),
|
||||
Rule::pattern(r#"\u00de"#),
|
||||
Rule::pattern(r#"\U000000df"#),
|
||||
Rule::pattern(r#"\u{00dc}"#, ""),
|
||||
Rule::pattern(r#"\U{000000dd}"#, ""),
|
||||
Rule::pattern(r#"\u00de"#, ""),
|
||||
Rule::pattern(r#"\U000000df"#, ""),
|
||||
],
|
||||
separators: vec![],
|
||||
examples: vec![
|
||||
|
|
@ -791,13 +856,13 @@ mod tests {
|
|||
Row {
|
||||
rules: vec![
|
||||
// Un-escaped curly braces
|
||||
Rule::pattern(r#"u{[0-9a-fA-F]+}"#),
|
||||
Rule::pattern(r#"u{[0-9a-fA-F]+}"#, ""),
|
||||
// Already-escaped curly braces
|
||||
Rule::pattern(r#"\{[ab]{3}\}"#),
|
||||
Rule::pattern(r#"\{[ab]{3}\}"#, ""),
|
||||
// Unicode codepoints
|
||||
Rule::pattern(r#"\u{1000A}"#),
|
||||
Rule::pattern(r#"\u{1000A}"#, ""),
|
||||
// Unicode codepoints (lowercase)
|
||||
Rule::pattern(r#"\u{1000b}"#),
|
||||
Rule::pattern(r#"\u{1000b}"#, ""),
|
||||
],
|
||||
separators: vec![],
|
||||
examples: vec![
|
||||
|
|
@ -809,7 +874,7 @@ mod tests {
|
|||
},
|
||||
// Emojis
|
||||
Row {
|
||||
rules: vec![Rule::pattern(r"\p{Emoji}+")],
|
||||
rules: vec![Rule::pattern(r"\p{Emoji}+", "")],
|
||||
separators: vec![],
|
||||
examples: vec![
|
||||
("🐎", Some((0, "🐎"))),
|
||||
|
|
@ -822,7 +887,7 @@ mod tests {
|
|||
},
|
||||
// Intersection
|
||||
Row {
|
||||
rules: vec![Rule::pattern(r"[[0-7]&&[4-9]]+")],
|
||||
rules: vec![Rule::pattern(r"[[0-7]&&[4-9]]+", "")],
|
||||
separators: vec![],
|
||||
examples: vec![
|
||||
("456", Some((0, "456"))),
|
||||
|
|
@ -835,7 +900,7 @@ mod tests {
|
|||
},
|
||||
// Difference
|
||||
Row {
|
||||
rules: vec![Rule::pattern(r"[[0-9]--[4-7]]+")],
|
||||
rules: vec![Rule::pattern(r"[[0-9]--[4-7]]+", "")],
|
||||
separators: vec![],
|
||||
examples: vec![
|
||||
("123", Some((0, "123"))),
|
||||
|
|
@ -848,7 +913,7 @@ mod tests {
|
|||
},
|
||||
// Symmetric difference
|
||||
Row {
|
||||
rules: vec![Rule::pattern(r"[[0-7]~~[4-9]]+")],
|
||||
rules: vec![Rule::pattern(r"[[0-7]~~[4-9]]+", "")],
|
||||
separators: vec![],
|
||||
examples: vec![
|
||||
("123", Some((0, "123"))),
|
||||
|
|
@ -869,7 +934,7 @@ mod tests {
|
|||
// [6-7]: y y
|
||||
// [3-9]--[5-7]: y y y y y
|
||||
// final regex: y y y y y y
|
||||
rules: vec![Rule::pattern(r"[[[0-5]--[2-4]]~~[[3-9]--[6-7]]]+")],
|
||||
rules: vec![Rule::pattern(r"[[[0-5]--[2-4]]~~[[3-9]--[6-7]]]+", "")],
|
||||
separators: vec![],
|
||||
examples: vec![
|
||||
("01", Some((0, "01"))),
|
||||
|
|
|
|||
|
|
@ -320,7 +320,7 @@ mod test {
|
|||
"rule_0",
|
||||
Rule::repeat(Rule::seq(vec![
|
||||
Rule::string("a"),
|
||||
Rule::pattern("b"),
|
||||
Rule::pattern("b", ""),
|
||||
Rule::choice(vec![
|
||||
Rule::non_terminal(1),
|
||||
Rule::non_terminal(2),
|
||||
|
|
@ -331,8 +331,8 @@ mod test {
|
|||
]),
|
||||
])),
|
||||
),
|
||||
Variable::named("rule_1", Rule::pattern("e")),
|
||||
Variable::named("rule_2", Rule::pattern("b")),
|
||||
Variable::named("rule_1", Rule::pattern("e", "")),
|
||||
Variable::named("rule_2", Rule::pattern("b", "")),
|
||||
Variable::named(
|
||||
"rule_3",
|
||||
Rule::seq(vec![Rule::non_terminal(2), Rule::Blank]),
|
||||
|
|
@ -378,12 +378,12 @@ mod test {
|
|||
lexical_grammar.variables,
|
||||
vec![
|
||||
Variable::anonymous("a", Rule::string("a")),
|
||||
Variable::auxiliary("rule_0_token1", Rule::pattern("b")),
|
||||
Variable::auxiliary("rule_0_token1", Rule::pattern("b", "")),
|
||||
Variable::auxiliary(
|
||||
"rule_0_token2",
|
||||
Rule::repeat(Rule::choice(vec![Rule::string("c"), Rule::string("d"),]))
|
||||
),
|
||||
Variable::named("rule_1", Rule::pattern("e")),
|
||||
Variable::named("rule_1", Rule::pattern("e", "")),
|
||||
]
|
||||
);
|
||||
}
|
||||
|
|
@ -411,7 +411,7 @@ mod test {
|
|||
fn test_extracting_extra_symbols() {
|
||||
let mut grammar = build_grammar(vec![
|
||||
Variable::named("rule_0", Rule::string("x")),
|
||||
Variable::named("comment", Rule::pattern("//.*")),
|
||||
Variable::named("comment", Rule::pattern("//.*", "")),
|
||||
]);
|
||||
grammar.extra_symbols = vec![Rule::string(" "), Rule::non_terminal(1)];
|
||||
|
||||
|
|
|
|||
|
|
@ -56,7 +56,7 @@ pub(crate) struct Symbol {
|
|||
pub(crate) enum Rule {
|
||||
Blank,
|
||||
String(String),
|
||||
Pattern(String),
|
||||
Pattern(String, String),
|
||||
NamedSymbol(String),
|
||||
Symbol(Symbol),
|
||||
Choice(Vec<Rule>),
|
||||
|
|
@ -187,8 +187,8 @@ impl Rule {
|
|||
Rule::String(value.to_string())
|
||||
}
|
||||
|
||||
pub fn pattern(value: &'static str) -> Self {
|
||||
Rule::Pattern(value.to_string())
|
||||
pub fn pattern(value: &'static str, flags: &'static str) -> Self {
|
||||
Rule::Pattern(value.to_string(), flags.to_string())
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue