Merge pull request #2474 from amaanq/case-insensitive

feat!: support the case-insensitive regex flag
This commit is contained in:
Amaan Qureshi 2023-08-11 23:12:15 -04:00 committed by GitHub
commit dee98e06e9
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 166 additions and 85 deletions

View file

@ -390,12 +390,12 @@ mod tests {
Variable {
name: "token_0".to_string(),
kind: VariableType::Named,
rule: Rule::pattern("[a-f]1|0x\\d"),
rule: Rule::pattern("[a-f]1|0x\\d", ""),
},
Variable {
name: "token_1".to_string(),
kind: VariableType::Named,
rule: Rule::pattern("d*ef"),
rule: Rule::pattern("d*ef", ""),
},
],
})
@ -426,7 +426,7 @@ mod tests {
Variable {
name: "identifier".to_string(),
kind: VariableType::Named,
rule: Rule::pattern("\\w+"),
rule: Rule::pattern("\\w+", ""),
},
Variable {
name: "instanceof".to_string(),
@ -471,7 +471,7 @@ mod tests {
#[test]
fn test_token_conflicts_with_separators() {
let grammar = expand_tokens(ExtractedLexicalGrammar {
separators: vec![Rule::pattern("\\s")],
separators: vec![Rule::pattern("\\s", "")],
variables: vec![
Variable {
name: "x".to_string(),
@ -498,7 +498,7 @@ mod tests {
#[test]
fn test_token_conflicts_with_open_ended_tokens() {
let grammar = expand_tokens(ExtractedLexicalGrammar {
separators: vec![Rule::pattern("\\s")],
separators: vec![Rule::pattern("\\s", "")],
variables: vec![
Variable {
name: "x".to_string(),
@ -508,7 +508,7 @@ mod tests {
Variable {
name: "anything".to_string(),
kind: VariableType::Named,
rule: Rule::prec(Precedence::Integer(-1), Rule::pattern(".*")),
rule: Rule::prec(Precedence::Integer(-1), Rule::pattern(".*", "")),
},
],
})

View file

@ -183,7 +183,8 @@ function normalize(value) {
case RegExp:
return {
type: 'PATTERN',
value: value.source
value: value.source,
flags: value.flags
};
case ReferenceError:
throw value

View file

@ -1172,12 +1172,12 @@ mod tests {
Variable {
name: "identifier".to_string(),
kind: VariableType::Named,
rule: Rule::pattern("\\w+"),
rule: Rule::pattern("\\w+", ""),
},
Variable {
name: "foo_identifier".to_string(),
kind: VariableType::Named,
rule: Rule::pattern("[\\w-]+"),
rule: Rule::pattern("[\\w-]+", ""),
},
],
..Default::default()
@ -1275,8 +1275,8 @@ mod tests {
name: "script".to_string(),
kind: VariableType::Named,
rule: Rule::seq(vec![
Rule::field("a".to_string(), Rule::pattern("hi")),
Rule::field("b".to_string(), Rule::pattern("bye")),
Rule::field("a".to_string(), Rule::pattern("hi", "")),
Rule::field("b".to_string(), Rule::pattern("bye", "")),
]),
}],
..Default::default()

View file

@ -19,6 +19,7 @@ enum RuleJSON {
},
PATTERN {
value: String,
flags: Option<String>,
},
SYMBOL {
name: String,
@ -143,7 +144,21 @@ fn parse_rule(json: RuleJSON) -> Rule {
} => Rule::alias(parse_rule(*content), value, named),
RuleJSON::BLANK => Rule::Blank,
RuleJSON::STRING { value } => Rule::String(value),
RuleJSON::PATTERN { value } => Rule::Pattern(value),
RuleJSON::PATTERN { value, flags } => Rule::Pattern(
value,
flags.map_or(String::new(), |f| {
f.chars()
.filter(|c| {
if *c != 'i' {
eprintln!("Warning: unsupported flag {}", c);
false
} else {
true
}
})
.collect()
}),
),
RuleJSON::SYMBOL { name } => Rule::NamedSymbol(name),
RuleJSON::CHOICE { members } => Rule::choice(members.into_iter().map(parse_rule).collect()),
RuleJSON::FIELD { content, name } => Rule::field(name, parse_rule(*content)),

View file

@ -139,10 +139,10 @@ pub(crate) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result<Lexi
impl NfaBuilder {
fn expand_rule(&mut self, rule: &Rule, mut next_state_id: u32) -> Result<bool> {
match rule {
Rule::Pattern(s) => {
Rule::Pattern(s, f) => {
let s = preprocess_regex(s);
let ast = parse::Parser::new().parse(&s)?;
self.expand_regex(&ast, next_state_id)
self.expand_regex(&ast, next_state_id, f.contains('i'))
}
Rule::String(s) => {
for c in s.chars().rev() {
@ -210,12 +210,42 @@ impl NfaBuilder {
}
}
fn expand_regex(&mut self, ast: &Ast, mut next_state_id: u32) -> Result<bool> {
fn expand_regex(
&mut self,
ast: &Ast,
mut next_state_id: u32,
case_insensitive: bool,
) -> Result<bool> {
fn inverse_char(c: char) -> char {
match c {
'a'..='z' => (c as u8 - b'a' + b'A') as char,
'A'..='Z' => (c as u8 - b'A' + b'a') as char,
c => c,
}
}
fn with_inverse_char(mut chars: CharacterSet) -> CharacterSet {
for char in chars.clone().chars() {
let inverted = inverse_char(char);
if char != inverted {
chars = chars.add_char(inverted);
}
}
chars
}
match ast {
Ast::Empty(_) => Ok(false),
Ast::Flags(_) => Err(anyhow!("Regex error: Flags are not supported")),
Ast::Literal(literal) => {
self.push_advance(CharacterSet::from_char(literal.c), next_state_id);
let mut char_set = CharacterSet::from_char(literal.c);
if case_insensitive {
let inverted = inverse_char(literal.c);
if literal.c != inverted {
char_set = char_set.add_char(inverted);
}
}
self.push_advance(char_set, next_state_id);
Ok(true)
}
Ast::Dot(_) => {
@ -229,6 +259,9 @@ impl NfaBuilder {
if class.negated {
chars = chars.negate();
}
if case_insensitive {
chars = with_inverse_char(chars);
}
self.push_advance(chars, next_state_id);
Ok(true)
}
@ -237,6 +270,9 @@ impl NfaBuilder {
if class.negated {
chars = chars.negate();
}
if case_insensitive {
chars = with_inverse_char(chars);
}
self.push_advance(chars, next_state_id);
Ok(true)
}
@ -245,48 +281,56 @@ impl NfaBuilder {
if class.negated {
chars = chars.negate();
}
if case_insensitive {
chars = with_inverse_char(chars);
}
self.push_advance(chars, next_state_id);
Ok(true)
}
},
Ast::Repetition(repetition) => match repetition.op.kind {
RepetitionKind::ZeroOrOne => {
self.expand_zero_or_one(&repetition.ast, next_state_id)
self.expand_zero_or_one(&repetition.ast, next_state_id, case_insensitive)
}
RepetitionKind::OneOrMore => {
self.expand_one_or_more(&repetition.ast, next_state_id)
self.expand_one_or_more(&repetition.ast, next_state_id, case_insensitive)
}
RepetitionKind::ZeroOrMore => {
self.expand_zero_or_more(&repetition.ast, next_state_id)
self.expand_zero_or_more(&repetition.ast, next_state_id, case_insensitive)
}
RepetitionKind::Range(RepetitionRange::Exactly(count)) => {
self.expand_count(&repetition.ast, count, next_state_id)
self.expand_count(&repetition.ast, count, next_state_id, case_insensitive)
}
RepetitionKind::Range(RepetitionRange::AtLeast(min)) => {
if self.expand_zero_or_more(&repetition.ast, next_state_id)? {
self.expand_count(&repetition.ast, min, next_state_id)
if self.expand_zero_or_more(&repetition.ast, next_state_id, case_insensitive)? {
self.expand_count(&repetition.ast, min, next_state_id, case_insensitive)
} else {
Ok(false)
}
}
RepetitionKind::Range(RepetitionRange::Bounded(min, max)) => {
let mut result = self.expand_count(&repetition.ast, min, next_state_id)?;
let mut result =
self.expand_count(&repetition.ast, min, next_state_id, case_insensitive)?;
for _ in min..max {
if result {
next_state_id = self.nfa.last_state_id();
}
if self.expand_zero_or_one(&repetition.ast, next_state_id)? {
if self.expand_zero_or_one(
&repetition.ast,
next_state_id,
case_insensitive,
)? {
result = true;
}
}
Ok(result)
}
},
Ast::Group(group) => self.expand_regex(&group.ast, next_state_id),
Ast::Group(group) => self.expand_regex(&group.ast, next_state_id, case_insensitive),
Ast::Alternation(alternation) => {
let mut alternative_state_ids = Vec::new();
for ast in alternation.asts.iter() {
if self.expand_regex(&ast, next_state_id)? {
if self.expand_regex(&ast, next_state_id, case_insensitive)? {
alternative_state_ids.push(self.nfa.last_state_id());
} else {
alternative_state_ids.push(next_state_id);
@ -304,7 +348,7 @@ impl NfaBuilder {
Ast::Concat(concat) => {
let mut result = false;
for ast in concat.asts.iter().rev() {
if self.expand_regex(&ast, next_state_id)? {
if self.expand_regex(&ast, next_state_id, case_insensitive)? {
result = true;
next_state_id = self.nfa.last_state_id();
}
@ -335,13 +379,18 @@ impl NfaBuilder {
}
}
fn expand_one_or_more(&mut self, ast: &Ast, next_state_id: u32) -> Result<bool> {
fn expand_one_or_more(
&mut self,
ast: &Ast,
next_state_id: u32,
case_insensitive: bool,
) -> Result<bool> {
self.nfa.states.push(NfaState::Accept {
variable_index: 0,
precedence: 0,
}); // Placeholder for split
let split_state_id = self.nfa.last_state_id();
if self.expand_regex(&ast, split_state_id)? {
if self.expand_regex(&ast, split_state_id, case_insensitive)? {
self.nfa.states[split_state_id as usize] =
NfaState::Split(self.nfa.last_state_id(), next_state_id);
Ok(true)
@ -351,8 +400,13 @@ impl NfaBuilder {
}
}
fn expand_zero_or_one(&mut self, ast: &Ast, next_state_id: u32) -> Result<bool> {
if self.expand_regex(ast, next_state_id)? {
fn expand_zero_or_one(
&mut self,
ast: &Ast,
next_state_id: u32,
case_insensitive: bool,
) -> Result<bool> {
if self.expand_regex(ast, next_state_id, case_insensitive)? {
self.push_split(next_state_id);
Ok(true)
} else {
@ -360,8 +414,13 @@ impl NfaBuilder {
}
}
fn expand_zero_or_more(&mut self, ast: &Ast, next_state_id: u32) -> Result<bool> {
if self.expand_one_or_more(&ast, next_state_id)? {
fn expand_zero_or_more(
&mut self,
ast: &Ast,
next_state_id: u32,
case_insensitive: bool,
) -> Result<bool> {
if self.expand_one_or_more(&ast, next_state_id, case_insensitive)? {
self.push_split(next_state_id);
Ok(true)
} else {
@ -369,10 +428,16 @@ impl NfaBuilder {
}
}
fn expand_count(&mut self, ast: &Ast, count: u32, mut next_state_id: u32) -> Result<bool> {
fn expand_count(
&mut self,
ast: &Ast,
count: u32,
mut next_state_id: u32,
case_insensitive: bool,
) -> Result<bool> {
let mut result = false;
for _ in 0..count {
if self.expand_regex(ast, next_state_id)? {
if self.expand_regex(ast, next_state_id, case_insensitive)? {
result = true;
next_state_id = self.nfa.last_state_id();
}
@ -565,7 +630,7 @@ mod tests {
let table = [
// regex with sequences and alternatives
Row {
rules: vec![Rule::pattern("(a|b|c)d(e|f|g)h?")],
rules: vec![Rule::pattern("(a|b|c)d(e|f|g)h?", "")],
separators: vec![],
examples: vec![
("ade1", Some((0, "ade"))),
@ -576,13 +641,13 @@ mod tests {
},
// regex with repeats
Row {
rules: vec![Rule::pattern("a*")],
rules: vec![Rule::pattern("a*", "")],
separators: vec![],
examples: vec![("aaa1", Some((0, "aaa"))), ("b", Some((0, "")))],
},
// regex with repeats in sequences
Row {
rules: vec![Rule::pattern("a((bc)+|(de)*)f")],
rules: vec![Rule::pattern("a((bc)+|(de)*)f", "")],
separators: vec![],
examples: vec![
("af1", Some((0, "af"))),
@ -593,13 +658,13 @@ mod tests {
},
// regex with character ranges
Row {
rules: vec![Rule::pattern("[a-fA-F0-9]+")],
rules: vec![Rule::pattern("[a-fA-F0-9]+", "")],
separators: vec![],
examples: vec![("A1ff0.", Some((0, "A1ff0")))],
},
// regex with perl character classes
Row {
rules: vec![Rule::pattern("\\w\\d\\s")],
rules: vec![Rule::pattern("\\w\\d\\s", "")],
separators: vec![],
examples: vec![("_0 ", Some((0, "_0 ")))],
},
@ -613,7 +678,7 @@ mod tests {
Row {
rules: vec![Rule::repeat(Rule::seq(vec![
Rule::string("{"),
Rule::pattern("[a-f]+"),
Rule::pattern("[a-f]+", ""),
Rule::string("}"),
]))],
separators: vec![],
@ -626,9 +691,9 @@ mod tests {
// longest match rule
Row {
rules: vec![
Rule::pattern("a|bc"),
Rule::pattern("aa"),
Rule::pattern("bcd"),
Rule::pattern("a|bc", ""),
Rule::pattern("aa", ""),
Rule::pattern("bcd", ""),
],
separators: vec![],
examples: vec![
@ -642,7 +707,7 @@ mod tests {
},
// regex with an alternative including the empty string
Row {
rules: vec![Rule::pattern("a(b|)+c")],
rules: vec![Rule::pattern("a(b|)+c", "")],
separators: vec![],
examples: vec![
("ac.", Some((0, "ac"))),
@ -652,8 +717,8 @@ mod tests {
},
// separators
Row {
rules: vec![Rule::pattern("[a-f]+")],
separators: vec![Rule::string("\\\n"), Rule::pattern("\\s")],
rules: vec![Rule::pattern("[a-f]+", "")],
separators: vec![Rule::string("\\\n"), Rule::pattern("\\s", "")],
examples: vec![
(" a", Some((0, "a"))),
(" \nb", Some((0, "b"))),
@ -664,11 +729,11 @@ mod tests {
// shorter tokens with higher precedence
Row {
rules: vec![
Rule::prec(Precedence::Integer(2), Rule::pattern("abc")),
Rule::prec(Precedence::Integer(1), Rule::pattern("ab[cd]e")),
Rule::pattern("[a-e]+"),
Rule::prec(Precedence::Integer(2), Rule::pattern("abc", "")),
Rule::prec(Precedence::Integer(1), Rule::pattern("ab[cd]e", "")),
Rule::pattern("[a-e]+", ""),
],
separators: vec![Rule::string("\\\n"), Rule::pattern("\\s")],
separators: vec![Rule::string("\\\n"), Rule::pattern("\\s", "")],
examples: vec![
("abceef", Some((0, "abc"))),
("abdeef", Some((1, "abde"))),
@ -678,13 +743,13 @@ mod tests {
// immediate tokens with higher precedence
Row {
rules: vec![
Rule::prec(Precedence::Integer(1), Rule::pattern("[^a]+")),
Rule::prec(Precedence::Integer(1), Rule::pattern("[^a]+", "")),
Rule::immediate_token(Rule::prec(
Precedence::Integer(2),
Rule::pattern("[^ab]+"),
Rule::pattern("[^ab]+", ""),
)),
],
separators: vec![Rule::pattern("\\s")],
separators: vec![Rule::pattern("\\s", "")],
examples: vec![("cccb", Some((1, "ccc")))],
},
Row {
@ -706,7 +771,7 @@ mod tests {
// nested choices within sequences
Row {
rules: vec![Rule::seq(vec![
Rule::pattern("[0-9]+"),
Rule::pattern("[0-9]+", ""),
Rule::choice(vec![
Rule::Blank,
Rule::choice(vec![Rule::seq(vec![
@ -715,7 +780,7 @@ mod tests {
Rule::Blank,
Rule::choice(vec![Rule::string("+"), Rule::string("-")]),
]),
Rule::pattern("[0-9]+"),
Rule::pattern("[0-9]+", ""),
])]),
]),
])],
@ -732,7 +797,7 @@ mod tests {
},
// nested groups
Row {
rules: vec![Rule::seq(vec![Rule::pattern(r#"([^x\\]|\\(.|\n))+"#)])],
rules: vec![Rule::seq(vec![Rule::pattern(r#"([^x\\]|\\(.|\n))+"#, "")])],
separators: vec![],
examples: vec![("abcx", Some((0, "abc"))), ("abc\\0x", Some((0, "abc\\0")))],
},
@ -740,11 +805,11 @@ mod tests {
Row {
rules: vec![
// Escaped forward slash (used in JS because '/' is the regex delimiter)
Rule::pattern(r#"\/"#),
Rule::pattern(r#"\/"#, ""),
// Escaped quotes
Rule::pattern(r#"\"\'"#),
Rule::pattern(r#"\"\'"#, ""),
// Quote preceded by a literal backslash
Rule::pattern(r#"[\\']+"#),
Rule::pattern(r#"[\\']+"#, ""),
],
separators: vec![],
examples: vec![
@ -756,8 +821,8 @@ mod tests {
// unicode property escapes
Row {
rules: vec![
Rule::pattern(r#"\p{L}+\P{L}+"#),
Rule::pattern(r#"\p{White_Space}+\P{White_Space}+[\p{White_Space}]*"#),
Rule::pattern(r#"\p{L}+\P{L}+"#, ""),
Rule::pattern(r#"\p{White_Space}+\P{White_Space}+[\p{White_Space}]*"#, ""),
],
separators: vec![],
examples: vec![
@ -767,17 +832,17 @@ mod tests {
},
// unicode property escapes in bracketed sets
Row {
rules: vec![Rule::pattern(r#"[\p{L}\p{Nd}]+"#)],
rules: vec![Rule::pattern(r#"[\p{L}\p{Nd}]+"#, "")],
separators: vec![],
examples: vec![("abΨ12٣٣, ok", Some((0, "abΨ12٣٣")))],
},
// unicode character escapes
Row {
rules: vec![
Rule::pattern(r#"\u{00dc}"#),
Rule::pattern(r#"\U{000000dd}"#),
Rule::pattern(r#"\u00de"#),
Rule::pattern(r#"\U000000df"#),
Rule::pattern(r#"\u{00dc}"#, ""),
Rule::pattern(r#"\U{000000dd}"#, ""),
Rule::pattern(r#"\u00de"#, ""),
Rule::pattern(r#"\U000000df"#, ""),
],
separators: vec![],
examples: vec![
@ -791,13 +856,13 @@ mod tests {
Row {
rules: vec![
// Un-escaped curly braces
Rule::pattern(r#"u{[0-9a-fA-F]+}"#),
Rule::pattern(r#"u{[0-9a-fA-F]+}"#, ""),
// Already-escaped curly braces
Rule::pattern(r#"\{[ab]{3}\}"#),
Rule::pattern(r#"\{[ab]{3}\}"#, ""),
// Unicode codepoints
Rule::pattern(r#"\u{1000A}"#),
Rule::pattern(r#"\u{1000A}"#, ""),
// Unicode codepoints (lowercase)
Rule::pattern(r#"\u{1000b}"#),
Rule::pattern(r#"\u{1000b}"#, ""),
],
separators: vec![],
examples: vec![
@ -809,7 +874,7 @@ mod tests {
},
// Emojis
Row {
rules: vec![Rule::pattern(r"\p{Emoji}+")],
rules: vec![Rule::pattern(r"\p{Emoji}+", "")],
separators: vec![],
examples: vec![
("🐎", Some((0, "🐎"))),
@ -822,7 +887,7 @@ mod tests {
},
// Intersection
Row {
rules: vec![Rule::pattern(r"[[0-7]&&[4-9]]+")],
rules: vec![Rule::pattern(r"[[0-7]&&[4-9]]+", "")],
separators: vec![],
examples: vec![
("456", Some((0, "456"))),
@ -835,7 +900,7 @@ mod tests {
},
// Difference
Row {
rules: vec![Rule::pattern(r"[[0-9]--[4-7]]+")],
rules: vec![Rule::pattern(r"[[0-9]--[4-7]]+", "")],
separators: vec![],
examples: vec![
("123", Some((0, "123"))),
@ -848,7 +913,7 @@ mod tests {
},
// Symmetric difference
Row {
rules: vec![Rule::pattern(r"[[0-7]~~[4-9]]+")],
rules: vec![Rule::pattern(r"[[0-7]~~[4-9]]+", "")],
separators: vec![],
examples: vec![
("123", Some((0, "123"))),
@ -869,7 +934,7 @@ mod tests {
// [6-7]: y y
// [3-9]--[5-7]: y y y y y
// final regex: y y y y y y
rules: vec![Rule::pattern(r"[[[0-5]--[2-4]]~~[[3-9]--[6-7]]]+")],
rules: vec![Rule::pattern(r"[[[0-5]--[2-4]]~~[[3-9]--[6-7]]]+", "")],
separators: vec![],
examples: vec![
("01", Some((0, "01"))),

View file

@ -320,7 +320,7 @@ mod test {
"rule_0",
Rule::repeat(Rule::seq(vec![
Rule::string("a"),
Rule::pattern("b"),
Rule::pattern("b", ""),
Rule::choice(vec![
Rule::non_terminal(1),
Rule::non_terminal(2),
@ -331,8 +331,8 @@ mod test {
]),
])),
),
Variable::named("rule_1", Rule::pattern("e")),
Variable::named("rule_2", Rule::pattern("b")),
Variable::named("rule_1", Rule::pattern("e", "")),
Variable::named("rule_2", Rule::pattern("b", "")),
Variable::named(
"rule_3",
Rule::seq(vec![Rule::non_terminal(2), Rule::Blank]),
@ -378,12 +378,12 @@ mod test {
lexical_grammar.variables,
vec![
Variable::anonymous("a", Rule::string("a")),
Variable::auxiliary("rule_0_token1", Rule::pattern("b")),
Variable::auxiliary("rule_0_token1", Rule::pattern("b", "")),
Variable::auxiliary(
"rule_0_token2",
Rule::repeat(Rule::choice(vec![Rule::string("c"), Rule::string("d"),]))
),
Variable::named("rule_1", Rule::pattern("e")),
Variable::named("rule_1", Rule::pattern("e", "")),
]
);
}
@ -411,7 +411,7 @@ mod test {
fn test_extracting_extra_symbols() {
let mut grammar = build_grammar(vec![
Variable::named("rule_0", Rule::string("x")),
Variable::named("comment", Rule::pattern("//.*")),
Variable::named("comment", Rule::pattern("//.*", "")),
]);
grammar.extra_symbols = vec![Rule::string(" "), Rule::non_terminal(1)];

View file

@ -56,7 +56,7 @@ pub(crate) struct Symbol {
pub(crate) enum Rule {
Blank,
String(String),
Pattern(String),
Pattern(String, String),
NamedSymbol(String),
Symbol(Symbol),
Choice(Vec<Rule>),
@ -187,8 +187,8 @@ impl Rule {
Rule::String(value.to_string())
}
pub fn pattern(value: &'static str) -> Self {
Rule::Pattern(value.to_string())
pub fn pattern(value: &'static str, flags: &'static str) -> Self {
Rule::Pattern(value.to_string(), flags.to_string())
}
}