From 2f28a35e1b118e17ab2fb6236a24c7b557e3c8a9 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 18 Feb 2021 22:27:44 -0800 Subject: [PATCH] Handle unicode property escapes inside bracketed char classes Refs #906 --- cli/src/generate/prepare_grammar/expand_tokens.rs | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/cli/src/generate/prepare_grammar/expand_tokens.rs b/cli/src/generate/prepare_grammar/expand_tokens.rs index 0329bd03..23866551 100644 --- a/cli/src/generate/prepare_grammar/expand_tokens.rs +++ b/cli/src/generate/prepare_grammar/expand_tokens.rs @@ -368,6 +368,13 @@ impl NfaBuilder { Ok(result) } ClassSetItem::Perl(class) => Ok(self.expand_perl_character_class(&class.kind)), + ClassSetItem::Unicode(class) => { + let mut set = self.expand_unicode_character_class(&class.kind)?; + if class.negated { + set = set.negate(); + } + Ok(set) + } _ => Err(Error::regex(format!( "Unsupported character class syntax {:?}", item @@ -709,7 +716,7 @@ mod tests { Row { rules: vec![ Rule::pattern(r#"\p{L}+\P{L}+"#), - Rule::pattern(r#"\p{White_Space}+\P{White_Space}+\p{White_Space}*"#), + Rule::pattern(r#"\p{White_Space}+\P{White_Space}+[\p{White_Space}]*"#), ], separators: vec![], examples: vec![ @@ -717,6 +724,12 @@ mod tests { ("ბΨƁ___ƀƔ", Some((0, "ბΨƁ___"))), ], }, + // unicode property escapes in bracketed sets + Row { + rules: vec![Rule::pattern(r#"[\p{L}\p{Nd}]+"#)], + separators: vec![], + examples: vec![("abΨ12٣٣, ok", Some((0, "abΨ12٣٣")))], + }, // unicode character escapes Row { rules: vec![