From 5b630054c6999c134b3d2b2152b09424928efac4 Mon Sep 17 00:00:00 2001
From: Max Brunsfeld
Date: Wed, 17 Feb 2021 17:22:33 -0800
Subject: [PATCH] Handle negated unicode property escapes in regexes

Refs #380
---
 cli/src/generate/prepare_grammar/expand_tokens.rs | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/cli/src/generate/prepare_grammar/expand_tokens.rs b/cli/src/generate/prepare_grammar/expand_tokens.rs
index b948ddfc..5580eb72 100644
--- a/cli/src/generate/prepare_grammar/expand_tokens.rs
+++ b/cli/src/generate/prepare_grammar/expand_tokens.rs
@@ -14,7 +14,7 @@ use std::i32;
 
 lazy_static! {
     static ref CURLY_BRACE_REGEX: Regex =
-        Regex::new(r#"(^|[^\\p])\{([^}]*[^0-9A-Fa-f,}][^}]*)\}"#).unwrap();
+        Regex::new(r#"(^|[^\\pP])\{([^}]*[^0-9A-Fa-f,}][^}]*)\}"#).unwrap();
     static ref UNICODE_CATEGORIES: HashMap<&'static str, Vec<u32>> =
         serde_json::from_str(UNICODE_CATEGORIES_JSON).unwrap();
     static ref UNICODE_PROPERTIES: HashMap<&'static str, Vec<u32>> =
@@ -705,6 +705,18 @@ mod tests {
                     (r#"'\'a"#, Some((2, r#"'\'"#))),
                 ],
             },
+            // unicode property escapes
+            Row {
+                rules: vec![
+                    Rule::pattern(r#"\p{L}+\P{L}+"#),
+                    Rule::pattern(r#"\p{White_Space}+\P{White_Space}+\p{White_Space}*"#),
+                ],
+                separators: vec![],
+                examples: vec![
+                    (" 123 abc", Some((1, " 123 "))),
+                    ("ბΨƁ___ƀƔ", Some((0, "ბΨƁ___"))),
+                ],
+            },
             // allowing un-escaped curly braces
             Row {
                 rules: vec![