From 8c845f29e02763c94b2df0e22ff7536925dda66f Mon Sep 17 00:00:00 2001 From: Ervin Oro Date: Tue, 9 Apr 2019 20:37:36 +0300 Subject: [PATCH 1/2] Allow hex characters in unicode code points --- cli/src/generate/prepare_grammar/expand_tokens.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cli/src/generate/prepare_grammar/expand_tokens.rs b/cli/src/generate/prepare_grammar/expand_tokens.rs index 9e2cf9fe..67813030 100644 --- a/cli/src/generate/prepare_grammar/expand_tokens.rs +++ b/cli/src/generate/prepare_grammar/expand_tokens.rs @@ -12,7 +12,7 @@ use std::i32; lazy_static! { static ref CURLY_BRACE_REGEX: Regex = - Regex::new(r#"(^|[^\\])\{([^}]*[^0-9,}][^}]*)\}"#).unwrap(); + Regex::new(r#"(^|[^\\])\{([^}]*[^0-9A-F,}][^}]*)\}"#).unwrap(); } const ALLOWED_REDUNDANT_ESCAPED_CHARS: [char; 4] = ['!', '\'', '"', '/']; From e5584f82d3651d03de37164b0652a4e6390e682d Mon Sep 17 00:00:00 2001 From: Ervin Oro Date: Tue, 9 Apr 2019 21:55:49 +0300 Subject: [PATCH 2/2] Add test to verify regex unicode codepoints work --- cli/src/generate/prepare_grammar/expand_tokens.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cli/src/generate/prepare_grammar/expand_tokens.rs b/cli/src/generate/prepare_grammar/expand_tokens.rs index 67813030..2f69fbdd 100644 --- a/cli/src/generate/prepare_grammar/expand_tokens.rs +++ b/cli/src/generate/prepare_grammar/expand_tokens.rs @@ -429,7 +429,7 @@ mod tests { .find(|t| t.characters.contains(c) && t.precedence >= result_precedence) { cursor.reset(states); - end_char += 1; + end_char += c.len_utf8(); if is_separator { start_char = end_char; } @@ -651,11 +651,14 @@ mod tests { Rule::pattern(r#"u{[0-9a-fA-F]+}"#), // Already-escaped curly braces Rule::pattern(r#"\{[ab]{3}\}"#), + // Unicode codepoints + Rule::pattern(r#"\u{1000A}"#), ], separators: vec![], examples: vec![ ("u{1234} ok", Some((0, "u{1234}"))), ("{aba}}", Some((1, "{aba}"))), + ("\u{1000A}", Some((2, "\u{1000A}"))), ], }, ];