Start work on handling unicode property escapes in regexes

2021-01-29 16:37:45 -08:00 · 2021-01-29 16:37:45 -08:00 · e3ba701344
commit e3ba701344
parent 38444ea7f9
7 changed files with 246 additions and 10 deletions
--- a/cli/src/error.rs
+++ b/cli/src/error.rs
@ -13,8 +13,9 @@ impl Error {
        Error(vec![format!("Grammar error: {}", message)])
    }

-    pub fn regex(message: &str) -> Self {
-        Error(vec![format!("Regex error: {}", message)])
+    pub fn regex(mut message: String) -> Self {
+        message.insert_str(0, "Regex error: ");
+        Error(vec![message])
    }

    pub fn undefined_symbol(name: &str) -> Self {
--- a/cli/src/generate/prepare_grammar/expand_tokens.rs
+++ b/cli/src/generate/prepare_grammar/expand_tokens.rs
@ -6,15 +6,23 @@ use crate::generate::rules::Rule;
 use lazy_static::lazy_static;
 use regex::Regex;
 use regex_syntax::ast::{
-    parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, RepetitionKind, RepetitionRange,
+    parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, ClassUnicodeKind, RepetitionKind,
+    RepetitionRange,
 };
+use std::collections::HashMap;
 use std::i32;

 lazy_static! {
    static ref CURLY_BRACE_REGEX: Regex =
-        Regex::new(r#"(^|[^\\])\{([^}]*[^0-9A-Fa-f,}][^}]*)\}"#).unwrap();
+        Regex::new(r#"(^|[^\\p])\{([^}]*[^0-9A-Fa-f,}][^}]*)\}"#).unwrap();
+    static ref UNICODE_CATEGORIES: HashMap<&'static str, Vec<u32>> =
+        serde_json::from_str(UNICODE_CATEGORIES_JSON).unwrap();
+    static ref UNICODE_PROPERTIES: HashMap<&'static str, Vec<u32>> =
+        serde_json::from_str(UNICODE_PROPERTIES_JSON).unwrap();
 }

+const UNICODE_CATEGORIES_JSON: &'static str = include_str!("./unicode-categories.json");
+const UNICODE_PROPERTIES_JSON: &'static str = include_str!("./unicode-properties.json");
 const ALLOWED_REDUNDANT_ESCAPED_CHARS: [char; 4] = ['!', '\'', '"', '/'];

 struct NfaBuilder {
@ -196,7 +204,7 @@ impl NfaBuilder {
    fn expand_regex(&mut self, ast: &Ast, mut next_state_id: u32) -> Result<bool> {
        match ast {
            Ast::Empty(_) => Ok(false),
-            Ast::Flags(_) => Err(Error::regex("Flags are not supported")),
+            Ast::Flags(_) => Err(Error::regex("Flags are not supported".to_string())),
            Ast::Literal(literal) => {
                self.push_advance(CharacterSet::from_char(literal.c), next_state_id);
                Ok(true)
@ -205,10 +213,15 @@ impl NfaBuilder {
                self.push_advance(CharacterSet::from_char('\n').negate(), next_state_id);
                Ok(true)
            }
-            Ast::Assertion(_) => Err(Error::regex("Assertions are not supported")),
+            Ast::Assertion(_) => Err(Error::regex("Assertions are not supported".to_string())),
            Ast::Class(class) => match class {
-                Class::Unicode(_) => {
-                    Err(Error::regex("Unicode character classes are not supported"))
+                Class::Unicode(class) => {
+                    let mut chars = self.expand_unicode_character_class(&class.kind)?;
+                    if class.negated {
+                        chars = chars.negate();
+                    }
+                    self.push_advance(chars, next_state_id);
+                    Ok(true)
                }
                Class::Perl(class) => {
                    let mut chars = self.expand_perl_character_class(&class.kind);
@ -228,7 +241,7 @@ impl NfaBuilder {
                        Ok(true)
                    }
                    ClassSet::BinaryOp(_) => Err(Error::regex(
-                        "Binary operators in character classes aren't supported",
+                        "Binary operators in character classes aren't supported".to_string(),
                    )),
                },
            },
@ -355,13 +368,63 @@ impl NfaBuilder {
                Ok(result)
            }
            ClassSetItem::Perl(class) => Ok(self.expand_perl_character_class(&class.kind)),
-            _ => Err(Error::regex(&format!(
+            _ => Err(Error::regex(format!(
                "Unsupported character class syntax {:?}",
                item
            ))),
        }
    }

+    fn expand_unicode_character_class(&self, class: &ClassUnicodeKind) -> Result<CharacterSet> {
+        let mut chars = CharacterSet::empty();
+
+        let category_letter;
+        match class {
+            ClassUnicodeKind::OneLetter(le) => {
+                category_letter = le.to_string();
+            }
+            ClassUnicodeKind::Named(class_name) => {
+                if class_name.len() == 1 {
+                    category_letter = class_name.clone();
+                } else {
+                    let code_points = UNICODE_CATEGORIES
+                        .get(class_name.as_str())
+                        .or_else(|| UNICODE_PROPERTIES.get(class_name.as_str()))
+                        .ok_or_else(|| {
+                            Error::regex(format!(
+                                "Unsupported unicode character class {}",
+                                class_name
+                            ))
+                        })?;
+                    for c in code_points {
+                        if let Some(c) = std::char::from_u32(*c) {
+                            chars = chars.add_char(c);
+                        }
+                    }
+
+                    return Ok(chars);
+                }
+            }
+            ClassUnicodeKind::NamedValue { .. } => {
+                return Err(Error::regex(
+                    "Key-value unicode properties are not supported".to_string(),
+                ))
+            }
+        }
+
+        for (category, code_points) in UNICODE_CATEGORIES.iter() {
+            if category.starts_with(&category_letter) {
+                for c in code_points {
+                    if let Some(c) = std::char::from_u32(*c) {
+                        chars = chars.add_char(c);
+                    }
+                }
+            }
+        }
+
+        Ok(chars)
+    }
+
    fn expand_perl_character_class(&self, item: &ClassPerlKind) -> CharacterSet {
        match item {
            ClassPerlKind::Digit => CharacterSet::from_range('0', '9'),
--- a/cli/src/generate/prepare_grammar/unicode-categories.json
+++ b/cli/src/generate/prepare_grammar/unicode-categories.json
--- a/cli/src/generate/prepare_grammar/unicode-properties.json
+++ b/cli/src/generate/prepare_grammar/unicode-properties.json