Preprocess regexes to allow non-standard escape sequences

Also allow unescaped curly braces to match literal curly braces when they don't form a valid repetition operator.
2019-01-14 14:05:19 -08:00 · 2019-01-14 14:05:19 -08:00 · e2717a6ad1
commit e2717a6ad1
parent 2e009f7177
2 changed files with 65 additions and 5 deletions
--- a/cli/src/generate/dsl.js
+++ b/cli/src/generate/dsl.js
@ -1,5 +1,4 @@
 const UNICODE_ESCAPE_PATTERN = /\\u([0-9a-f]{4})/gi;
-const DELIMITER_ESCAPE_PATTERN = /\\\//g;

 function alias(rule, value) {
  const result = {
@ -150,10 +149,6 @@ function normalize(value) {
      return {
          type: 'PATTERN',
          value: value.source
-            .replace(
-              DELIMITER_ESCAPE_PATTERN,
-              '/'
-            )
            .replace(
              UNICODE_ESCAPE_PATTERN,
              (match, group) => String.fromCharCode(parseInt(group, 16))
--- a/cli/src/generate/prepare_grammar/expand_tokens.rs
+++ b/cli/src/generate/prepare_grammar/expand_tokens.rs
@ -6,8 +6,15 @@ use crate::generate::rules::Rule;
 use regex_syntax::ast::{
    parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, RepetitionKind, RepetitionRange,
 };
+use regex::Regex;
 use std::i32;

+lazy_static! {
+    static ref CURLY_BRACE_REGEX: Regex = Regex::new(r#"(^|[^\\])\{([^}]*[^0-9}][^}]*)\}"#).unwrap();
+}
+
+const ALLOWED_REDUNDANT_ESCAPED_CHARS: [char; 4] = ['!', '\'', '"', '/'];
+
 struct NfaBuilder {
    nfa: Nfa,
    is_sep: bool,
@ -35,6 +42,31 @@ fn get_completion_precedence(rule: &Rule) -> i32 {
    }
 }

+fn preprocess_regex(content: &str) -> String {
+    let content = CURLY_BRACE_REGEX.replace(content, "$1\\{$2\\}");
+    let mut result = String::with_capacity(content.len());
+    let mut is_escaped = false;
+    for c in content.chars() {
+        if is_escaped {
+            if ALLOWED_REDUNDANT_ESCAPED_CHARS.contains(&c) {
+                result.push(c);
+            } else {
+                result.push('\\');
+                result.push(c);
+            }
+            is_escaped = false;
+        } else if c == '\\' {
+            is_escaped = true;
+        } else {
+            result.push(c);
+        }
+    }
+    if is_escaped {
+        result.push('\\');
+    }
+    result
+}
+
 pub(crate) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result<LexicalGrammar> {
    let mut builder = NfaBuilder {
        nfa: Nfa::new(),
@ -90,6 +122,7 @@ impl NfaBuilder {
    fn expand_rule(&mut self, rule: &Rule, mut next_state_id: u32) -> Result<bool> {
        match rule {
            Rule::Pattern(s) => {
+                let s = preprocess_regex(s);
                let ast = parse::Parser::new()
                    .parse(&s)
                    .map_err(|e| Error(e.to_string()))?;
@ -586,6 +619,38 @@ mod tests {
                    ("12e34", Some((0, "12e34"))),
                ],
            },
+            // Allowing unrecognized escape sequences
+            Row {
+                rules: vec![
+                    // Escaped forward slash (used in JS because '/' is the regex delimiter)
+                    Rule::pattern(r#"\/"#),
+                    // Escaped quotes
+                    Rule::pattern(r#"\"\'"#),
+                    // Quote preceded by a literal backslash
+                    Rule::pattern(r#"[\\']+"#),
+                ],
+                separators: vec![],
+                examples: vec![
+                    ("/", Some((0, "/"))),
+                    ("\"\'", Some((1, "\"\'"))),
+                    (r#"'\'a"#, Some((2, r#"'\'"#))),
+                ],
+            },
+            // Allowing un-escaped curly braces
+            Row {
+                rules: vec![
+                    // Un-escaped curly braces
+                    Rule::pattern(r#"u{[0-9a-fA-F]+}"#),
+                    // Already-escaped curly braces
+                    Rule::pattern(r#"\{[ab]{3}\}"#),
+                ],
+                separators: vec![],
+                examples: vec![
+                    ("u{1234} ok", Some((0, "u{1234}"))),
+                    ("{aba}}", Some((1, "{aba}"))),
+                ],
+
+            }
        ];

        for Row {