Preprocess regexes to allow non-standard escape sequences

Also allow unescaped curly braces to match literal curly braces when 
they don't form a valid repetition operator.
This commit is contained in:
Max Brunsfeld 2019-01-14 14:05:19 -08:00
parent 2e009f7177
commit e2717a6ad1
2 changed files with 65 additions and 5 deletions

View file

@ -1,5 +1,4 @@
const UNICODE_ESCAPE_PATTERN = /\\u([0-9a-f]{4})/gi;
const DELIMITER_ESCAPE_PATTERN = /\\\//g;
function alias(rule, value) {
const result = {
@ -150,10 +149,6 @@ function normalize(value) {
return {
type: 'PATTERN',
value: value.source
.replace(
DELIMITER_ESCAPE_PATTERN,
'/'
)
.replace(
UNICODE_ESCAPE_PATTERN,
(match, group) => String.fromCharCode(parseInt(group, 16))

View file

@ -6,8 +6,15 @@ use crate::generate::rules::Rule;
use regex_syntax::ast::{
parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, RepetitionKind, RepetitionRange,
};
use regex::Regex;
use std::i32;
lazy_static! {
static ref CURLY_BRACE_REGEX: Regex = Regex::new(r#"(^|[^\\])\{([^}]*[^0-9}][^}]*)\}"#).unwrap();
}
const ALLOWED_REDUNDANT_ESCAPED_CHARS: [char; 4] = ['!', '\'', '"', '/'];
struct NfaBuilder {
nfa: Nfa,
is_sep: bool,
@ -35,6 +42,31 @@ fn get_completion_precedence(rule: &Rule) -> i32 {
}
}
fn preprocess_regex(content: &str) -> String {
let content = CURLY_BRACE_REGEX.replace(content, "$1\\{$2\\}");
let mut result = String::with_capacity(content.len());
let mut is_escaped = false;
for c in content.chars() {
if is_escaped {
if ALLOWED_REDUNDANT_ESCAPED_CHARS.contains(&c) {
result.push(c);
} else {
result.push('\\');
result.push(c);
}
is_escaped = false;
} else if c == '\\' {
is_escaped = true;
} else {
result.push(c);
}
}
if is_escaped {
result.push('\\');
}
result
}
pub(crate) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result<LexicalGrammar> {
let mut builder = NfaBuilder {
nfa: Nfa::new(),
@ -90,6 +122,7 @@ impl NfaBuilder {
fn expand_rule(&mut self, rule: &Rule, mut next_state_id: u32) -> Result<bool> {
match rule {
Rule::Pattern(s) => {
let s = preprocess_regex(s);
let ast = parse::Parser::new()
.parse(&s)
.map_err(|e| Error(e.to_string()))?;
@ -586,6 +619,38 @@ mod tests {
("12e34", Some((0, "12e34"))),
],
},
// Allowing unrecognized escape sequences
Row {
rules: vec![
// Escaped forward slash (used in JS because '/' is the regex delimiter)
Rule::pattern(r#"\/"#),
// Escaped quotes
Rule::pattern(r#"\"\'"#),
// Quote preceded by a literal backslash
Rule::pattern(r#"[\\']+"#),
],
separators: vec![],
examples: vec![
("/", Some((0, "/"))),
("\"\'", Some((1, "\"\'"))),
(r#"'\'a"#, Some((2, r#"'\'"#))),
],
},
// Allowing un-escaped curly braces
Row {
rules: vec![
// Un-escaped curly braces
Rule::pattern(r#"u{[0-9a-fA-F]+}"#),
// Already-escaped curly braces
Rule::pattern(r#"\{[ab]{3}\}"#),
],
separators: vec![],
examples: vec![
("u{1234} ok", Some((0, "u{1234}"))),
("{aba}}", Some((1, "{aba}"))),
],
}
];
for Row {