refactor!: remove redundant escape regex & curly brace regex preprocessing
The regex-syntax crate now natively supports literal escapes for all ASCII characters except those in [0-9A-Za-z<>].
This commit is contained in:
parent
aa0f3c21b1
commit
78cc77e7b2
1 changed files with 2 additions and 32 deletions
|
|
@ -4,7 +4,6 @@ use crate::generate::nfa::{CharacterSet, Nfa, NfaState};
|
|||
use crate::generate::rules::{Precedence, Rule};
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use lazy_static::lazy_static;
|
||||
use regex::Regex;
|
||||
use regex_syntax::ast::{
|
||||
parse, Ast, ClassPerlKind, ClassSet, ClassSetBinaryOpKind, ClassSetItem, ClassUnicodeKind,
|
||||
RepetitionKind, RepetitionRange,
|
||||
|
|
@ -13,8 +12,6 @@ use std::collections::HashMap;
|
|||
use std::i32;
|
||||
|
||||
lazy_static! {
|
||||
static ref CURLY_BRACE_REGEX: Regex =
|
||||
Regex::new(r"(^|[^\\pP])\{([^}]*[^0-9A-Fa-f,}][^}]*)\}").unwrap();
|
||||
static ref UNICODE_CATEGORIES: HashMap<&'static str, Vec<u32>> =
|
||||
serde_json::from_str(UNICODE_CATEGORIES_JSON).unwrap();
|
||||
static ref UNICODE_PROPERTIES: HashMap<&'static str, Vec<u32>> =
|
||||
|
|
@ -29,7 +26,6 @@ const UNICODE_CATEGORIES_JSON: &str = include_str!("./unicode-categories.json");
|
|||
const UNICODE_PROPERTIES_JSON: &str = include_str!("./unicode-properties.json");
|
||||
const UNICODE_CATEGORY_ALIASES_JSON: &str = include_str!("./unicode-category-aliases.json");
|
||||
const UNICODE_PROPERTY_ALIASES_JSON: &str = include_str!("./unicode-property-aliases.json");
|
||||
const ALLOWED_REDUNDANT_ESCAPED_CHARS: [char; 4] = ['!', '\'', '"', '/'];
|
||||
|
||||
struct NfaBuilder {
|
||||
nfa: Nfa,
|
||||
|
|
@ -60,29 +56,6 @@ const fn get_completion_precedence(rule: &Rule) -> i32 {
|
|||
0
|
||||
}
|
||||
|
||||
fn preprocess_regex(content: &str) -> String {
|
||||
let content = CURLY_BRACE_REGEX.replace(content, "$1\\{$2\\}");
|
||||
let mut result = String::with_capacity(content.len());
|
||||
let mut is_escaped = false;
|
||||
for c in content.chars() {
|
||||
if is_escaped {
|
||||
if !ALLOWED_REDUNDANT_ESCAPED_CHARS.contains(&c) {
|
||||
result.push('\\');
|
||||
}
|
||||
result.push(c);
|
||||
is_escaped = false;
|
||||
} else if c == '\\' {
|
||||
is_escaped = true;
|
||||
} else {
|
||||
result.push(c);
|
||||
}
|
||||
}
|
||||
if is_escaped {
|
||||
result.push('\\');
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
pub fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result<LexicalGrammar> {
|
||||
let mut builder = NfaBuilder {
|
||||
nfa: Nfa::new(),
|
||||
|
|
@ -138,8 +111,7 @@ impl NfaBuilder {
|
|||
fn expand_rule(&mut self, rule: &Rule, mut next_state_id: u32) -> Result<bool> {
|
||||
match rule {
|
||||
Rule::Pattern(s, f) => {
|
||||
let s = preprocess_regex(s);
|
||||
let ast = parse::Parser::new().parse(&s)?;
|
||||
let ast = parse::Parser::new().parse(s)?;
|
||||
self.expand_regex(&ast, next_state_id, f.contains('i'))
|
||||
}
|
||||
Rule::String(s) => {
|
||||
|
|
@ -847,11 +819,9 @@ mod tests {
|
|||
("\u{00df}", Some((3, "\u{00df}"))),
|
||||
],
|
||||
},
|
||||
// allowing un-escaped curly braces
|
||||
Row {
|
||||
rules: vec![
|
||||
// Un-escaped curly braces
|
||||
Rule::pattern(r"u{[0-9a-fA-F]+}", ""),
|
||||
Rule::pattern(r"u\{[0-9a-fA-F]+\}", ""),
|
||||
// Already-escaped curly braces
|
||||
Rule::pattern(r"\{[ab]{3}\}", ""),
|
||||
// Unicode codepoints
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue