From ead6ca1738c52e8da4a2eb577d1c4c50b08593b4 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sat, 8 Dec 2018 13:44:11 -0800 Subject: [PATCH] Generate NFAs from regexes --- Cargo.lock | 1 + Cargo.toml | 1 + src/error.rs | 11 ++ src/main.rs | 1 + src/nfa.rs | 160 ++++++++++++++++++ src/prepare_grammar/normalize_rules.rs | 224 +++++++++++++++++++++++++ src/rules.rs | 2 +- 7 files changed, 399 insertions(+), 1 deletion(-) create mode 100644 src/nfa.rs diff --git a/Cargo.lock b/Cargo.lock index 20908681..d5109fb7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -466,6 +466,7 @@ dependencies = [ "dirs 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", "ignore 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)", "libloading 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", + "regex-syntax 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)", "rusqlite 0.14.0 (registry+https://github.com/rust-lang/crates.io-index)", "serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", "serde_derive 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", diff --git a/Cargo.toml b/Cargo.toml index 965cc81e..93a49d2c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,3 +15,4 @@ serde = "1.0" serde_derive = "1.0" serde_json = "1.0" tree-sitter = "0.3.1" +regex-syntax = "0.6.4" diff --git a/src/error.rs b/src/error.rs index 90e7b8f9..49064c22 100644 --- a/src/error.rs +++ b/src/error.rs @@ -2,10 +2,21 @@ pub enum Error { GrammarError(String), SymbolError(String), + RegexError(String), } pub type Result = std::result::Result; +impl Error { + pub fn grammar(message: &str) -> Self { + Error::GrammarError(message.to_string()) + } + + pub fn regex(message: &str) -> Self { + Error::RegexError(message.to_string()) + } +} + impl From for Error { fn from(error: serde_json::Error) -> Self { Error::GrammarError(error.to_string()) diff --git a/src/main.rs b/src/main.rs index 3eeb306a..4d376929 100644 --- a/src/main.rs +++ b/src/main.rs @@ -7,6 +7,7 @@ mod build_tables; mod error; mod generate; mod grammars; +mod nfa; mod parse_grammar; mod prepare_grammar; mod render; diff --git a/src/nfa.rs b/src/nfa.rs new file mode 100644 index 00000000..55aa11dc --- /dev/null +++ b/src/nfa.rs @@ -0,0 +1,160 @@ +use std::fmt; +use std::char; + +#[derive(Clone, Debug, PartialEq, Eq, Hash)] +pub enum CharacterSet { + Include(Vec), + Exclude(Vec), +} + +#[derive(Debug)] +pub enum NfaState { + Advance(CharacterSet, u32), + Split(u32, u32), + Accept, +} + +pub struct Nfa { + pub states: Vec +} + +#[derive(Debug)] +pub struct NfaCursor<'a> { + indices: Vec, + nfa: &'a Nfa, +} + +impl CharacterSet { + pub fn empty() -> Self { + CharacterSet::Include(Vec::new()) + } + + pub fn all() -> Self { + CharacterSet::Exclude(Vec::new()) + } + + pub fn negate(self) -> CharacterSet { + match self { + CharacterSet::Include(chars) => CharacterSet::Exclude(chars), + CharacterSet::Exclude(chars) => CharacterSet::Include(chars), + } + } + + pub fn add_char(self, c: char) -> Self { + if let CharacterSet::Include(mut chars) = self { + if let Err(i) = chars.binary_search(&c) { + chars.insert(i, c); + } + CharacterSet::Include(chars) + } else { + panic!("Called add with a negated character set"); + } + } + + pub fn add_range(self, start: char, end: char) -> Self { + if let CharacterSet::Include(mut chars) = self { + let mut c = start as u32; + while c <= end as u32 { + chars.push(char::from_u32(c).unwrap()); + c += 1; + } + chars.sort_unstable(); + chars.dedup(); + CharacterSet::Include(chars) + } else { + panic!("Called add with a negated character set"); + } + } + + pub fn add(self, other: CharacterSet) -> Self { + if let (CharacterSet::Include(mut chars), CharacterSet::Include(other_chars)) = (self, other) { + chars.extend(other_chars); + chars.sort_unstable(); + chars.dedup(); + CharacterSet::Include(chars) + } else { + panic!("Called add with a negated character set"); + } + } + + pub fn contains(&self, c: char) -> bool { + match self { + CharacterSet::Include(chars) => chars.contains(&c), + CharacterSet::Exclude(chars) => !chars.contains(&c), + } + } +} + +impl Nfa { + pub fn new() -> Self { + Nfa { states: vec![NfaState::Accept] } + } + + pub fn start_index(&self) -> u32 { + self.states.len() as u32 - 1 + } + + pub fn prepend(&mut self, f: impl Fn(u32) -> NfaState) { + self.states.push(f(self.start_index())); + } +} + +impl fmt::Debug for Nfa { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "Nfa {{ states: {{")?; + for (i, state) in self.states.iter().enumerate() { + if i > 0 { + write!(f, ", ")?; + } + write!(f, "{}: {:?}", i, state)?; + } + write!(f, "}} }}")?; + Ok(()) + } +} + +impl<'a> NfaCursor<'a> { + pub fn new(nfa: &'a Nfa) -> Self { + let mut result = Self { nfa, indices: Vec::new() }; + result.add_indices(&mut vec![nfa.start_index()]); + result + } + + pub fn advance(&mut self, c: char) -> bool { + let mut result = false; + let mut new_indices = Vec::new(); + for index in &self.indices { + if let NfaState::Advance(chars, next_index) = &self.nfa.states[*index as usize] { + if chars.contains(c) { + new_indices.push(*next_index); + result = true; + } + } + } + self.indices.clear(); + self.add_indices(&mut new_indices); + result + } + + pub fn is_done(&self) -> bool { + self.indices.iter().any(|index| { + if let NfaState::Accept = self.nfa.states[*index as usize] { + true + } else { + false + } + }) + } + + pub fn add_indices(&mut self, new_indices: &mut Vec) { + while let Some(index) = new_indices.pop() { + let state = &self.nfa.states[index as usize]; + if let NfaState::Split(left, right) = state { + new_indices.push(*left); + new_indices.push(*right); + } else if let Err(i) = self.indices.binary_search(&index) { + self.indices.insert(i, index); + } + } + } +} diff --git a/src/prepare_grammar/normalize_rules.rs b/src/prepare_grammar/normalize_rules.rs index 9e625ef5..67177b4f 100644 --- a/src/prepare_grammar/normalize_rules.rs +++ b/src/prepare_grammar/normalize_rules.rs @@ -1,5 +1,229 @@ +use crate::error::{Error, Result}; +use crate::rules::Rule; use crate::grammars::LexicalGrammar; +use crate::nfa::{Nfa, NfaState, NfaCursor, CharacterSet}; +use regex_syntax::ast::{parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, RepetitionKind}; + +fn evaluate_perl_class(item: &ClassPerlKind) -> CharacterSet { + match item { + ClassPerlKind::Digit => CharacterSet::empty() + .add_range('0', '9'), + ClassPerlKind::Space => CharacterSet::empty() + .add_char(' ') + .add_char('\t') + .add_char('\r') + .add_char('\n'), + ClassPerlKind::Word => CharacterSet::empty() + .add_char('_') + .add_range('A', 'Z') + .add_range('a', 'z') + .add_range('0', '9') + } +} + +fn evaluate_character_class(item: &ClassSetItem) -> Result { + match item { + ClassSetItem::Empty(_) => Ok(CharacterSet::Include(Vec::new())), + ClassSetItem::Literal(literal) => Ok(CharacterSet::Include(vec![literal.c])), + ClassSetItem::Range(range) => Ok(CharacterSet::empty().add_range(range.start.c, range.end.c)), + ClassSetItem::Union(union) => { + let mut result = CharacterSet::empty(); + for item in &union.items { + result = result.add(evaluate_character_class(&item)?); + } + Ok(result) + } + _ => Err(Error::regex("Unsupported character class syntax")), + } +} + +fn regex_to_nfa(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32) -> Result<()> { + match ast { + Ast::Empty(_) => Ok(()), + Ast::Flags(_) => Err(Error::regex("Flags are not supported")), + Ast::Literal(literal) => { + nfa.states.push(NfaState::Advance(CharacterSet::Include(vec![literal.c]), next_state_index)); + Ok(()) + }, + Ast::Dot(_) => { + nfa.states.push(NfaState::Advance(CharacterSet::Exclude(vec!['\n']), next_state_index)); + Ok(()) + }, + Ast::Assertion(_) => Err(Error::regex("Assertions are not supported")), + Ast::Class(class) => match class { + Class::Unicode(_) => Err(Error::regex("Unicode character classes are not supported")), + Class::Perl(class) => { + nfa.states.push(NfaState::Advance(evaluate_perl_class(&class.kind), next_state_index)); + Ok(()) + }, + Class::Bracketed(class) => match &class.kind { + ClassSet::Item(item) => { + let character_set = evaluate_character_class(&item)?; + nfa.states.push(NfaState::Advance(character_set, next_state_index)); + Ok(()) + }, + ClassSet::BinaryOp(_) => { + Err(Error::regex("Binary operators in character classes aren't supported")) + } + } + }, + Ast::Repetition(repetition) => match repetition.op.kind { + RepetitionKind::ZeroOrOne => { + regex_to_nfa(&repetition.ast, nfa, next_state_index)?; + nfa.prepend(|start_index| NfaState::Split(next_state_index, start_index)); + Ok(()) + }, + RepetitionKind::OneOrMore => { + nfa.states.push(NfaState::Accept); // Placeholder for split + let split_index = nfa.start_index(); + regex_to_nfa(&repetition.ast, nfa, split_index)?; + nfa.states[split_index as usize] = NfaState::Split( + nfa.start_index(), + next_state_index + ); + Ok(()) + }, + RepetitionKind::ZeroOrMore => { + nfa.states.push(NfaState::Accept); // Placeholder for split + let split_index = nfa.start_index(); + regex_to_nfa(&repetition.ast, nfa, split_index)?; + nfa.states[split_index as usize] = NfaState::Split( + nfa.start_index(), + next_state_index + ); + nfa.prepend(|start_index| NfaState::Split(start_index, next_state_index)); + Ok(()) + }, + RepetitionKind::Range(_) => unimplemented!(), + }, + Ast::Group(group) => regex_to_nfa(&group.ast, nfa, nfa.start_index()), + Ast::Alternation(alternation) => { + let mut alternative_start_indices = Vec::new(); + for ast in alternation.asts.iter() { + regex_to_nfa(&ast, nfa, next_state_index)?; + alternative_start_indices.push(nfa.start_index()); + } + alternative_start_indices.pop(); + for alternative_start_index in alternative_start_indices { + nfa.prepend(|start_index| NfaState::Split(start_index, alternative_start_index)); + } + Ok(()) + }, + Ast::Concat(concat) => { + for ast in concat.asts.iter().rev() { + regex_to_nfa(&ast, nfa, next_state_index)?; + next_state_index = nfa.start_index(); + } + Ok(()) + } + } +} + +fn expand_rule(rule: Rule) -> Result { + match rule { + Rule::Pattern(s) => { + let ast = parse::Parser::new().parse(&s).map_err(|e| Error::GrammarError(e.to_string()))?; + let mut nfa = Nfa::new(); + regex_to_nfa(&ast, &mut nfa, 0)?; + Ok(nfa) + }, + Rule::String(s) => { + let mut nfa = Nfa::new(); + for c in s.chars().rev() { + nfa.prepend(|start_index| NfaState::Advance(CharacterSet::empty().add_char(c), start_index)); + } + Ok(nfa) + }, + _ => Err(Error::grammar("Unexpected rule type")), + } +} pub(super) fn normalize_rules(grammar: LexicalGrammar) -> LexicalGrammar { unimplemented!(); } + +#[cfg(test)] +mod tests { + use super::*; + + fn simulate_nfa<'a>(nfa: &'a Nfa, s: &'a str) -> Option<&'a str> { + let mut result = None; + let mut char_count = 0; + let mut cursor = NfaCursor::new(nfa); + for c in s.chars() { + if cursor.is_done() { + result = Some(&s[0..char_count]); + } + if cursor.advance(c) { + char_count += 1; + } else { + break; + } + } + result + } + + #[test] + fn test_regex_expansion() { + struct Row { + pattern: &'static str, + examples: Vec<(&'static str, Option<&'static str>)>, + } + + let table = [ + Row { + pattern: "a|bc", + examples: vec![ + ("a12", Some("a")), + ("bc12", Some("bc")), + ("b12", None), + ("c12", None), + ], + }, + Row { + pattern: "(a|b|c)d(e|f|g)h?", + examples: vec![ + ("ade1", Some("ade")), + ("bdf1", Some("bdf")), + ("bdfh1", Some("bdfh")), + ("ad1", None), + ], + }, + Row { + pattern: "a*", + examples: vec![ + ("aaa1", Some("aaa")), + ("b", Some("")), + ], + }, + Row { + pattern: "a((bc)+|(de)*)f", + examples: vec![ + ("af1", Some("af")), + ("adedef1", Some("adedef")), + ("abcbcbcf1", Some("abcbcbcf")), + ("a", None), + ], + }, + Row { + pattern: "[a-fA-F0-9]+", + examples: vec![ + ("A1ff0", Some("A1ff")), + ], + }, + Row { + pattern: "\\w\\d\\s", + examples: vec![ + ("_0 ", Some("_0 ")), + ], + }, + ]; + + for Row { pattern, examples } in table.iter() { + let nfa = expand_rule(Rule::pattern(pattern)).unwrap(); + for (haystack, needle) in examples.iter() { + assert_eq!(simulate_nfa(&nfa, haystack), *needle); + } + } + } +} diff --git a/src/rules.rs b/src/rules.rs index 5c3b65fd..b593496a 100644 --- a/src/rules.rs +++ b/src/rules.rs @@ -1,4 +1,5 @@ use std::rc::Rc; +use std::char; use std::collections::HashMap; #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] @@ -44,7 +45,6 @@ pub(crate) struct Symbol { #[derive(Clone, Debug, PartialEq, Eq, Hash)] pub(crate) enum Rule { Blank, - CharacterSet(Vec), String(String), Pattern(String), NamedSymbol(String),