Generate NFAs from regexes
This commit is contained in:
parent
0688a5edd3
commit
ead6ca1738
7 changed files with 399 additions and 1 deletions
1
Cargo.lock
generated
1
Cargo.lock
generated
|
|
@ -466,6 +466,7 @@ dependencies = [
|
|||
"dirs 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"ignore 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"libloading 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"regex-syntax 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"rusqlite 0.14.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"serde_derive 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
|
|
|
|||
|
|
@ -15,3 +15,4 @@ serde = "1.0"
|
|||
serde_derive = "1.0"
|
||||
serde_json = "1.0"
|
||||
tree-sitter = "0.3.1"
|
||||
regex-syntax = "0.6.4"
|
||||
|
|
|
|||
11
src/error.rs
11
src/error.rs
|
|
@ -2,10 +2,21 @@
|
|||
/// Errors produced by this crate. Every variant carries a human-readable
/// message.
pub enum Error {
    /// A problem with the grammar being processed.
    GrammarError(String),
    /// A problem relating to a symbol (not constructed in the visible code).
    SymbolError(String),
    /// A regular expression that cannot be handled, e.g. unsupported syntax.
    RegexError(String),
}

/// Shorthand for a `std::result::Result` whose error type is [`Error`].
pub type Result<T> = std::result::Result<T, Error>;
|
||||
|
||||
impl Error {
|
||||
pub fn grammar(message: &str) -> Self {
|
||||
Error::GrammarError(message.to_string())
|
||||
}
|
||||
|
||||
pub fn regex(message: &str) -> Self {
|
||||
Error::RegexError(message.to_string())
|
||||
}
|
||||
}
|
||||
|
||||
impl From<serde_json::Error> for Error {
|
||||
fn from(error: serde_json::Error) -> Self {
|
||||
Error::GrammarError(error.to_string())
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@ mod build_tables;
|
|||
mod error;
|
||||
mod generate;
|
||||
mod grammars;
|
||||
mod nfa;
|
||||
mod parse_grammar;
|
||||
mod prepare_grammar;
|
||||
mod render;
|
||||
|
|
|
|||
160
src/nfa.rs
Normal file
160
src/nfa.rs
Normal file
|
|
@ -0,0 +1,160 @@
|
|||
use std::fmt;
|
||||
use std::char;
|
||||
|
||||
/// A set of characters, stored either as the members themselves or as the
/// characters that are *not* members.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum CharacterSet {
    /// Exactly the listed characters belong to the set.
    Include(Vec<char>),
    /// Every character except the listed ones belongs to the set.
    Exclude(Vec<char>),
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum NfaState {
|
||||
Advance(CharacterSet, u32),
|
||||
Split(u32, u32),
|
||||
Accept,
|
||||
}
|
||||
|
||||
pub struct Nfa {
|
||||
pub states: Vec<NfaState>
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct NfaCursor<'a> {
|
||||
indices: Vec<u32>,
|
||||
nfa: &'a Nfa,
|
||||
}
|
||||
|
||||
impl CharacterSet {
|
||||
pub fn empty() -> Self {
|
||||
CharacterSet::Include(Vec::new())
|
||||
}
|
||||
|
||||
pub fn all() -> Self {
|
||||
CharacterSet::Exclude(Vec::new())
|
||||
}
|
||||
|
||||
pub fn negate(self) -> CharacterSet {
|
||||
match self {
|
||||
CharacterSet::Include(chars) => CharacterSet::Exclude(chars),
|
||||
CharacterSet::Exclude(chars) => CharacterSet::Include(chars),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn add_char(self, c: char) -> Self {
|
||||
if let CharacterSet::Include(mut chars) = self {
|
||||
if let Err(i) = chars.binary_search(&c) {
|
||||
chars.insert(i, c);
|
||||
}
|
||||
CharacterSet::Include(chars)
|
||||
} else {
|
||||
panic!("Called add with a negated character set");
|
||||
}
|
||||
}
|
||||
|
||||
pub fn add_range(self, start: char, end: char) -> Self {
|
||||
if let CharacterSet::Include(mut chars) = self {
|
||||
let mut c = start as u32;
|
||||
while c <= end as u32 {
|
||||
chars.push(char::from_u32(c).unwrap());
|
||||
c += 1;
|
||||
}
|
||||
chars.sort_unstable();
|
||||
chars.dedup();
|
||||
CharacterSet::Include(chars)
|
||||
} else {
|
||||
panic!("Called add with a negated character set");
|
||||
}
|
||||
}
|
||||
|
||||
pub fn add(self, other: CharacterSet) -> Self {
|
||||
if let (CharacterSet::Include(mut chars), CharacterSet::Include(other_chars)) = (self, other) {
|
||||
chars.extend(other_chars);
|
||||
chars.sort_unstable();
|
||||
chars.dedup();
|
||||
CharacterSet::Include(chars)
|
||||
} else {
|
||||
panic!("Called add with a negated character set");
|
||||
}
|
||||
}
|
||||
|
||||
pub fn contains(&self, c: char) -> bool {
|
||||
match self {
|
||||
CharacterSet::Include(chars) => chars.contains(&c),
|
||||
CharacterSet::Exclude(chars) => !chars.contains(&c),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Nfa {
|
||||
pub fn new() -> Self {
|
||||
Nfa { states: vec![NfaState::Accept] }
|
||||
}
|
||||
|
||||
pub fn start_index(&self) -> u32 {
|
||||
self.states.len() as u32 - 1
|
||||
}
|
||||
|
||||
pub fn prepend(&mut self, f: impl Fn(u32) -> NfaState) {
|
||||
self.states.push(f(self.start_index()));
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for Nfa {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "Nfa {{ states: {{")?;
|
||||
for (i, state) in self.states.iter().enumerate() {
|
||||
if i > 0 {
|
||||
write!(f, ", ")?;
|
||||
}
|
||||
write!(f, "{}: {:?}", i, state)?;
|
||||
}
|
||||
write!(f, "}} }}")?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> NfaCursor<'a> {
|
||||
pub fn new(nfa: &'a Nfa) -> Self {
|
||||
let mut result = Self { nfa, indices: Vec::new() };
|
||||
result.add_indices(&mut vec![nfa.start_index()]);
|
||||
result
|
||||
}
|
||||
|
||||
pub fn advance(&mut self, c: char) -> bool {
|
||||
let mut result = false;
|
||||
let mut new_indices = Vec::new();
|
||||
for index in &self.indices {
|
||||
if let NfaState::Advance(chars, next_index) = &self.nfa.states[*index as usize] {
|
||||
if chars.contains(c) {
|
||||
new_indices.push(*next_index);
|
||||
result = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
self.indices.clear();
|
||||
self.add_indices(&mut new_indices);
|
||||
result
|
||||
}
|
||||
|
||||
pub fn is_done(&self) -> bool {
|
||||
self.indices.iter().any(|index| {
|
||||
if let NfaState::Accept = self.nfa.states[*index as usize] {
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
pub fn add_indices(&mut self, new_indices: &mut Vec<u32>) {
|
||||
while let Some(index) = new_indices.pop() {
|
||||
let state = &self.nfa.states[index as usize];
|
||||
if let NfaState::Split(left, right) = state {
|
||||
new_indices.push(*left);
|
||||
new_indices.push(*right);
|
||||
} else if let Err(i) = self.indices.binary_search(&index) {
|
||||
self.indices.insert(i, index);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,5 +1,229 @@
|
|||
use crate::error::{Error, Result};
|
||||
use crate::rules::Rule;
|
||||
use crate::grammars::LexicalGrammar;
|
||||
use crate::nfa::{Nfa, NfaState, NfaCursor, CharacterSet};
|
||||
use regex_syntax::ast::{parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, RepetitionKind};
|
||||
|
||||
fn evaluate_perl_class(item: &ClassPerlKind) -> CharacterSet {
|
||||
match item {
|
||||
ClassPerlKind::Digit => CharacterSet::empty()
|
||||
.add_range('0', '9'),
|
||||
ClassPerlKind::Space => CharacterSet::empty()
|
||||
.add_char(' ')
|
||||
.add_char('\t')
|
||||
.add_char('\r')
|
||||
.add_char('\n'),
|
||||
ClassPerlKind::Word => CharacterSet::empty()
|
||||
.add_char('_')
|
||||
.add_range('A', 'Z')
|
||||
.add_range('a', 'z')
|
||||
.add_range('0', '9')
|
||||
}
|
||||
}
|
||||
|
||||
fn evaluate_character_class(item: &ClassSetItem) -> Result<CharacterSet> {
|
||||
match item {
|
||||
ClassSetItem::Empty(_) => Ok(CharacterSet::Include(Vec::new())),
|
||||
ClassSetItem::Literal(literal) => Ok(CharacterSet::Include(vec![literal.c])),
|
||||
ClassSetItem::Range(range) => Ok(CharacterSet::empty().add_range(range.start.c, range.end.c)),
|
||||
ClassSetItem::Union(union) => {
|
||||
let mut result = CharacterSet::empty();
|
||||
for item in &union.items {
|
||||
result = result.add(evaluate_character_class(&item)?);
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
_ => Err(Error::regex("Unsupported character class syntax")),
|
||||
}
|
||||
}
|
||||
|
||||
fn regex_to_nfa(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32) -> Result<()> {
|
||||
match ast {
|
||||
Ast::Empty(_) => Ok(()),
|
||||
Ast::Flags(_) => Err(Error::regex("Flags are not supported")),
|
||||
Ast::Literal(literal) => {
|
||||
nfa.states.push(NfaState::Advance(CharacterSet::Include(vec![literal.c]), next_state_index));
|
||||
Ok(())
|
||||
},
|
||||
Ast::Dot(_) => {
|
||||
nfa.states.push(NfaState::Advance(CharacterSet::Exclude(vec!['\n']), next_state_index));
|
||||
Ok(())
|
||||
},
|
||||
Ast::Assertion(_) => Err(Error::regex("Assertions are not supported")),
|
||||
Ast::Class(class) => match class {
|
||||
Class::Unicode(_) => Err(Error::regex("Unicode character classes are not supported")),
|
||||
Class::Perl(class) => {
|
||||
nfa.states.push(NfaState::Advance(evaluate_perl_class(&class.kind), next_state_index));
|
||||
Ok(())
|
||||
},
|
||||
Class::Bracketed(class) => match &class.kind {
|
||||
ClassSet::Item(item) => {
|
||||
let character_set = evaluate_character_class(&item)?;
|
||||
nfa.states.push(NfaState::Advance(character_set, next_state_index));
|
||||
Ok(())
|
||||
},
|
||||
ClassSet::BinaryOp(_) => {
|
||||
Err(Error::regex("Binary operators in character classes aren't supported"))
|
||||
}
|
||||
}
|
||||
},
|
||||
Ast::Repetition(repetition) => match repetition.op.kind {
|
||||
RepetitionKind::ZeroOrOne => {
|
||||
regex_to_nfa(&repetition.ast, nfa, next_state_index)?;
|
||||
nfa.prepend(|start_index| NfaState::Split(next_state_index, start_index));
|
||||
Ok(())
|
||||
},
|
||||
RepetitionKind::OneOrMore => {
|
||||
nfa.states.push(NfaState::Accept); // Placeholder for split
|
||||
let split_index = nfa.start_index();
|
||||
regex_to_nfa(&repetition.ast, nfa, split_index)?;
|
||||
nfa.states[split_index as usize] = NfaState::Split(
|
||||
nfa.start_index(),
|
||||
next_state_index
|
||||
);
|
||||
Ok(())
|
||||
},
|
||||
RepetitionKind::ZeroOrMore => {
|
||||
nfa.states.push(NfaState::Accept); // Placeholder for split
|
||||
let split_index = nfa.start_index();
|
||||
regex_to_nfa(&repetition.ast, nfa, split_index)?;
|
||||
nfa.states[split_index as usize] = NfaState::Split(
|
||||
nfa.start_index(),
|
||||
next_state_index
|
||||
);
|
||||
nfa.prepend(|start_index| NfaState::Split(start_index, next_state_index));
|
||||
Ok(())
|
||||
},
|
||||
RepetitionKind::Range(_) => unimplemented!(),
|
||||
},
|
||||
Ast::Group(group) => regex_to_nfa(&group.ast, nfa, nfa.start_index()),
|
||||
Ast::Alternation(alternation) => {
|
||||
let mut alternative_start_indices = Vec::new();
|
||||
for ast in alternation.asts.iter() {
|
||||
regex_to_nfa(&ast, nfa, next_state_index)?;
|
||||
alternative_start_indices.push(nfa.start_index());
|
||||
}
|
||||
alternative_start_indices.pop();
|
||||
for alternative_start_index in alternative_start_indices {
|
||||
nfa.prepend(|start_index| NfaState::Split(start_index, alternative_start_index));
|
||||
}
|
||||
Ok(())
|
||||
},
|
||||
Ast::Concat(concat) => {
|
||||
for ast in concat.asts.iter().rev() {
|
||||
regex_to_nfa(&ast, nfa, next_state_index)?;
|
||||
next_state_index = nfa.start_index();
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn expand_rule(rule: Rule) -> Result<Nfa> {
|
||||
match rule {
|
||||
Rule::Pattern(s) => {
|
||||
let ast = parse::Parser::new().parse(&s).map_err(|e| Error::GrammarError(e.to_string()))?;
|
||||
let mut nfa = Nfa::new();
|
||||
regex_to_nfa(&ast, &mut nfa, 0)?;
|
||||
Ok(nfa)
|
||||
},
|
||||
Rule::String(s) => {
|
||||
let mut nfa = Nfa::new();
|
||||
for c in s.chars().rev() {
|
||||
nfa.prepend(|start_index| NfaState::Advance(CharacterSet::empty().add_char(c), start_index));
|
||||
}
|
||||
Ok(nfa)
|
||||
},
|
||||
_ => Err(Error::grammar("Unexpected rule type")),
|
||||
}
|
||||
}
|
||||
|
||||
/// Placeholder: not implemented yet, so every call panics.
pub(super) fn normalize_rules(grammar: LexicalGrammar) -> LexicalGrammar {
    unimplemented!()
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `nfa` over `s` from its first character and returns the longest
    /// prefix of `s` that the automaton accepts, or `None` if no prefix (not
    /// even the empty one) is accepted.
    fn simulate_nfa<'a>(nfa: &'a Nfa, s: &'a str) -> Option<&'a str> {
        let mut result = None;
        // Byte offset just past the last consumed character; a character
        // count would slice multi-byte input at the wrong place.
        let mut end = 0;
        let mut cursor = NfaCursor::new(nfa);
        for c in s.chars() {
            if cursor.is_done() {
                result = Some(&s[..end]);
            }
            if cursor.advance(c) {
                end += c.len_utf8();
            } else {
                break;
            }
        }
        // Also check the state reached after the final character, so a match
        // that ends exactly at the end of the input is reported. After a
        // failed `advance` the cursor has no active states, so this cannot
        // override an earlier result in that case.
        if cursor.is_done() {
            result = Some(&s[..end]);
        }
        result
    }

    #[test]
    fn test_regex_expansion() {
        struct Row {
            pattern: &'static str,
            examples: Vec<(&'static str, Option<&'static str>)>,
        }

        let table = [
            Row {
                pattern: "a|bc",
                examples: vec![
                    ("a12", Some("a")),
                    ("bc12", Some("bc")),
                    ("b12", None),
                    ("c12", None),
                ],
            },
            Row {
                pattern: "(a|b|c)d(e|f|g)h?",
                examples: vec![
                    ("ade1", Some("ade")),
                    ("bdf1", Some("bdf")),
                    ("bdfh1", Some("bdfh")),
                    ("ad1", None),
                ],
            },
            Row {
                pattern: "a*",
                examples: vec![
                    ("aaa1", Some("aaa")),
                    ("b", Some("")),
                ],
            },
            Row {
                pattern: "a((bc)+|(de)*)f",
                examples: vec![
                    ("af1", Some("af")),
                    ("adedef1", Some("adedef")),
                    ("abcbcbcf1", Some("abcbcbcf")),
                    ("a", None),
                ],
            },
            Row {
                pattern: "[a-fA-F0-9]+",
                examples: vec![
                    // `0` is a hex digit too, so the match runs up to the `.`.
                    ("A1ff0.", Some("A1ff0")),
                    ("A1ff0", Some("A1ff0")),
                ],
            },
            Row {
                pattern: "\\w\\d\\s",
                examples: vec![
                    ("_0 ", Some("_0 ")),
                    ("_0  ", Some("_0 ")),
                ],
            },
        ];

        for Row { pattern, examples } in table.iter() {
            let nfa = expand_rule(Rule::pattern(pattern)).unwrap();
            for (haystack, needle) in examples.iter() {
                assert_eq!(simulate_nfa(&nfa, haystack), *needle);
            }
        }
    }
}
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
use std::rc::Rc;
|
||||
use std::char;
|
||||
use std::collections::HashMap;
|
||||
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
|
||||
|
|
@ -44,7 +45,6 @@ pub(crate) struct Symbol {
|
|||
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
|
||||
pub(crate) enum Rule {
|
||||
Blank,
|
||||
CharacterSet(Vec<char>),
|
||||
String(String),
|
||||
Pattern(String),
|
||||
NamedSymbol(String),
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue