From ead6ca1738c52e8da4a2eb577d1c4c50b08593b4 Mon Sep 17 00:00:00 2001
From: Max Brunsfeld <maxbrunsfeld@gmail.com>
Date: Sat, 8 Dec 2018 13:44:11 -0800
Subject: [PATCH] Generate NFAs from regexes

---
 Cargo.lock                             |   1 +
 Cargo.toml                             |   1 +
 src/error.rs                           |  11 ++
 src/main.rs                            |   1 +
 src/nfa.rs                             | 160 ++++++++++++++++++
 src/prepare_grammar/normalize_rules.rs | 224 +++++++++++++++++++++++++
 src/rules.rs                           |   2 +-
 7 files changed, 399 insertions(+), 1 deletion(-)
 create mode 100644 src/nfa.rs
diff --git a/Cargo.lock b/Cargo.lock
index 20908681..d5109fb7 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -466,6 +466,7 @@ dependencies = [
  "dirs 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)",
  "ignore 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)",
  "libloading 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)",
+ "regex-syntax 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)",
  "rusqlite 0.14.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)",
  "serde_derive 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)",
diff --git a/Cargo.toml b/Cargo.toml
index 965cc81e..93a49d2c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -15,3 +15,4 @@ serde = "1.0"
 serde_derive = "1.0"
 serde_json = "1.0"
 tree-sitter = "0.3.1"
+regex-syntax = "0.6.4"
diff --git a/src/error.rs b/src/error.rs
index 90e7b8f9..49064c22 100644
--- a/src/error.rs
+++ b/src/error.rs
@@ -2,10 +2,21 @@
 pub enum Error {
     GrammarError(String),
     SymbolError(String),
+    RegexError(String),
 }
 
 pub type Result<T> = std::result::Result<T, Error>;
 
+impl Error {
+    pub fn grammar(message: &str) -> Self { // shorthand: `GrammarError` from a borrowed message
+        Error::GrammarError(message.to_string())
+    }
+
+    pub fn regex(message: &str) -> Self { // shorthand: `RegexError` from a borrowed message
+        Error::RegexError(message.to_string())
+    }
+}
+
 impl From<serde_json::Error> for Error {
     fn from(error: serde_json::Error) -> Self {
         Error::GrammarError(error.to_string())
diff --git a/src/main.rs b/src/main.rs
index 3eeb306a..4d376929 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -7,6 +7,7 @@ mod build_tables;
 mod error;
 mod generate;
 mod grammars;
+mod nfa;
 mod parse_grammar;
 mod prepare_grammar;
 mod render;
diff --git a/src/nfa.rs b/src/nfa.rs
new file mode 100644
index 00000000..55aa11dc
--- /dev/null
+++ b/src/nfa.rs
@@ -0,0 +1,160 @@
+use std::fmt;
+use std::char;
+
+#[derive(Clone, Debug, PartialEq, Eq, Hash)]
+pub enum CharacterSet {
+    Include(Vec<char>), // matches exactly the listed characters
+    Exclude(Vec<char>), // matches every character that is not listed
+}
+
+#[derive(Debug)]
+pub enum NfaState {
+    Advance(CharacterSet, u32), // consume one character from the set, then move to the state at this index
+    Split(u32, u32), // continue at both state indices without consuming input
+    Accept, // reaching this state means the input read so far is a match
+}
+
+pub struct Nfa {
+    pub states: Vec<NfaState> // indices stored inside states refer to positions in this vector; the last one is the start
+}
+
+#[derive(Debug)]
+pub struct NfaCursor<'a> {
+    indices: Vec<u32>, // sorted, duplicate-free indices of the states currently occupied
+    nfa: &'a Nfa, // the automaton being simulated
+}
+
+impl CharacterSet {
+    /// Returns the set that matches no character.
+    pub fn empty() -> Self {
+        CharacterSet::Include(Vec::new())
+    }
+
+    /// Returns the set that matches every character.
+    pub fn all() -> Self {
+        CharacterSet::Exclude(Vec::new())
+    }
+
+    pub fn negate(self) -> CharacterSet { // complement: swaps the variant, keeps the list
+        match self {
+            CharacterSet::Include(chars) => CharacterSet::Exclude(chars),
+            CharacterSet::Exclude(chars) => CharacterSet::Include(chars),
+        }
+    }
+
+    /// Inserts `c` at its sorted position. Panics on an `Exclude` set.
+    pub fn add_char(self, c: char) -> Self {
+        let mut chars = self.into_included();
+        if let Err(i) = chars.binary_search(&c) {
+            chars.insert(i, c);
+        }
+        CharacterSet::Include(chars)
+    }
+
+    /// Adds `start..=end`, skipping surrogates (not valid `char`s). Panics on an `Exclude` set.
+    pub fn add_range(self, start: char, end: char) -> Self {
+        let mut chars = self.into_included();
+        chars.extend((start as u32..=end as u32).filter_map(char::from_u32));
+        chars.sort_unstable();
+        chars.dedup();
+        CharacterSet::Include(chars)
+    }
+
+    /// Unions two `Include` sets. Panics if either one is an `Exclude` set.
+    pub fn add(self, other: CharacterSet) -> Self {
+        let mut chars = self.into_included();
+        chars.extend(other.into_included());
+        chars.sort_unstable();
+        chars.dedup();
+        CharacterSet::Include(chars)
+    }
+
+    pub fn contains(&self, c: char) -> bool { // membership test honoring the variant
+        match self {
+            CharacterSet::Include(chars) => chars.contains(&c),
+            CharacterSet::Exclude(chars) => !chars.contains(&c),
+        }
+    }
+
+    /// Unwraps the characters of an `Include` set. Panics on an `Exclude` set.
+    fn into_included(self) -> Vec<char> {
+        match self {
+            CharacterSet::Include(chars) => chars,
+            CharacterSet::Exclude(_) => panic!("Called add with a negated character set"),
+        }
+    }
+}
+
+impl Nfa {
+    pub fn new() -> Self { // a new NFA is a single `Accept` state at index 0
+        Nfa { states: vec![NfaState::Accept] }
+    }
+
+    pub fn start_index(&self) -> u32 { // the start state is the last element of `states`
+        self.states.len() as u32 - 1 // NOTE(review): underflows if `states` is ever emptied
+    }
+
+    pub fn prepend(&mut self, f: impl Fn(u32) -> NfaState) { // `f` maps the old start index to the new start state
+        self.states.push(f(self.start_index()));
+    }
+}
+
+impl fmt::Debug for Nfa {
+    /// Writes the states as comma-separated `index: state` pairs inside `Nfa { states: {..} }`.
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "Nfa {{ states: {{")?;
+        for (index, state) in self.states.iter().enumerate() {
+            if index != 0 {
+                write!(f, ", ")?;
+            }
+            write!(f, "{}: {:?}", index, state)?;
+        }
+        write!(f, "}} }}")
+    }
+}
+
+impl<'a> NfaCursor<'a> {
+    pub fn new(nfa: &'a Nfa) -> Self {
+        let mut result = Self { nfa, indices: Vec::new() };
+        result.add_indices(&mut vec![nfa.start_index()]);
+        result
+    }
+
+    /// Consumes `c` and reports whether any of the current states could advance on it.
+    pub fn advance(&mut self, c: char) -> bool {
+        let mut new_indices = Vec::new();
+        for &index in &self.indices {
+            match &self.nfa.states[index as usize] {
+                NfaState::Advance(chars, next) if chars.contains(c) => new_indices.push(*next),
+                _ => {}
+            }
+        }
+        let advanced = !new_indices.is_empty();
+        self.indices.clear();
+        self.add_indices(&mut new_indices);
+        advanced
+    }
+
+    pub fn is_done(&self) -> bool {
+        self.indices.iter().any(|&index| match self.nfa.states[index as usize] {
+            NfaState::Accept => true,
+            _ => false,
+        })
+    }
+
+    /// Adds the given states to the cursor, replacing every `Split` by the states it leads to.
+    pub fn add_indices(&mut self, new_indices: &mut Vec<u32>) {
+        let mut expanded_splits = Vec::new();
+        while let Some(index) = new_indices.pop() {
+            if let NfaState::Split(left, right) = &self.nfa.states[index as usize] {
+                // Follow each split only once so that epsilon cycles such as `(a*)*` terminate.
+                if !expanded_splits.contains(&index) {
+                    expanded_splits.push(index);
+                    new_indices.extend(&[*left, *right]);
+                }
+            } else if let Err(i) = self.indices.binary_search(&index) {
+                self.indices.insert(i, index);
+            }
+        }
+    }
+}
diff --git a/src/prepare_grammar/normalize_rules.rs b/src/prepare_grammar/normalize_rules.rs
index 9e625ef5..67177b4f 100644
--- a/src/prepare_grammar/normalize_rules.rs
+++ b/src/prepare_grammar/normalize_rules.rs
@@ -1,5 +1,229 @@
+use crate::error::{Error, Result};
+use crate::rules::Rule;
 use crate::grammars::LexicalGrammar;
+use crate::nfa::{Nfa, NfaState, NfaCursor, CharacterSet};
+use regex_syntax::ast::{parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, RepetitionKind};
+
+/// Expands a Perl class kind (`\d`, `\s` or `\w`) into an explicit set of ASCII characters.
+/// Only the kind is looked at here; whether the class was negated is not part of the input.
+fn evaluate_perl_class(item: &ClassPerlKind) -> CharacterSet {
+    let set = CharacterSet::empty();
+    match item {
+        ClassPerlKind::Digit => set.add_range('0', '9'),
+        ClassPerlKind::Space => [' ', '\t', '\r', '\n']
+            .iter()
+            .fold(set, |set, &c| set.add_char(c)),
+        ClassPerlKind::Word => set
+            .add_char('_')
+            .add_range('A', 'Z')
+            .add_range('a', 'z')
+            .add_range('0', '9'),
+    }
+}
+
+/// Converts one item of a bracketed class into the set of characters it lists. Literals,
+/// ranges and unions of those are handled; every other kind of item is rejected with a
+/// regex error. A successful result is always an `Include` set.
+fn evaluate_character_class(item: &ClassSetItem) -> Result<CharacterSet> {
+    match item {
+        ClassSetItem::Empty(_) => Ok(CharacterSet::empty()),
+        ClassSetItem::Literal(literal) => Ok(CharacterSet::Include(vec![literal.c])),
+        ClassSetItem::Range(range) => Ok(CharacterSet::empty().add_range(range.start.c, range.end.c)),
+        ClassSetItem::Union(union) => {
+            let members: Result<Vec<_>> = union.items.iter().map(evaluate_character_class).collect();
+            Ok(members?.into_iter().fold(CharacterSet::empty(), CharacterSet::add))
+        }
+        _ => Err(Error::regex("Unsupported character class syntax")),
+    }
+}
+
+/// Appends the states for `ast` to `nfa` so that, once `ast` has matched, the automaton
+/// continues at `next_state_index`; callers find the entry state via `nfa.start_index()`.
+/// Flags, assertions, Unicode classes, class set operators and counted repetitions are
+/// not supported and produce an `Error::RegexError`.
+fn regex_to_nfa(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32) -> Result<()> {
+    match ast {
+        Ast::Empty(_) => Ok(()),
+        Ast::Flags(_) => Err(Error::regex("Flags are not supported")),
+        Ast::Literal(literal) => {
+            nfa.states.push(NfaState::Advance(CharacterSet::Include(vec![literal.c]), next_state_index));
+            Ok(())
+        },
+        Ast::Dot(_) => {
+            nfa.states.push(NfaState::Advance(CharacterSet::Exclude(vec!['\n']), next_state_index));
+            Ok(())
+        },
+        Ast::Assertion(_) => Err(Error::regex("Assertions are not supported")),
+        Ast::Class(class) => match class {
+            Class::Unicode(_) => Err(Error::regex("Unicode character classes are not supported")),
+            Class::Perl(class) => {
+                let set = evaluate_perl_class(&class.kind);
+                let set = if class.negated { set.negate() } else { set }; // `\D`, `\S`, `\W`
+                nfa.states.push(NfaState::Advance(set, next_state_index));
+                Ok(())
+            },
+            Class::Bracketed(class) => match &class.kind {
+                ClassSet::Item(item) => {
+                    let set = evaluate_character_class(&item)?;
+                    let set = if class.negated { set.negate() } else { set }; // `[^...]`
+                    nfa.states.push(NfaState::Advance(set, next_state_index));
+                    Ok(())
+                },
+                ClassSet::BinaryOp(_) => Err(Error::regex("Binary operators in character classes aren't supported")),
+            }
+        },
+        Ast::Repetition(repetition) => match repetition.op.kind {
+            RepetitionKind::ZeroOrOne => {
+                regex_to_nfa(&repetition.ast, nfa, next_state_index)?;
+                nfa.prepend(|start_index| NfaState::Split(next_state_index, start_index));
+                Ok(())
+            },
+            RepetitionKind::OneOrMore => {
+                nfa.states.push(NfaState::Accept); // Placeholder for split
+                let split_index = nfa.start_index();
+                regex_to_nfa(&repetition.ast, nfa, split_index)?;
+                nfa.states[split_index as usize] = NfaState::Split(nfa.start_index(), next_state_index);
+                Ok(())
+            },
+            RepetitionKind::ZeroOrMore => {
+                nfa.states.push(NfaState::Accept); // Placeholder for split
+                let split_index = nfa.start_index();
+                regex_to_nfa(&repetition.ast, nfa, split_index)?;
+                nfa.states[split_index as usize] = NfaState::Split(nfa.start_index(), next_state_index);
+                nfa.prepend(|start_index| NfaState::Split(start_index, next_state_index));
+                Ok(())
+            },
+            RepetitionKind::Range(_) => Err(Error::regex("Counted repetitions are not supported")),
+        },
+        // A group continues wherever its parent does, not at whatever state was pushed last.
+        Ast::Group(group) => regex_to_nfa(&group.ast, nfa, next_state_index),
+        Ast::Alternation(alternation) => {
+            let mut alternative_start_indices = Vec::new();
+            for ast in alternation.asts.iter() {
+                regex_to_nfa(&ast, nfa, next_state_index)?;
+                alternative_start_indices.push(nfa.start_index());
+            }
+            alternative_start_indices.pop();
+            for alternative_start_index in alternative_start_indices {
+                nfa.prepend(|start_index| NfaState::Split(start_index, alternative_start_index));
+            }
+            Ok(())
+        },
+        Ast::Concat(concat) => {
+            for ast in concat.asts.iter().rev() {
+                regex_to_nfa(&ast, nfa, next_state_index)?;
+                next_state_index = nfa.start_index();
+            }
+            Ok(())
+        }
+    }
+}
+
+/// Builds the NFA for a lexical rule: a regex `Pattern` or a literal `String`. A pattern that
+/// fails to parse or uses unsupported syntax is a regex error; other rule types are grammar errors.
+fn expand_rule(rule: Rule) -> Result<Nfa> {
+    let mut nfa = Nfa::new();
+    match rule {
+        Rule::Pattern(s) => {
+            let ast = parse::Parser::new().parse(&s).map_err(|e| Error::regex(&e.to_string()))?;
+            regex_to_nfa(&ast, &mut nfa, 0)?;
+        },
+        Rule::String(s) => {
+            for c in s.chars().rev() {
+                nfa.prepend(|start_index| NfaState::Advance(CharacterSet::empty().add_char(c), start_index));
+            }
+        },
+        _ => return Err(Error::grammar("Unexpected rule type")),
+    }
+    Ok(nfa)
+}
 
 pub(super) fn normalize_rules(grammar: LexicalGrammar) -> LexicalGrammar {
     unimplemented!();
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Returns the longest prefix of `s` accepted by `nfa`, or `None` when no prefix matches.
+    fn simulate_nfa<'a>(nfa: &'a Nfa, s: &'a str) -> Option<&'a str> {
+        let mut result = None;
+        let mut cursor = NfaCursor::new(nfa);
+        // Slice by byte offset: counting characters would mis-slice non-ASCII input.
+        for (offset, c) in s.char_indices() {
+            if cursor.is_done() {
+                result = Some(&s[..offset]);
+            }
+            if !cursor.advance(c) {
+                return result;
+            }
+        }
+        if cursor.is_done() { Some(s) } else { result } // the whole input may be a match
+    }
+
+    #[test]
+    fn test_regex_expansion() {
+        struct Row {
+            pattern: &'static str,
+            examples: Vec<(&'static str, Option<&'static str>)>,
+        }
+
+        let table = [
+            Row {
+                pattern: "a|bc",
+                examples: vec![
+                    ("a12", Some("a")),
+                    ("bc12", Some("bc")),
+                    ("a", Some("a")),
+                    ("b12", None),
+                    ("c12", None),
+                ],
+            },
+            Row {
+                pattern: "(a|b|c)d(e|f|g)h?",
+                examples: vec![
+                    ("ade1", Some("ade")),
+                    ("bdf1", Some("bdf")),
+                    ("bdfh1", Some("bdfh")),
+                    ("ad1", None),
+                ],
+            },
+            Row {
+                pattern: "a*",
+                examples: vec![
+                    ("aaa1", Some("aaa")),
+                    ("b", Some("")),
+                ],
+            },
+            Row {
+                pattern: "a((bc)+|(de)*)f",
+                examples: vec![
+                    ("af1", Some("af")),
+                    ("adedef1", Some("adedef")),
+                    ("abcbcbcf1", Some("abcbcbcf")),
+                    ("a", None),
+                ],
+            },
+            Row {
+                pattern: "[a-fA-F0-9]+",
+                examples: vec![
+                    ("A1ff0", Some("A1ff")),
+                ],
+            },
+            Row {
+                pattern: "\\w\\d\\s",
+                examples: vec![
+                    ("_0  ", Some("_0 ")),
+                ],
+            },
+        ];
+
+        for Row { pattern, examples } in table.iter() {
+            let nfa = expand_rule(Rule::pattern(pattern)).unwrap();
+            for (haystack, needle) in examples.iter() {
+                assert_eq!(simulate_nfa(&nfa, haystack), *needle);
+            }
+        }
+    }
+}
diff --git a/src/rules.rs b/src/rules.rs
index 5c3b65fd..b593496a 100644
--- a/src/rules.rs
+++ b/src/rules.rs
@@ -1,4 +1,5 @@
 use std::rc::Rc;
+use std::char;
 use std::collections::HashMap;
 
 #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
@@ -44,7 +45,6 @@ pub(crate) struct Symbol {
 #[derive(Clone, Debug, PartialEq, Eq, Hash)]
 pub(crate) enum Rule {
     Blank,
-    CharacterSet(Vec<char>),
     String(String),
     Pattern(String),
     NamedSymbol(String),