Integrate separator rules into lexer NFA

This commit is contained in:
Max Brunsfeld 2018-12-12 18:04:29 -08:00
parent 40d24097ec
commit 0103a83f3f
5 changed files with 199 additions and 123 deletions

View file

@ -35,13 +35,13 @@ pub(crate) struct InputGrammar {
pub(crate) struct LexicalVariable {
pub name: String,
pub kind: VariableType,
pub nfa: Nfa,
pub start_state: u32,
}
#[derive(Debug, PartialEq, Eq)]
pub(crate) struct LexicalGrammar {
pub nfa: Nfa,
pub variables: Vec<LexicalVariable>,
pub separators: Vec<Nfa>,
}
// Extracted syntax grammar

View file

@ -9,9 +9,13 @@ pub enum CharacterSet {
#[derive(Debug, PartialEq, Eq)]
pub enum NfaState {
Advance(CharacterSet, u32),
Advance {
chars: CharacterSet,
state: u32,
is_sep: bool,
},
Split(u32, u32),
Accept,
Accept(usize),
}
#[derive(PartialEq, Eq)]
@ -23,6 +27,7 @@ pub struct Nfa {
pub struct NfaCursor<'a> {
indices: Vec<u32>,
nfa: &'a Nfa,
in_sep: bool,
}
impl CharacterSet {
@ -88,15 +93,15 @@ impl CharacterSet {
impl Nfa {
pub fn new() -> Self {
Nfa { states: vec![NfaState::Accept] }
Nfa { states: Vec::new() }
}
pub fn start_index(&self) -> u32 {
pub fn last_state(&self) -> u32 {
self.states.len() as u32 - 1
}
pub fn prepend(&mut self, f: impl Fn(u32) -> NfaState) {
self.states.push(f(self.start_index()));
self.states.push(f(self.last_state()));
}
}
@ -116,38 +121,45 @@ impl fmt::Debug for Nfa {
impl<'a> NfaCursor<'a> {
pub fn new(nfa: &'a Nfa) -> Self {
let mut result = Self { nfa, indices: Vec::new() };
result.add_indices(&mut vec![nfa.start_index()]);
let mut result = Self { nfa, indices: Vec::new(), in_sep: true };
result.add_states(&mut vec![nfa.last_state()]);
result
}
pub fn advance(&mut self, c: char) -> bool {
let mut result = false;
let mut new_indices = Vec::new();
let mut any_sep_transitions = false;
for index in &self.indices {
if let NfaState::Advance(chars, next_index) = &self.nfa.states[*index as usize] {
if let NfaState::Advance { chars, state, is_sep } = &self.nfa.states[*index as usize] {
if *is_sep {
any_sep_transitions = true;
}
if chars.contains(c) {
new_indices.push(*next_index);
new_indices.push(*state);
result = true;
}
}
}
if !any_sep_transitions {
self.in_sep = false;
}
self.indices.clear();
self.add_indices(&mut new_indices);
self.add_states(&mut new_indices);
result
}
pub fn is_done(&self) -> bool {
self.indices.iter().any(|index| {
if let NfaState::Accept = self.nfa.states[*index as usize] {
true
pub fn finished_ids<'b>(&'b self) -> impl Iterator<Item = usize> + 'b {
self.indices.iter().filter_map(move |index| {
if let NfaState::Accept(i) = self.nfa.states[*index as usize] {
Some(i)
} else {
false
None
}
})
}
pub fn add_indices(&mut self, new_indices: &mut Vec<u32>) {
pub fn add_states(&mut self, new_indices: &mut Vec<u32>) {
while let Some(index) = new_indices.pop() {
let state = &self.nfa.states[index as usize];
if let NfaState::Split(left, right) = state {

View file

@ -39,40 +39,46 @@ fn expand_character_class(item: &ClassSetItem) -> Result<CharacterSet> {
}
}
fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32) -> Result<()> {
fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32, is_sep: bool) -> Result<bool> {
match ast {
Ast::Empty(_) => Ok(()),
Ast::Empty(_) => Ok(false),
Ast::Flags(_) => Err(Error::regex("Flags are not supported")),
Ast::Literal(literal) => {
nfa.states.push(NfaState::Advance(
CharacterSet::Include(vec![literal.c]),
next_state_index,
));
Ok(())
nfa.states.push(NfaState::Advance {
chars: CharacterSet::Include(vec![literal.c]),
state: next_state_index,
is_sep,
});
Ok(true)
}
Ast::Dot(_) => {
nfa.states.push(NfaState::Advance(
CharacterSet::Exclude(vec!['\n']),
next_state_index,
));
Ok(())
nfa.states.push(NfaState::Advance {
chars: CharacterSet::Exclude(vec!['\n']),
state: next_state_index,
is_sep,
});
Ok(true)
}
Ast::Assertion(_) => Err(Error::regex("Assertions are not supported")),
Ast::Class(class) => match class {
Class::Unicode(_) => Err(Error::regex("Unicode character classes are not supported")),
Class::Perl(class) => {
nfa.states.push(NfaState::Advance(
expand_perl_character_class(&class.kind),
next_state_index,
));
Ok(())
nfa.states.push(NfaState::Advance {
chars: expand_perl_character_class(&class.kind),
state: next_state_index,
is_sep,
});
Ok(true)
}
Class::Bracketed(class) => match &class.kind {
ClassSet::Item(item) => {
let character_set = expand_character_class(&item)?;
nfa.states
.push(NfaState::Advance(character_set, next_state_index));
Ok(())
nfa.states.push(NfaState::Advance {
chars: character_set,
state: next_state_index,
is_sep,
});
Ok(true)
}
ClassSet::BinaryOp(_) => Err(Error::regex(
"Binary operators in character classes aren't supported",
@ -81,134 +87,171 @@ fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32) -> Result<(
},
Ast::Repetition(repetition) => match repetition.op.kind {
RepetitionKind::ZeroOrOne => {
expand_regex(&repetition.ast, nfa, next_state_index)?;
nfa.prepend(|start_index| NfaState::Split(next_state_index, start_index));
Ok(())
if expand_regex(&repetition.ast, nfa, next_state_index, is_sep)? {
nfa.prepend(|last_state| NfaState::Split(next_state_index, last_state));
Ok(true)
} else {
Ok(false)
}
}
RepetitionKind::OneOrMore => {
nfa.states.push(NfaState::Accept); // Placeholder for split
let split_index = nfa.start_index();
expand_regex(&repetition.ast, nfa, split_index)?;
nfa.states[split_index as usize] =
NfaState::Split(nfa.start_index(), next_state_index);
Ok(())
nfa.states.push(NfaState::Accept(0)); // Placeholder for split
let split_index = nfa.last_state();
if expand_regex(&repetition.ast, nfa, split_index, is_sep)? {
nfa.states[split_index as usize] =
NfaState::Split(nfa.last_state(), next_state_index);
Ok(true)
} else {
nfa.states.pop();
Ok(false)
}
}
RepetitionKind::ZeroOrMore => {
nfa.states.push(NfaState::Accept); // Placeholder for split
let split_index = nfa.start_index();
expand_regex(&repetition.ast, nfa, split_index)?;
nfa.states[split_index as usize] =
NfaState::Split(nfa.start_index(), next_state_index);
nfa.prepend(|start_index| NfaState::Split(start_index, next_state_index));
Ok(())
nfa.states.push(NfaState::Accept(0)); // Placeholder for split
let split_index = nfa.last_state();
if expand_regex(&repetition.ast, nfa, split_index, is_sep)? {
nfa.states[split_index as usize] =
NfaState::Split(nfa.last_state(), next_state_index);
nfa.prepend(|last_state| NfaState::Split(last_state, next_state_index));
Ok(true)
} else {
Ok(false)
}
}
RepetitionKind::Range(_) => unimplemented!(),
},
Ast::Group(group) => expand_regex(&group.ast, nfa, nfa.start_index()),
Ast::Group(group) => expand_regex(&group.ast, nfa, nfa.last_state(), is_sep),
Ast::Alternation(alternation) => {
let mut alternative_start_indices = Vec::new();
for ast in alternation.asts.iter() {
expand_regex(&ast, nfa, next_state_index)?;
alternative_start_indices.push(nfa.start_index());
if expand_regex(&ast, nfa, next_state_index, is_sep)? {
alternative_start_indices.push(nfa.last_state());
}
}
alternative_start_indices.pop();
for alternative_start_index in alternative_start_indices {
nfa.prepend(|start_index| NfaState::Split(start_index, alternative_start_index));
nfa.prepend(|last_state| NfaState::Split(last_state, alternative_start_index));
}
Ok(())
Ok(true)
}
Ast::Concat(concat) => {
let mut result = false;
for ast in concat.asts.iter().rev() {
expand_regex(&ast, nfa, next_state_index)?;
next_state_index = nfa.start_index();
if expand_regex(&ast, nfa, next_state_index, is_sep)? {
result = true;
}
next_state_index = nfa.last_state();
}
Ok(())
Ok(result)
}
}
}
fn expand_rule(rule: Rule, nfa: &mut Nfa, mut next_state_index: u32) -> Result<()> {
fn expand_rule(rule: &Rule, nfa: &mut Nfa, mut next_state_index: u32, is_sep: bool) -> Result<bool> {
match rule {
Rule::Pattern(s) => {
let ast = parse::Parser::new()
.parse(&s)
.map_err(|e| Error::GrammarError(e.to_string()))?;
expand_regex(&ast, nfa, next_state_index)?;
Ok(())
expand_regex(&ast, nfa, next_state_index, is_sep)
}
Rule::String(s) => {
for c in s.chars().rev() {
nfa.prepend(|start_index| {
NfaState::Advance(CharacterSet::empty().add_char(c), start_index)
nfa.prepend(|last_state| {
NfaState::Advance {
chars: CharacterSet::empty().add_char(c),
state: last_state,
is_sep,
}
});
}
Ok(())
Ok(s.len() > 0)
}
Rule::Choice(elements) => {
let mut alternative_start_indices = Vec::new();
for element in elements {
expand_rule(element, nfa, next_state_index)?;
alternative_start_indices.push(nfa.start_index());
if expand_rule(element, nfa, next_state_index, is_sep)? {
alternative_start_indices.push(nfa.last_state());
}
}
alternative_start_indices.pop();
for alternative_start_index in alternative_start_indices {
nfa.prepend(|start_index| NfaState::Split(start_index, alternative_start_index));
nfa.prepend(|last_state| NfaState::Split(last_state, alternative_start_index));
}
Ok(())
Ok(true)
}
Rule::Seq(elements) => {
let mut result = false;
for element in elements.into_iter().rev() {
expand_rule(element, nfa, next_state_index)?;
next_state_index = nfa.start_index();
if expand_rule(element, nfa, next_state_index, is_sep)? {
result = true;
}
next_state_index = nfa.last_state();
}
Ok(())
Ok(result)
}
Rule::Repeat(rule) => {
nfa.states.push(NfaState::Accept); // Placeholder for split
let split_index = nfa.start_index();
expand_rule(*rule, nfa, split_index)?;
nfa.states[split_index as usize] = NfaState::Split(nfa.start_index(), next_state_index);
Ok(())
nfa.states.push(NfaState::Accept(0)); // Placeholder for split
let split_index = nfa.last_state();
if expand_rule(rule, nfa, split_index, is_sep)? {
nfa.states[split_index as usize] = NfaState::Split(nfa.last_state(), next_state_index);
Ok(true)
} else {
Ok(false)
}
}
_ => Err(Error::grammar("Unexpected rule type")),
Rule::Blank => Ok(false),
_ => Err(Error::grammar(&format!("Unexpected rule {:?}", rule))),
}
}
pub(super) fn expand_tokens(grammar: ExtractedLexicalGrammar) -> Result<LexicalGrammar> {
let mut nfa = Nfa::new();
let separator_rule = if grammar.separators.len() > 0 {
Rule::repeat(Rule::choice(grammar.separators))
} else {
Rule::Blank
};
let mut variables = Vec::new();
for variable in grammar.variables {
let mut nfa = Nfa::new();
expand_rule(variable.rule, &mut nfa, 0)?;
for (i, variable) in grammar.variables.into_iter().enumerate() {
let is_immediate_token = match &variable.rule {
Rule::Metadata { params, .. } => params.is_main_token,
_ => false,
};
nfa.states.push(NfaState::Accept(i));
let last_state = nfa.last_state();
expand_rule(&variable.rule, &mut nfa, last_state, false)?;
if !is_immediate_token {
let last_state = nfa.last_state();
expand_rule(&separator_rule, &mut nfa, last_state, true)?;
}
variables.push(LexicalVariable {
name: variable.name,
kind: variable.kind,
nfa,
start_state: nfa.last_state(),
});
}
let mut separators = Vec::new();
for separator in grammar.separators {
let mut nfa = Nfa::new();
expand_rule(separator, &mut nfa, 0)?;
separators.push(nfa);
}
Ok(LexicalGrammar {
variables,
separators,
})
Ok(LexicalGrammar { nfa, variables })
}
#[cfg(test)]
mod tests {
use super::*;
use crate::nfa::NfaCursor;
use crate::grammars::Variable;
fn simulate_nfa<'a>(nfa: &'a Nfa, s: &'a str) -> Option<&'a str> {
let mut result = None;
let mut char_count = 0;
let mut cursor = NfaCursor::new(nfa);
for c in s.chars() {
if cursor.is_done() {
if cursor.finished_ids().count() > 0 {
result = Some(&s[0..char_count]);
}
if cursor.advance(c) {
@ -223,13 +266,13 @@ mod tests {
#[test]
fn test_rule_expansion() {
struct Row {
rule: Rule,
rules: Vec<Rule>,
examples: Vec<(&'static str, Option<&'static str>)>,
}
let table = [
Row {
rule: Rule::pattern("a|bc"),
rules: vec![Rule::pattern("a|bc")],
examples: vec![
("a12", Some("a")),
("bc12", Some("bc")),
@ -238,7 +281,7 @@ mod tests {
],
},
Row {
rule: Rule::pattern("(a|b|c)d(e|f|g)h?"),
rules: vec![Rule::pattern("(a|b|c)d(e|f|g)h?")],
examples: vec![
("ade1", Some("ade")),
("bdf1", Some("bdf")),
@ -247,11 +290,14 @@ mod tests {
],
},
Row {
rule: Rule::pattern("a*"),
examples: vec![("aaa1", Some("aaa")), ("b", Some(""))],
rules: vec![Rule::pattern("a*")],
examples: vec![
("aaa1", Some("aaa")),
("b", Some("")),
],
},
Row {
rule: Rule::pattern("a((bc)+|(de)*)f"),
rules: vec![Rule::pattern("a((bc)+|(de)*)f")],
examples: vec![
("af1", Some("af")),
("adedef1", Some("adedef")),
@ -260,32 +306,51 @@ mod tests {
],
},
Row {
rule: Rule::pattern("[a-fA-F0-9]+"),
examples: vec![("A1ff0", Some("A1ff"))],
rules: vec![Rule::pattern("[a-fA-F0-9]+")],
examples: vec![
("A1ff0", Some("A1ff")),
],
},
Row {
rule: Rule::pattern("\\w\\d\\s"),
examples: vec![("_0 ", Some("_0 "))],
rules: vec![Rule::pattern("\\w\\d\\s")],
examples: vec![
("_0 ", Some("_0 ")),
],
},
Row {
rule: Rule::string("abc"),
examples: vec![("abcd", Some("abc")), ("ab", None)],
rules: vec![Rule::string("abc")],
examples: vec![
("abcd", Some("abc")),
("ab", None)
],
},
Row {
rule: Rule::repeat(Rule::seq(vec![
Rule::string("{"),
Rule::pattern("[a-f]+"),
Rule::string("}"),
])),
examples: vec![("{a}{", Some("{a}")), ("{a}{d", Some("{a}")), ("ab", None)],
rules: vec![
Rule::repeat(Rule::seq(vec![
Rule::string("{"),
Rule::pattern("[a-f]+"),
Rule::string("}"),
])),
],
examples: vec![
("{a}{", Some("{a}")),
("{a}{d", Some("{a}")),
("ab", None),
],
},
];
for Row { rule, examples } in table.iter() {
let mut nfa = Nfa::new();
expand_rule(rule.clone(), &mut nfa, 0).unwrap();
for Row { rules, examples } in &table {
let grammar = expand_tokens(ExtractedLexicalGrammar {
separators: vec![],
variables: rules
.into_iter()
.map(|rule| Variable::named("", rule.clone()))
.collect(),
}).unwrap();
for (haystack, needle) in examples.iter() {
assert_eq!(simulate_nfa(&nfa, haystack), *needle);
assert_eq!(simulate_nfa(&grammar.nfa, haystack), *needle);
}
}
}

View file

@ -130,24 +130,24 @@ mod tests {
};
let lexical_grammar = LexicalGrammar {
nfa: Nfa::new(),
variables: vec![
LexicalVariable {
name: "t1".to_string(),
kind: VariableType::Anonymous,
nfa: Nfa::new(),
start_state: 0,
},
LexicalVariable {
name: "t2".to_string(),
kind: VariableType::Anonymous,
nfa: Nfa::new(),
start_state: 0,
},
LexicalVariable {
name: "t3".to_string(),
kind: VariableType::Anonymous,
nfa: Nfa::new(),
start_state: 0,
}
],
separators: Vec::new(),
};
let simple_aliases = extract_simple_aliases(&mut syntax_grammar, &lexical_grammar);

View file

@ -30,7 +30,6 @@ pub(crate) struct MetadataParams {
pub is_string: bool,
pub is_active: bool,
pub is_main_token: bool,
pub is_excluded: bool,
pub alias: Option<Alias>,
}