Integrate separator rules into lexer NFA

This commit is contained in:
Max Brunsfeld 2018-12-12 18:04:29 -08:00
parent 40d24097ec
commit 0103a83f3f
5 changed files with 199 additions and 123 deletions

View file

@ -35,13 +35,13 @@ pub(crate) struct InputGrammar {
pub(crate) struct LexicalVariable {
pub name: String,
pub kind: VariableType,
pub nfa: Nfa,
pub start_state: u32,
}
#[derive(Debug, PartialEq, Eq)]
pub(crate) struct LexicalGrammar {
pub nfa: Nfa,
pub variables: Vec<LexicalVariable>,
pub separators: Vec<Nfa>,
}
// Extracted syntax grammar

View file

@ -9,9 +9,13 @@ pub enum CharacterSet {
#[derive(Debug, PartialEq, Eq)]
pub enum NfaState {
Advance(CharacterSet, u32),
Advance {
chars: CharacterSet,
state: u32,
is_sep: bool,
},
Split(u32, u32),
Accept,
Accept(usize),
}
#[derive(PartialEq, Eq)]
@ -23,6 +27,7 @@ pub struct Nfa {
pub struct NfaCursor<'a> {
indices: Vec<u32>,
nfa: &'a Nfa,
in_sep: bool,
}
impl CharacterSet {
@ -88,15 +93,15 @@ impl CharacterSet {
impl Nfa {
pub fn new() -> Self {
Nfa { states: vec![NfaState::Accept] }
Nfa { states: Vec::new() }
}
pub fn start_index(&self) -> u32 {
pub fn last_state(&self) -> u32 {
self.states.len() as u32 - 1
}
pub fn prepend(&mut self, f: impl Fn(u32) -> NfaState) {
self.states.push(f(self.start_index()));
self.states.push(f(self.last_state()));
}
}
@ -116,38 +121,45 @@ impl fmt::Debug for Nfa {
impl<'a> NfaCursor<'a> {
pub fn new(nfa: &'a Nfa) -> Self {
let mut result = Self { nfa, indices: Vec::new() };
result.add_indices(&mut vec![nfa.start_index()]);
let mut result = Self { nfa, indices: Vec::new(), in_sep: true };
result.add_states(&mut vec![nfa.last_state()]);
result
}
pub fn advance(&mut self, c: char) -> bool {
let mut result = false;
let mut new_indices = Vec::new();
let mut any_sep_transitions = false;
for index in &self.indices {
if let NfaState::Advance(chars, next_index) = &self.nfa.states[*index as usize] {
if let NfaState::Advance { chars, state, is_sep } = &self.nfa.states[*index as usize] {
if *is_sep {
any_sep_transitions = true;
}
if chars.contains(c) {
new_indices.push(*next_index);
new_indices.push(*state);
result = true;
}
}
}
if !any_sep_transitions {
self.in_sep = false;
}
self.indices.clear();
self.add_indices(&mut new_indices);
self.add_states(&mut new_indices);
result
}
pub fn is_done(&self) -> bool {
self.indices.iter().any(|index| {
if let NfaState::Accept = self.nfa.states[*index as usize] {
true
pub fn finished_ids<'b>(&'b self) -> impl Iterator<Item = usize> + 'b {
self.indices.iter().filter_map(move |index| {
if let NfaState::Accept(i) = self.nfa.states[*index as usize] {
Some(i)
} else {
false
None
}
})
}
pub fn add_indices(&mut self, new_indices: &mut Vec<u32>) {
pub fn add_states(&mut self, new_indices: &mut Vec<u32>) {
while let Some(index) = new_indices.pop() {
let state = &self.nfa.states[index as usize];
if let NfaState::Split(left, right) = state {

View file

@ -39,40 +39,46 @@ fn expand_character_class(item: &ClassSetItem) -> Result<CharacterSet> {
}
}
fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32) -> Result<()> {
fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32, is_sep: bool) -> Result<bool> {
match ast {
Ast::Empty(_) => Ok(()),
Ast::Empty(_) => Ok(false),
Ast::Flags(_) => Err(Error::regex("Flags are not supported")),
Ast::Literal(literal) => {
nfa.states.push(NfaState::Advance(
CharacterSet::Include(vec![literal.c]),
next_state_index,
));
Ok(())
nfa.states.push(NfaState::Advance {
chars: CharacterSet::Include(vec![literal.c]),
state: next_state_index,
is_sep,
});
Ok(true)
}
Ast::Dot(_) => {
nfa.states.push(NfaState::Advance(
CharacterSet::Exclude(vec!['\n']),
next_state_index,
));
Ok(())
nfa.states.push(NfaState::Advance {
chars: CharacterSet::Exclude(vec!['\n']),
state: next_state_index,
is_sep,
});
Ok(true)
}
Ast::Assertion(_) => Err(Error::regex("Assertions are not supported")),
Ast::Class(class) => match class {
Class::Unicode(_) => Err(Error::regex("Unicode character classes are not supported")),
Class::Perl(class) => {
nfa.states.push(NfaState::Advance(
expand_perl_character_class(&class.kind),
next_state_index,
));
Ok(())
nfa.states.push(NfaState::Advance {
chars: expand_perl_character_class(&class.kind),
state: next_state_index,
is_sep,
});
Ok(true)
}
Class::Bracketed(class) => match &class.kind {
ClassSet::Item(item) => {
let character_set = expand_character_class(&item)?;
nfa.states
.push(NfaState::Advance(character_set, next_state_index));
Ok(())
nfa.states.push(NfaState::Advance {
chars: character_set,
state: next_state_index,
is_sep,
});
Ok(true)
}
ClassSet::BinaryOp(_) => Err(Error::regex(
"Binary operators in character classes aren't supported",
@ -81,134 +87,171 @@ fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32) -> Result<(
},
Ast::Repetition(repetition) => match repetition.op.kind {
RepetitionKind::ZeroOrOne => {
expand_regex(&repetition.ast, nfa, next_state_index)?;
nfa.prepend(|start_index| NfaState::Split(next_state_index, start_index));
Ok(())
if expand_regex(&repetition.ast, nfa, next_state_index, is_sep)? {
nfa.prepend(|last_state| NfaState::Split(next_state_index, last_state));
Ok(true)
} else {
Ok(false)
}
}
RepetitionKind::OneOrMore => {
nfa.states.push(NfaState::Accept); // Placeholder for split
let split_index = nfa.start_index();
expand_regex(&repetition.ast, nfa, split_index)?;
nfa.states[split_index as usize] =
NfaState::Split(nfa.start_index(), next_state_index);
Ok(())
nfa.states.push(NfaState::Accept(0)); // Placeholder for split
let split_index = nfa.last_state();
if expand_regex(&repetition.ast, nfa, split_index, is_sep)? {
nfa.states[split_index as usize] =
NfaState::Split(nfa.last_state(), next_state_index);
Ok(true)
} else {
nfa.states.pop();
Ok(false)
}
}
RepetitionKind::ZeroOrMore => {
nfa.states.push(NfaState::Accept); // Placeholder for split
let split_index = nfa.start_index();
expand_regex(&repetition.ast, nfa, split_index)?;
nfa.states[split_index as usize] =
NfaState::Split(nfa.start_index(), next_state_index);
nfa.prepend(|start_index| NfaState::Split(start_index, next_state_index));
Ok(())
nfa.states.push(NfaState::Accept(0)); // Placeholder for split
let split_index = nfa.last_state();
if expand_regex(&repetition.ast, nfa, split_index, is_sep)? {
nfa.states[split_index as usize] =
NfaState::Split(nfa.last_state(), next_state_index);
nfa.prepend(|last_state| NfaState::Split(last_state, next_state_index));
Ok(true)
} else {
Ok(false)
}
}
RepetitionKind::Range(_) => unimplemented!(),
},
Ast::Group(group) => expand_regex(&group.ast, nfa, nfa.start_index()),
Ast::Group(group) => expand_regex(&group.ast, nfa, nfa.last_state(), is_sep),
Ast::Alternation(alternation) => {
let mut alternative_start_indices = Vec::new();
for ast in alternation.asts.iter() {
expand_regex(&ast, nfa, next_state_index)?;
alternative_start_indices.push(nfa.start_index());
if expand_regex(&ast, nfa, next_state_index, is_sep)? {
alternative_start_indices.push(nfa.last_state());
}
}
alternative_start_indices.pop();
for alternative_start_index in alternative_start_indices {
nfa.prepend(|start_index| NfaState::Split(start_index, alternative_start_index));
nfa.prepend(|last_state| NfaState::Split(last_state, alternative_start_index));
}
Ok(())
Ok(true)
}
Ast::Concat(concat) => {
let mut result = false;
for ast in concat.asts.iter().rev() {
expand_regex(&ast, nfa, next_state_index)?;
next_state_index = nfa.start_index();
if expand_regex(&ast, nfa, next_state_index, is_sep)? {
result = true;
}
next_state_index = nfa.last_state();
}
Ok(())
Ok(result)
}
}
}
fn expand_rule(rule: Rule, nfa: &mut Nfa, mut next_state_index: u32) -> Result<()> {
fn expand_rule(rule: &Rule, nfa: &mut Nfa, mut next_state_index: u32, is_sep: bool) -> Result<bool> {
match rule {
Rule::Pattern(s) => {
let ast = parse::Parser::new()
.parse(&s)
.map_err(|e| Error::GrammarError(e.to_string()))?;
expand_regex(&ast, nfa, next_state_index)?;
Ok(())
expand_regex(&ast, nfa, next_state_index, is_sep)
}
Rule::String(s) => {
for c in s.chars().rev() {
nfa.prepend(|start_index| {
NfaState::Advance(CharacterSet::empty().add_char(c), start_index)
nfa.prepend(|last_state| {
NfaState::Advance {
chars: CharacterSet::empty().add_char(c),
state: last_state,
is_sep,
}
});
}
Ok(())
Ok(s.len() > 0)
}
Rule::Choice(elements) => {
let mut alternative_start_indices = Vec::new();
for element in elements {
expand_rule(element, nfa, next_state_index)?;
alternative_start_indices.push(nfa.start_index());
if expand_rule(element, nfa, next_state_index, is_sep)? {
alternative_start_indices.push(nfa.last_state());
}
}
alternative_start_indices.pop();
for alternative_start_index in alternative_start_indices {
nfa.prepend(|start_index| NfaState::Split(start_index, alternative_start_index));
nfa.prepend(|last_state| NfaState::Split(last_state, alternative_start_index));
}
Ok(())
Ok(true)
}
Rule::Seq(elements) => {
let mut result = false;
for element in elements.into_iter().rev() {
expand_rule(element, nfa, next_state_index)?;
next_state_index = nfa.start_index();
if expand_rule(element, nfa, next_state_index, is_sep)? {
result = true;
}
next_state_index = nfa.last_state();
}
Ok(())
Ok(result)
}
Rule::Repeat(rule) => {
nfa.states.push(NfaState::Accept); // Placeholder for split
let split_index = nfa.start_index();
expand_rule(*rule, nfa, split_index)?;
nfa.states[split_index as usize] = NfaState::Split(nfa.start_index(), next_state_index);
Ok(())
nfa.states.push(NfaState::Accept(0)); // Placeholder for split
let split_index = nfa.last_state();
if expand_rule(rule, nfa, split_index, is_sep)? {
nfa.states[split_index as usize] = NfaState::Split(nfa.last_state(), next_state_index);
Ok(true)
} else {
Ok(false)
}
}
_ => Err(Error::grammar("Unexpected rule type")),
Rule::Blank => Ok(false),
_ => Err(Error::grammar(&format!("Unexpected rule {:?}", rule))),
}
}
pub(super) fn expand_tokens(grammar: ExtractedLexicalGrammar) -> Result<LexicalGrammar> {
let mut nfa = Nfa::new();
let separator_rule = if grammar.separators.len() > 0 {
Rule::repeat(Rule::choice(grammar.separators))
} else {
Rule::Blank
};
let mut variables = Vec::new();
for variable in grammar.variables {
let mut nfa = Nfa::new();
expand_rule(variable.rule, &mut nfa, 0)?;
for (i, variable) in grammar.variables.into_iter().enumerate() {
let is_immediate_token = match &variable.rule {
Rule::Metadata { params, .. } => params.is_main_token,
_ => false,
};
nfa.states.push(NfaState::Accept(i));
let last_state = nfa.last_state();
expand_rule(&variable.rule, &mut nfa, last_state, false)?;
if !is_immediate_token {
let last_state = nfa.last_state();
expand_rule(&separator_rule, &mut nfa, last_state, true)?;
}
variables.push(LexicalVariable {
name: variable.name,
kind: variable.kind,
nfa,
start_state: nfa.last_state(),
});
}
let mut separators = Vec::new();
for separator in grammar.separators {
let mut nfa = Nfa::new();
expand_rule(separator, &mut nfa, 0)?;
separators.push(nfa);
}
Ok(LexicalGrammar {
variables,
separators,
})
Ok(LexicalGrammar { nfa, variables })
}
#[cfg(test)]
mod tests {
use super::*;
use crate::nfa::NfaCursor;
use crate::grammars::Variable;
fn simulate_nfa<'a>(nfa: &'a Nfa, s: &'a str) -> Option<&'a str> {
let mut result = None;
let mut char_count = 0;
let mut cursor = NfaCursor::new(nfa);
for c in s.chars() {
if cursor.is_done() {
if cursor.finished_ids().count() > 0 {
result = Some(&s[0..char_count]);
}
if cursor.advance(c) {
@ -223,13 +266,13 @@ mod tests {
#[test]
fn test_rule_expansion() {
struct Row {
rule: Rule,
rules: Vec<Rule>,
examples: Vec<(&'static str, Option<&'static str>)>,
}
let table = [
Row {
rule: Rule::pattern("a|bc"),
rules: vec![Rule::pattern("a|bc")],
examples: vec![
("a12", Some("a")),
("bc12", Some("bc")),
@ -238,7 +281,7 @@ mod tests {
],
},
Row {
rule: Rule::pattern("(a|b|c)d(e|f|g)h?"),
rules: vec![Rule::pattern("(a|b|c)d(e|f|g)h?")],
examples: vec![
("ade1", Some("ade")),
("bdf1", Some("bdf")),
@ -247,11 +290,14 @@ mod tests {
],
},
Row {
rule: Rule::pattern("a*"),
examples: vec![("aaa1", Some("aaa")), ("b", Some(""))],
rules: vec![Rule::pattern("a*")],
examples: vec![
("aaa1", Some("aaa")),
("b", Some("")),
],
},
Row {
rule: Rule::pattern("a((bc)+|(de)*)f"),
rules: vec![Rule::pattern("a((bc)+|(de)*)f")],
examples: vec![
("af1", Some("af")),
("adedef1", Some("adedef")),
@ -260,32 +306,51 @@ mod tests {
],
},
Row {
rule: Rule::pattern("[a-fA-F0-9]+"),
examples: vec![("A1ff0", Some("A1ff"))],
rules: vec![Rule::pattern("[a-fA-F0-9]+")],
examples: vec![
("A1ff0", Some("A1ff")),
],
},
Row {
rule: Rule::pattern("\\w\\d\\s"),
examples: vec![("_0 ", Some("_0 "))],
rules: vec![Rule::pattern("\\w\\d\\s")],
examples: vec![
("_0 ", Some("_0 ")),
],
},
Row {
rule: Rule::string("abc"),
examples: vec![("abcd", Some("abc")), ("ab", None)],
rules: vec![Rule::string("abc")],
examples: vec![
("abcd", Some("abc")),
("ab", None)
],
},
Row {
rule: Rule::repeat(Rule::seq(vec![
Rule::string("{"),
Rule::pattern("[a-f]+"),
Rule::string("}"),
])),
examples: vec![("{a}{", Some("{a}")), ("{a}{d", Some("{a}")), ("ab", None)],
rules: vec![
Rule::repeat(Rule::seq(vec![
Rule::string("{"),
Rule::pattern("[a-f]+"),
Rule::string("}"),
])),
],
examples: vec![
("{a}{", Some("{a}")),
("{a}{d", Some("{a}")),
("ab", None),
],
},
];
for Row { rule, examples } in table.iter() {
let mut nfa = Nfa::new();
expand_rule(rule.clone(), &mut nfa, 0).unwrap();
for Row { rules, examples } in &table {
let grammar = expand_tokens(ExtractedLexicalGrammar {
separators: vec![],
variables: rules
.into_iter()
.map(|rule| Variable::named("", rule.clone()))
.collect(),
}).unwrap();
for (haystack, needle) in examples.iter() {
assert_eq!(simulate_nfa(&nfa, haystack), *needle);
assert_eq!(simulate_nfa(&grammar.nfa, haystack), *needle);
}
}
}

View file

@ -130,24 +130,24 @@ mod tests {
};
let lexical_grammar = LexicalGrammar {
nfa: Nfa::new(),
variables: vec![
LexicalVariable {
name: "t1".to_string(),
kind: VariableType::Anonymous,
nfa: Nfa::new(),
start_state: 0,
},
LexicalVariable {
name: "t2".to_string(),
kind: VariableType::Anonymous,
nfa: Nfa::new(),
start_state: 0,
},
LexicalVariable {
name: "t3".to_string(),
kind: VariableType::Anonymous,
nfa: Nfa::new(),
start_state: 0,
}
],
separators: Vec::new(),
};
let simple_aliases = extract_simple_aliases(&mut syntax_grammar, &lexical_grammar);

View file

@ -30,7 +30,6 @@ pub(crate) struct MetadataParams {
pub is_string: bool,
pub is_active: bool,
pub is_main_token: bool,
pub is_excluded: bool,
pub alias: Option<Alias>,
}