diff --git a/src/grammars.rs b/src/grammars.rs index b76a583e..74c213e1 100644 --- a/src/grammars.rs +++ b/src/grammars.rs @@ -35,13 +35,13 @@ pub(crate) struct InputGrammar { pub(crate) struct LexicalVariable { pub name: String, pub kind: VariableType, - pub nfa: Nfa, + pub start_state: u32, } #[derive(Debug, PartialEq, Eq)] pub(crate) struct LexicalGrammar { + pub nfa: Nfa, pub variables: Vec<LexicalVariable>, - pub separators: Vec<Nfa>, } // Extracted syntax grammar diff --git a/src/nfa.rs b/src/nfa.rs index 22cb2a2e..66861434 100644 --- a/src/nfa.rs +++ b/src/nfa.rs @@ -9,9 +9,13 @@ pub enum CharacterSet { #[derive(Debug, PartialEq, Eq)] pub enum NfaState { - Advance(CharacterSet, u32), + Advance { + chars: CharacterSet, + state: u32, + is_sep: bool, + }, Split(u32, u32), - Accept, + Accept(usize), } #[derive(PartialEq, Eq)] @@ -23,6 +27,7 @@ pub struct Nfa { pub struct NfaCursor<'a> { indices: Vec<u32>, nfa: &'a Nfa, + in_sep: bool, } impl CharacterSet { @@ -88,15 +93,15 @@ impl CharacterSet { impl Nfa { pub fn new() -> Self { - Nfa { states: vec![NfaState::Accept] } + Nfa { states: Vec::new() } } - pub fn start_index(&self) -> u32 { + pub fn last_state(&self) -> u32 { self.states.len() as u32 - 1 } pub fn prepend(&mut self, f: impl Fn(u32) -> NfaState) { - self.states.push(f(self.start_index())); + self.states.push(f(self.last_state())); } } @@ -116,38 +121,45 @@ impl fmt::Debug for Nfa { impl<'a> NfaCursor<'a> { pub fn new(nfa: &'a Nfa) -> Self { - let mut result = Self { nfa, indices: Vec::new() }; - result.add_indices(&mut vec![nfa.start_index()]); + let mut result = Self { nfa, indices: Vec::new(), in_sep: true }; + result.add_states(&mut vec![nfa.last_state()]); result } pub fn advance(&mut self, c: char) -> bool { let mut result = false; let mut new_indices = Vec::new(); + let mut any_sep_transitions = false; for index in &self.indices { - if let NfaState::Advance(chars, next_index) = &self.nfa.states[*index as usize] { + if let NfaState::Advance { chars, state, is_sep } = 
&self.nfa.states[*index as usize] { + if *is_sep { + any_sep_transitions = true; + } if chars.contains(c) { - new_indices.push(*next_index); + new_indices.push(*state); result = true; } } } + if !any_sep_transitions { + self.in_sep = false; + } self.indices.clear(); - self.add_indices(&mut new_indices); + self.add_states(&mut new_indices); result } - pub fn is_done(&self) -> bool { - self.indices.iter().any(|index| { - if let NfaState::Accept = self.nfa.states[*index as usize] { - true + pub fn finished_ids<'b>(&'b self) -> impl Iterator<Item = usize> + 'b { + self.indices.iter().filter_map(move |index| { + if let NfaState::Accept(i) = self.nfa.states[*index as usize] { + Some(i) } else { - false + None } }) } - pub fn add_indices(&mut self, new_indices: &mut Vec<u32>) { + pub fn add_states(&mut self, new_indices: &mut Vec<u32>) { while let Some(index) = new_indices.pop() { let state = &self.nfa.states[index as usize]; if let NfaState::Split(left, right) = state { diff --git a/src/prepare_grammar/expand_tokens.rs b/src/prepare_grammar/expand_tokens.rs index e0e1f9a9..3019b2be 100644 --- a/src/prepare_grammar/expand_tokens.rs +++ b/src/prepare_grammar/expand_tokens.rs @@ -39,40 +39,46 @@ fn expand_character_class(item: &ClassSetItem) -> Result<CharacterSet> { } } -fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32) -> Result<()> { +fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32, is_sep: bool) -> Result<bool> { match ast { - Ast::Empty(_) => Ok(()), + Ast::Empty(_) => Ok(false), Ast::Flags(_) => Err(Error::regex("Flags are not supported")), Ast::Literal(literal) => { - nfa.states.push(NfaState::Advance( - CharacterSet::Include(vec![literal.c]), - next_state_index, - )); - Ok(()) + nfa.states.push(NfaState::Advance { + chars: CharacterSet::Include(vec![literal.c]), + state: next_state_index, + is_sep, + }); + Ok(true) } Ast::Dot(_) => { - nfa.states.push(NfaState::Advance( - CharacterSet::Exclude(vec!['\n']), - next_state_index, - )); - Ok(()) + 
nfa.states.push(NfaState::Advance { + chars: CharacterSet::Exclude(vec!['\n']), + state: next_state_index, + is_sep, + }); + Ok(true) } Ast::Assertion(_) => Err(Error::regex("Assertions are not supported")), Ast::Class(class) => match class { Class::Unicode(_) => Err(Error::regex("Unicode character classes are not supported")), Class::Perl(class) => { - nfa.states.push(NfaState::Advance( - expand_perl_character_class(&class.kind), - next_state_index, - )); - Ok(()) + nfa.states.push(NfaState::Advance { + chars: expand_perl_character_class(&class.kind), + state: next_state_index, + is_sep, + }); + Ok(true) } Class::Bracketed(class) => match &class.kind { ClassSet::Item(item) => { let character_set = expand_character_class(&item)?; - nfa.states - .push(NfaState::Advance(character_set, next_state_index)); - Ok(()) + nfa.states.push(NfaState::Advance { + chars: character_set, + state: next_state_index, + is_sep, + }); + Ok(true) } ClassSet::BinaryOp(_) => Err(Error::regex( "Binary operators in character classes aren't supported", @@ -81,134 +87,171 @@ fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32) -> Result<( }, Ast::Repetition(repetition) => match repetition.op.kind { RepetitionKind::ZeroOrOne => { - expand_regex(&repetition.ast, nfa, next_state_index)?; - nfa.prepend(|start_index| NfaState::Split(next_state_index, start_index)); - Ok(()) + if expand_regex(&repetition.ast, nfa, next_state_index, is_sep)? 
{ + nfa.prepend(|last_state| NfaState::Split(next_state_index, last_state)); + Ok(true) + } else { + Ok(false) + } } RepetitionKind::OneOrMore => { - nfa.states.push(NfaState::Accept); // Placeholder for split - let split_index = nfa.start_index(); - expand_regex(&repetition.ast, nfa, split_index)?; - nfa.states[split_index as usize] = - NfaState::Split(nfa.start_index(), next_state_index); - Ok(()) + nfa.states.push(NfaState::Accept(0)); // Placeholder for split + let split_index = nfa.last_state(); + if expand_regex(&repetition.ast, nfa, split_index, is_sep)? { + nfa.states[split_index as usize] = + NfaState::Split(nfa.last_state(), next_state_index); + Ok(true) + } else { + nfa.states.pop(); + Ok(false) + } } RepetitionKind::ZeroOrMore => { - nfa.states.push(NfaState::Accept); // Placeholder for split - let split_index = nfa.start_index(); - expand_regex(&repetition.ast, nfa, split_index)?; - nfa.states[split_index as usize] = - NfaState::Split(nfa.start_index(), next_state_index); - nfa.prepend(|start_index| NfaState::Split(start_index, next_state_index)); - Ok(()) + nfa.states.push(NfaState::Accept(0)); // Placeholder for split + let split_index = nfa.last_state(); + if expand_regex(&repetition.ast, nfa, split_index, is_sep)? { + nfa.states[split_index as usize] = + NfaState::Split(nfa.last_state(), next_state_index); + nfa.prepend(|last_state| NfaState::Split(last_state, next_state_index)); + Ok(true) + } else { + Ok(false) + } } RepetitionKind::Range(_) => unimplemented!(), }, - Ast::Group(group) => expand_regex(&group.ast, nfa, nfa.start_index()), + Ast::Group(group) => expand_regex(&group.ast, nfa, nfa.last_state(), is_sep), Ast::Alternation(alternation) => { let mut alternative_start_indices = Vec::new(); for ast in alternation.asts.iter() { - expand_regex(&ast, nfa, next_state_index)?; - alternative_start_indices.push(nfa.start_index()); + if expand_regex(&ast, nfa, next_state_index, is_sep)? 
{ + alternative_start_indices.push(nfa.last_state()); + } } alternative_start_indices.pop(); for alternative_start_index in alternative_start_indices { - nfa.prepend(|start_index| NfaState::Split(start_index, alternative_start_index)); + nfa.prepend(|last_state| NfaState::Split(last_state, alternative_start_index)); } - Ok(()) + Ok(true) } Ast::Concat(concat) => { + let mut result = false; for ast in concat.asts.iter().rev() { - expand_regex(&ast, nfa, next_state_index)?; - next_state_index = nfa.start_index(); + if expand_regex(&ast, nfa, next_state_index, is_sep)? { + result = true; + } + next_state_index = nfa.last_state(); } - Ok(()) + Ok(result) } } } -fn expand_rule(rule: Rule, nfa: &mut Nfa, mut next_state_index: u32) -> Result<()> { +fn expand_rule(rule: &Rule, nfa: &mut Nfa, mut next_state_index: u32, is_sep: bool) -> Result<bool> { match rule { Rule::Pattern(s) => { let ast = parse::Parser::new() .parse(&s) .map_err(|e| Error::GrammarError(e.to_string()))?; - expand_regex(&ast, nfa, next_state_index)?; - Ok(()) + expand_regex(&ast, nfa, next_state_index, is_sep) } Rule::String(s) => { for c in s.chars().rev() { - nfa.prepend(|start_index| { - NfaState::Advance(CharacterSet::empty().add_char(c), start_index) + nfa.prepend(|last_state| { + NfaState::Advance { + chars: CharacterSet::empty().add_char(c), + state: last_state, + is_sep, + } }); } - Ok(()) + Ok(s.len() > 0) } Rule::Choice(elements) => { let mut alternative_start_indices = Vec::new(); for element in elements { - expand_rule(element, nfa, next_state_index)?; - alternative_start_indices.push(nfa.start_index()); + if expand_rule(element, nfa, next_state_index, is_sep)? 
{ + alternative_start_indices.push(nfa.last_state()); + } } alternative_start_indices.pop(); for alternative_start_index in alternative_start_indices { - nfa.prepend(|start_index| NfaState::Split(start_index, alternative_start_index)); + nfa.prepend(|last_state| NfaState::Split(last_state, alternative_start_index)); } - Ok(()) + Ok(true) } Rule::Seq(elements) => { + let mut result = false; for element in elements.into_iter().rev() { - expand_rule(element, nfa, next_state_index)?; - next_state_index = nfa.start_index(); + if expand_rule(element, nfa, next_state_index, is_sep)? { + result = true; + } + next_state_index = nfa.last_state(); } - Ok(()) + Ok(result) } Rule::Repeat(rule) => { - nfa.states.push(NfaState::Accept); // Placeholder for split - let split_index = nfa.start_index(); - expand_rule(*rule, nfa, split_index)?; - nfa.states[split_index as usize] = NfaState::Split(nfa.start_index(), next_state_index); - Ok(()) + nfa.states.push(NfaState::Accept(0)); // Placeholder for split + let split_index = nfa.last_state(); + if expand_rule(rule, nfa, split_index, is_sep)? { + nfa.states[split_index as usize] = NfaState::Split(nfa.last_state(), next_state_index); + Ok(true) + } else { + Ok(false) + } } - _ => Err(Error::grammar("Unexpected rule type")), + Rule::Blank => Ok(false), + _ => Err(Error::grammar(&format!("Unexpected rule {:?}", rule))), } } pub(super) fn expand_tokens(grammar: ExtractedLexicalGrammar) -> Result<LexicalGrammar> { + let mut nfa = Nfa::new(); + + let separator_rule = if grammar.separators.len() > 0 { + Rule::repeat(Rule::choice(grammar.separators)) + } else { + Rule::Blank + }; + let mut variables = Vec::new(); - for variable in grammar.variables { - let mut nfa = Nfa::new(); - expand_rule(variable.rule, &mut nfa, 0)?; + for (i, variable) in grammar.variables.into_iter().enumerate() { + let is_immediate_token = match &variable.rule { + Rule::Metadata { params, .. 
} => params.is_main_token, + _ => false, + }; + + nfa.states.push(NfaState::Accept(i)); + let last_state = nfa.last_state(); + expand_rule(&variable.rule, &mut nfa, last_state, false)?; + + if !is_immediate_token { + let last_state = nfa.last_state(); + expand_rule(&separator_rule, &mut nfa, last_state, true)?; + } + variables.push(LexicalVariable { name: variable.name, kind: variable.kind, - nfa, + start_state: nfa.last_state(), }); } - let mut separators = Vec::new(); - for separator in grammar.separators { - let mut nfa = Nfa::new(); - expand_rule(separator, &mut nfa, 0)?; - separators.push(nfa); - } - Ok(LexicalGrammar { - variables, - separators, - }) + Ok(LexicalGrammar { nfa, variables }) } #[cfg(test)] mod tests { use super::*; use crate::nfa::NfaCursor; + use crate::grammars::Variable; fn simulate_nfa<'a>(nfa: &'a Nfa, s: &'a str) -> Option<&'a str> { let mut result = None; let mut char_count = 0; let mut cursor = NfaCursor::new(nfa); for c in s.chars() { - if cursor.is_done() { + if cursor.finished_ids().count() > 0 { result = Some(&s[0..char_count]); } if cursor.advance(c) { @@ -223,13 +266,13 @@ mod tests { #[test] fn test_rule_expansion() { struct Row { - rule: Rule, + rules: Vec, examples: Vec<(&'static str, Option<&'static str>)>, } let table = [ Row { - rule: Rule::pattern("a|bc"), + rules: vec![Rule::pattern("a|bc")], examples: vec![ ("a12", Some("a")), ("bc12", Some("bc")), @@ -238,7 +281,7 @@ mod tests { ], }, Row { - rule: Rule::pattern("(a|b|c)d(e|f|g)h?"), + rules: vec![Rule::pattern("(a|b|c)d(e|f|g)h?")], examples: vec![ ("ade1", Some("ade")), ("bdf1", Some("bdf")), @@ -247,11 +290,14 @@ mod tests { ], }, Row { - rule: Rule::pattern("a*"), - examples: vec![("aaa1", Some("aaa")), ("b", Some(""))], + rules: vec![Rule::pattern("a*")], + examples: vec![ + ("aaa1", Some("aaa")), + ("b", Some("")), + ], }, Row { - rule: Rule::pattern("a((bc)+|(de)*)f"), + rules: vec![Rule::pattern("a((bc)+|(de)*)f")], examples: vec![ ("af1", Some("af")), 
("adedef1", Some("adedef")), @@ -260,32 +306,51 @@ mod tests { ], }, Row { - rule: Rule::pattern("[a-fA-F0-9]+"), - examples: vec![("A1ff0", Some("A1ff"))], + rules: vec![Rule::pattern("[a-fA-F0-9]+")], + examples: vec![ + ("A1ff0", Some("A1ff")), + ], }, Row { - rule: Rule::pattern("\\w\\d\\s"), - examples: vec![("_0 ", Some("_0 "))], + rules: vec![Rule::pattern("\\w\\d\\s")], + examples: vec![ + ("_0 ", Some("_0 ")), + ], }, Row { - rule: Rule::string("abc"), - examples: vec![("abcd", Some("abc")), ("ab", None)], + rules: vec![Rule::string("abc")], + examples: vec![ + ("abcd", Some("abc")), + ("ab", None) + ], }, Row { - rule: Rule::repeat(Rule::seq(vec![ - Rule::string("{"), - Rule::pattern("[a-f]+"), - Rule::string("}"), - ])), - examples: vec![("{a}{", Some("{a}")), ("{a}{d", Some("{a}")), ("ab", None)], + rules: vec![ + Rule::repeat(Rule::seq(vec![ + Rule::string("{"), + Rule::pattern("[a-f]+"), + Rule::string("}"), + ])), + ], + examples: vec![ + ("{a}{", Some("{a}")), + ("{a}{d", Some("{a}")), + ("ab", None), + ], }, ]; - for Row { rule, examples } in table.iter() { - let mut nfa = Nfa::new(); - expand_rule(rule.clone(), &mut nfa, 0).unwrap(); + for Row { rules, examples } in &table { + let grammar = expand_tokens(ExtractedLexicalGrammar { + separators: vec![], + variables: rules + .into_iter() + .map(|rule| Variable::named("", rule.clone())) + .collect(), + }).unwrap(); + for (haystack, needle) in examples.iter() { - assert_eq!(simulate_nfa(&nfa, haystack), *needle); + assert_eq!(simulate_nfa(&grammar.nfa, haystack), *needle); } } } diff --git a/src/prepare_grammar/extract_simple_aliases.rs b/src/prepare_grammar/extract_simple_aliases.rs index a10c7982..8b87ea2e 100644 --- a/src/prepare_grammar/extract_simple_aliases.rs +++ b/src/prepare_grammar/extract_simple_aliases.rs @@ -130,24 +130,24 @@ mod tests { }; let lexical_grammar = LexicalGrammar { + nfa: Nfa::new(), variables: vec![ LexicalVariable { name: "t1".to_string(), kind: VariableType::Anonymous, - 
nfa: Nfa::new(), + start_state: 0, }, LexicalVariable { name: "t2".to_string(), kind: VariableType::Anonymous, - nfa: Nfa::new(), + start_state: 0, }, LexicalVariable { name: "t3".to_string(), kind: VariableType::Anonymous, - nfa: Nfa::new(), + start_state: 0, } ], - separators: Vec::new(), }; let simple_aliases = extract_simple_aliases(&mut syntax_grammar, &lexical_grammar); diff --git a/src/rules.rs b/src/rules.rs index 5d0af86c..d7234f45 100644 --- a/src/rules.rs +++ b/src/rules.rs @@ -30,7 +30,6 @@ pub(crate) struct MetadataParams { pub is_string: bool, pub is_active: bool, pub is_main_token: bool, - pub is_excluded: bool, pub alias: Option<Alias>, }