From 5fa586f7c92916db288e258c91a0424e3af04f30 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 12 Dec 2018 21:01:41 -0800 Subject: [PATCH] Format expand_tokens file --- src/prepare_grammar/expand_tokens.rs | 281 +++++++++++++-------------- 1 file changed, 130 insertions(+), 151 deletions(-) diff --git a/src/prepare_grammar/expand_tokens.rs b/src/prepare_grammar/expand_tokens.rs index 8b8cd03a..7a1d2f4d 100644 --- a/src/prepare_grammar/expand_tokens.rs +++ b/src/prepare_grammar/expand_tokens.rs @@ -5,37 +5,98 @@ use crate::nfa::{CharacterSet, Nfa, NfaState}; use crate::rules::Rule; use regex_syntax::ast::{parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, RepetitionKind}; -fn expand_perl_character_class(item: &ClassPerlKind) -> CharacterSet { - match item { - ClassPerlKind::Digit => CharacterSet::empty().add_range('0', '9'), - ClassPerlKind::Space => CharacterSet::empty() - .add_char(' ') - .add_char('\t') - .add_char('\r') - .add_char('\n'), - ClassPerlKind::Word => CharacterSet::empty() - .add_char('_') - .add_range('A', 'Z') - .add_range('a', 'z') - .add_range('0', '9'), +pub(super) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result { + let mut nfa = Nfa::new(); + + let separator_rule = if grammar.separators.len() > 0 { + grammar.separators.push(Rule::Blank); + Rule::repeat(Rule::choice(grammar.separators)) + } else { + Rule::Blank + }; + + let mut variables = Vec::new(); + for (i, variable) in grammar.variables.into_iter().enumerate() { + let is_immediate_token = match &variable.rule { + Rule::Metadata { params, .. } => params.is_main_token, + _ => false, + }; + + nfa.states.push(NfaState::Accept(i)); + let last_state_id = nfa.last_state_id(); + expand_rule(&variable.rule, &mut nfa, last_state_id, false)?; + + if !is_immediate_token { + let last_state_id = nfa.last_state_id(); + expand_rule(&separator_rule, &mut nfa, last_state_id, true)?; + } + + variables.push(LexicalVariable { + name: variable.name, + kind: variable.kind, + start_state: nfa.last_state_id(), + }); } + + Ok(LexicalGrammar { nfa, variables }) } -fn expand_character_class(item: &ClassSetItem) -> Result { - match item { - ClassSetItem::Empty(_) => Ok(CharacterSet::Include(Vec::new())), - ClassSetItem::Literal(literal) => Ok(CharacterSet::Include(vec![literal.c])), - ClassSetItem::Range(range) => { - Ok(CharacterSet::empty().add_range(range.start.c, range.end.c)) +fn expand_rule(rule: &Rule, nfa: &mut Nfa, mut next_state_id: u32, is_sep: bool) -> Result { + match rule { + Rule::Pattern(s) => { + let ast = parse::Parser::new() + .parse(&s) + .map_err(|e| Error::GrammarError(e.to_string()))?; + expand_regex(&ast, nfa, next_state_id, is_sep) } - ClassSetItem::Union(union) => { - let mut result = CharacterSet::empty(); - for item in &union.items { - result = result.add(expand_character_class(&item)?); + Rule::String(s) => { + for c in s.chars().rev() { + nfa.prepend(|last_state_id| NfaState::Advance { + chars: CharacterSet::empty().add_char(c), + state_id: last_state_id, + is_sep, + }); + } + Ok(s.len() > 0) + } + Rule::Choice(elements) => { + let mut alternative_state_ids = Vec::new(); + for element in elements { + if expand_rule(element, nfa, next_state_id, is_sep)? { + alternative_state_ids.push(nfa.last_state_id()); + } else { + alternative_state_ids.push(next_state_id); + } + } + alternative_state_ids.retain(|i| *i != nfa.last_state_id()); + for alternative_state_id in alternative_state_ids { + nfa.prepend(|last_state_id| NfaState::Split(last_state_id, alternative_state_id)); + } + Ok(true) + } + Rule::Seq(elements) => { + let mut result = false; + for element in elements.into_iter().rev() { + if expand_rule(element, nfa, next_state_id, is_sep)? { + result = true; + } + next_state_id = nfa.last_state_id(); } Ok(result) } - _ => Err(Error::regex("Unsupported character class syntax")), + Rule::Repeat(rule) => { + nfa.states.push(NfaState::Accept(0)); // Placeholder for split + let split_state_id = nfa.last_state_id(); + if expand_rule(rule, nfa, split_state_id, is_sep)? { + nfa.states[split_state_id as usize] = + NfaState::Split(nfa.last_state_id(), next_state_id); + Ok(true) + } else { + Ok(false) + } + } + Rule::Blank => Ok(false), + _ => Err(Error::grammar(&format!("Unexpected rule {:?}", rule))), } } @@ -149,107 +210,45 @@ fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_id: u32, is_sep: bool) } } -fn expand_rule(rule: &Rule, nfa: &mut Nfa, mut next_state_id: u32, is_sep: bool) -> Result { - match rule { - Rule::Pattern(s) => { - let ast = parse::Parser::new() - .parse(&s) - .map_err(|e| Error::GrammarError(e.to_string()))?; - expand_regex(&ast, nfa, next_state_id, is_sep) +fn expand_character_class(item: &ClassSetItem) -> Result { + match item { + ClassSetItem::Empty(_) => Ok(CharacterSet::Include(Vec::new())), + ClassSetItem::Literal(literal) => Ok(CharacterSet::Include(vec![literal.c])), + ClassSetItem::Range(range) => { + Ok(CharacterSet::empty().add_range(range.start.c, range.end.c)) } - Rule::String(s) => { - for c in s.chars().rev() { - nfa.prepend(|last_state_id| { - NfaState::Advance { - chars: CharacterSet::empty().add_char(c), - state_id: last_state_id, - is_sep, - } - }); - } - Ok(s.len() > 0) - } - Rule::Choice(elements) => { - let mut alternative_state_ids = Vec::new(); - for element in elements { - if expand_rule(element, nfa, next_state_id, is_sep)? { - alternative_state_ids.push(nfa.last_state_id()); - } else { - alternative_state_ids.push(next_state_id); - } - } - alternative_state_ids.retain(|i| *i != nfa.last_state_id()); - for alternative_state_id in alternative_state_ids { - nfa.prepend(|last_state_id| NfaState::Split(last_state_id, alternative_state_id)); - } - Ok(true) - } - Rule::Seq(elements) => { - let mut result = false; - for element in elements.into_iter().rev() { - if expand_rule(element, nfa, next_state_id, is_sep)? { - result = true; - } - next_state_id = nfa.last_state_id(); + ClassSetItem::Union(union) => { + let mut result = CharacterSet::empty(); + for item in &union.items { + result = result.add(expand_character_class(&item)?); } Ok(result) } - Rule::Repeat(rule) => { - nfa.states.push(NfaState::Accept(0)); // Placeholder for split - let split_state_id = nfa.last_state_id(); - if expand_rule(rule, nfa, split_state_id, is_sep)? { - nfa.states[split_state_id as usize] = NfaState::Split(nfa.last_state_id(), next_state_id); - Ok(true) - } else { - Ok(false) - } - } - Rule::Blank => Ok(false), - _ => Err(Error::grammar(&format!("Unexpected rule {:?}", rule))), + _ => Err(Error::regex("Unsupported character class syntax")), } } -pub(super) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result { - let mut nfa = Nfa::new(); - - let separator_rule = if grammar.separators.len() > 0 { - grammar.separators.push(Rule::Blank); - Rule::repeat(Rule::choice(grammar.separators)) - } else { - Rule::Blank - }; - - let mut variables = Vec::new(); - for (i, variable) in grammar.variables.into_iter().enumerate() { - let is_immediate_token = match &variable.rule { - Rule::Metadata { params, .. } => params.is_main_token, - _ => false, - }; - - nfa.states.push(NfaState::Accept(i)); - let last_state_id = nfa.last_state_id(); - expand_rule(&variable.rule, &mut nfa, last_state_id, false)?; - - if !is_immediate_token { - let last_state_id = nfa.last_state_id(); - expand_rule(&separator_rule, &mut nfa, last_state_id, true)?; - } - - variables.push(LexicalVariable { - name: variable.name, - kind: variable.kind, - start_state: nfa.last_state_id(), - }); +fn expand_perl_character_class(item: &ClassPerlKind) -> CharacterSet { + match item { + ClassPerlKind::Digit => CharacterSet::empty().add_range('0', '9'), + ClassPerlKind::Space => CharacterSet::empty() + .add_char(' ') + .add_char('\t') + .add_char('\r') + .add_char('\n'), + ClassPerlKind::Word => CharacterSet::empty() + .add_char('_') + .add_range('A', 'Z') + .add_range('a', 'z') + .add_range('0', '9'), } - - Ok(LexicalGrammar { nfa, variables }) } #[cfg(test)] mod tests { use super::*; - use crate::nfa::NfaCursor; use crate::grammars::Variable; + use crate::nfa::NfaCursor; fn simulate_nfa<'a>(grammar: &'a LexicalGrammar, s: &'a str) -> Option<(usize, &'a str)> { let start_states = grammar.variables.iter().map(|v| v.start_state).collect(); @@ -299,17 +298,12 @@ mod tests { ("ad1", None), ], }, - // regex with repeats Row { rules: vec![Rule::pattern("a*")], separators: vec![], - examples: vec![ - ("aaa1", Some((0, "aaa"))), - ("b", Some((0, ""))), - ], + examples: vec![("aaa1", Some((0, "aaa"))), ("b", Some((0, "")))], }, - // regex with repeats in sequences Row { rules: vec![Rule::pattern("a((bc)+|(de)*)f")], @@ -321,44 +315,31 @@ mod tests { ("a", None), ], }, - // regex with character ranges Row { rules: vec![Rule::pattern("[a-fA-F0-9]+")], separators: vec![], - examples: vec![ - ("A1ff0.", Some((0, "A1ff0"))), - ], + examples: vec![("A1ff0.", Some((0, "A1ff0")))], }, - // regex with perl character classes Row { rules: vec![Rule::pattern("\\w\\d\\s")], separators: vec![], - examples: vec![ - ("_0 ", Some((0, "_0 "))), - ], + examples: vec![("_0 ", Some((0, "_0 ")))], }, - // string Row { rules: vec![Rule::string("abc")], separators: vec![], - examples: vec![ - ("abcd", Some((0, "abc"))), - ("ab", None) - ], + examples: vec![("abcd", Some((0, "abc"))), ("ab", None)], }, - // complex rule containing strings and regexes Row { - rules: vec![ - Rule::repeat(Rule::seq(vec![ - Rule::string("{"), - Rule::pattern("[a-f]+"), - Rule::string("}"), - ])), - ], + rules: vec![Rule::repeat(Rule::seq(vec![ + Rule::string("{"), + Rule::pattern("[a-f]+"), + Rule::string("}"), + ]))], separators: vec![], examples: vec![ ("{a}{", Some((0, "{a}"))), @@ -366,7 +347,6 @@ mod tests { ("ab", None), ], }, - // longest match rule Row { rules: vec![ @@ -384,8 +364,7 @@ mod tests { ("c.", None), ], }, - - // regexes with alternatives including the empty string + // regex with an alternative including the empty string Row { rules: vec![Rule::pattern("a(b|)+c")], separators: vec![], @@ -395,16 +374,10 @@ mod tests { ("abbc.", Some((0, "abbc"))), ], }, - // separators Row { - rules: vec![ - Rule::pattern("[a-f]+"), - ], - separators: vec![ - Rule::string("\\\n"), - Rule::pattern("\\s"), - ], + rules: vec![Rule::pattern("[a-f]+")], + separators: vec![Rule::string("\\\n"), Rule::pattern("\\s")], examples: vec![ (" a", Some((0, "a"))), (" \nb", Some((0, "b"))), @@ -414,14 +387,20 @@ mod tests { }, ]; - for Row { rules, separators, examples } in &table { + for Row { + rules, + separators, + examples, + } in &table + { let grammar = expand_tokens(ExtractedLexicalGrammar { separators: separators.clone(), variables: rules .into_iter() .map(|rule| Variable::named("", rule.clone())) .collect(), - }).unwrap(); + }) + .unwrap(); for (haystack, needle) in examples.iter() { assert_eq!(simulate_nfa(&grammar, haystack), *needle);