Format expand_tokens file

This commit is contained in:
Max Brunsfeld 2018-12-12 21:01:41 -08:00
parent 842421633c
commit 5fa586f7c9

View file

@ -5,37 +5,98 @@ use crate::nfa::{CharacterSet, Nfa, NfaState};
use crate::rules::Rule;
use regex_syntax::ast::{parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, RepetitionKind};
fn expand_perl_character_class(item: &ClassPerlKind) -> CharacterSet {
match item {
ClassPerlKind::Digit => CharacterSet::empty().add_range('0', '9'),
ClassPerlKind::Space => CharacterSet::empty()
.add_char(' ')
.add_char('\t')
.add_char('\r')
.add_char('\n'),
ClassPerlKind::Word => CharacterSet::empty()
.add_char('_')
.add_range('A', 'Z')
.add_range('a', 'z')
.add_range('0', '9'),
pub(super) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result<LexicalGrammar> {
let mut nfa = Nfa::new();
let separator_rule = if grammar.separators.len() > 0 {
grammar.separators.push(Rule::Blank);
Rule::repeat(Rule::choice(grammar.separators))
} else {
Rule::Blank
};
let mut variables = Vec::new();
for (i, variable) in grammar.variables.into_iter().enumerate() {
let is_immediate_token = match &variable.rule {
Rule::Metadata { params, .. } => params.is_main_token,
_ => false,
};
nfa.states.push(NfaState::Accept(i));
let last_state_id = nfa.last_state_id();
expand_rule(&variable.rule, &mut nfa, last_state_id, false)?;
if !is_immediate_token {
let last_state_id = nfa.last_state_id();
expand_rule(&separator_rule, &mut nfa, last_state_id, true)?;
}
variables.push(LexicalVariable {
name: variable.name,
kind: variable.kind,
start_state: nfa.last_state_id(),
});
}
Ok(LexicalGrammar { nfa, variables })
}
fn expand_character_class(item: &ClassSetItem) -> Result<CharacterSet> {
match item {
ClassSetItem::Empty(_) => Ok(CharacterSet::Include(Vec::new())),
ClassSetItem::Literal(literal) => Ok(CharacterSet::Include(vec![literal.c])),
ClassSetItem::Range(range) => {
Ok(CharacterSet::empty().add_range(range.start.c, range.end.c))
fn expand_rule(rule: &Rule, nfa: &mut Nfa, mut next_state_id: u32, is_sep: bool) -> Result<bool> {
match rule {
Rule::Pattern(s) => {
let ast = parse::Parser::new()
.parse(&s)
.map_err(|e| Error::GrammarError(e.to_string()))?;
expand_regex(&ast, nfa, next_state_id, is_sep)
}
ClassSetItem::Union(union) => {
let mut result = CharacterSet::empty();
for item in &union.items {
result = result.add(expand_character_class(&item)?);
Rule::String(s) => {
for c in s.chars().rev() {
nfa.prepend(|last_state_id| NfaState::Advance {
chars: CharacterSet::empty().add_char(c),
state_id: last_state_id,
is_sep,
});
}
Ok(s.len() > 0)
}
Rule::Choice(elements) => {
let mut alternative_state_ids = Vec::new();
for element in elements {
if expand_rule(element, nfa, next_state_id, is_sep)? {
alternative_state_ids.push(nfa.last_state_id());
} else {
alternative_state_ids.push(next_state_id);
}
}
alternative_state_ids.retain(|i| *i != nfa.last_state_id());
for alternative_state_id in alternative_state_ids {
nfa.prepend(|last_state_id| NfaState::Split(last_state_id, alternative_state_id));
}
Ok(true)
}
Rule::Seq(elements) => {
let mut result = false;
for element in elements.into_iter().rev() {
if expand_rule(element, nfa, next_state_id, is_sep)? {
result = true;
}
next_state_id = nfa.last_state_id();
}
Ok(result)
}
_ => Err(Error::regex("Unsupported character class syntax")),
Rule::Repeat(rule) => {
nfa.states.push(NfaState::Accept(0)); // Placeholder for split
let split_state_id = nfa.last_state_id();
if expand_rule(rule, nfa, split_state_id, is_sep)? {
nfa.states[split_state_id as usize] =
NfaState::Split(nfa.last_state_id(), next_state_id);
Ok(true)
} else {
Ok(false)
}
}
Rule::Blank => Ok(false),
_ => Err(Error::grammar(&format!("Unexpected rule {:?}", rule))),
}
}
@ -149,107 +210,45 @@ fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_id: u32, is_sep: bool)
}
}
fn expand_rule(rule: &Rule, nfa: &mut Nfa, mut next_state_id: u32, is_sep: bool) -> Result<bool> {
match rule {
Rule::Pattern(s) => {
let ast = parse::Parser::new()
.parse(&s)
.map_err(|e| Error::GrammarError(e.to_string()))?;
expand_regex(&ast, nfa, next_state_id, is_sep)
fn expand_character_class(item: &ClassSetItem) -> Result<CharacterSet> {
match item {
ClassSetItem::Empty(_) => Ok(CharacterSet::Include(Vec::new())),
ClassSetItem::Literal(literal) => Ok(CharacterSet::Include(vec![literal.c])),
ClassSetItem::Range(range) => {
Ok(CharacterSet::empty().add_range(range.start.c, range.end.c))
}
Rule::String(s) => {
for c in s.chars().rev() {
nfa.prepend(|last_state_id| {
NfaState::Advance {
chars: CharacterSet::empty().add_char(c),
state_id: last_state_id,
is_sep,
}
});
}
Ok(s.len() > 0)
}
Rule::Choice(elements) => {
let mut alternative_state_ids = Vec::new();
for element in elements {
if expand_rule(element, nfa, next_state_id, is_sep)? {
alternative_state_ids.push(nfa.last_state_id());
} else {
alternative_state_ids.push(next_state_id);
}
}
alternative_state_ids.retain(|i| *i != nfa.last_state_id());
for alternative_state_id in alternative_state_ids {
nfa.prepend(|last_state_id| NfaState::Split(last_state_id, alternative_state_id));
}
Ok(true)
}
Rule::Seq(elements) => {
let mut result = false;
for element in elements.into_iter().rev() {
if expand_rule(element, nfa, next_state_id, is_sep)? {
result = true;
}
next_state_id = nfa.last_state_id();
ClassSetItem::Union(union) => {
let mut result = CharacterSet::empty();
for item in &union.items {
result = result.add(expand_character_class(&item)?);
}
Ok(result)
}
Rule::Repeat(rule) => {
nfa.states.push(NfaState::Accept(0)); // Placeholder for split
let split_state_id = nfa.last_state_id();
if expand_rule(rule, nfa, split_state_id, is_sep)? {
nfa.states[split_state_id as usize] = NfaState::Split(nfa.last_state_id(), next_state_id);
Ok(true)
} else {
Ok(false)
}
}
Rule::Blank => Ok(false),
_ => Err(Error::grammar(&format!("Unexpected rule {:?}", rule))),
_ => Err(Error::regex("Unsupported character class syntax")),
}
}
pub(super) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result<LexicalGrammar> {
let mut nfa = Nfa::new();
let separator_rule = if grammar.separators.len() > 0 {
grammar.separators.push(Rule::Blank);
Rule::repeat(Rule::choice(grammar.separators))
} else {
Rule::Blank
};
let mut variables = Vec::new();
for (i, variable) in grammar.variables.into_iter().enumerate() {
let is_immediate_token = match &variable.rule {
Rule::Metadata { params, .. } => params.is_main_token,
_ => false,
};
nfa.states.push(NfaState::Accept(i));
let last_state_id = nfa.last_state_id();
expand_rule(&variable.rule, &mut nfa, last_state_id, false)?;
if !is_immediate_token {
let last_state_id = nfa.last_state_id();
expand_rule(&separator_rule, &mut nfa, last_state_id, true)?;
}
variables.push(LexicalVariable {
name: variable.name,
kind: variable.kind,
start_state: nfa.last_state_id(),
});
fn expand_perl_character_class(item: &ClassPerlKind) -> CharacterSet {
match item {
ClassPerlKind::Digit => CharacterSet::empty().add_range('0', '9'),
ClassPerlKind::Space => CharacterSet::empty()
.add_char(' ')
.add_char('\t')
.add_char('\r')
.add_char('\n'),
ClassPerlKind::Word => CharacterSet::empty()
.add_char('_')
.add_range('A', 'Z')
.add_range('a', 'z')
.add_range('0', '9'),
}
Ok(LexicalGrammar { nfa, variables })
}
#[cfg(test)]
mod tests {
use super::*;
use crate::nfa::NfaCursor;
use crate::grammars::Variable;
use crate::nfa::NfaCursor;
fn simulate_nfa<'a>(grammar: &'a LexicalGrammar, s: &'a str) -> Option<(usize, &'a str)> {
let start_states = grammar.variables.iter().map(|v| v.start_state).collect();
@ -299,17 +298,12 @@ mod tests {
("ad1", None),
],
},
// regex with repeats
Row {
rules: vec![Rule::pattern("a*")],
separators: vec![],
examples: vec![
("aaa1", Some((0, "aaa"))),
("b", Some((0, ""))),
],
examples: vec![("aaa1", Some((0, "aaa"))), ("b", Some((0, "")))],
},
// regex with repeats in sequences
Row {
rules: vec![Rule::pattern("a((bc)+|(de)*)f")],
@ -321,44 +315,31 @@ mod tests {
("a", None),
],
},
// regex with character ranges
Row {
rules: vec![Rule::pattern("[a-fA-F0-9]+")],
separators: vec![],
examples: vec![
("A1ff0.", Some((0, "A1ff0"))),
],
examples: vec![("A1ff0.", Some((0, "A1ff0")))],
},
// regex with perl character classes
Row {
rules: vec![Rule::pattern("\\w\\d\\s")],
separators: vec![],
examples: vec![
("_0 ", Some((0, "_0 "))),
],
examples: vec![("_0 ", Some((0, "_0 ")))],
},
// string
Row {
rules: vec![Rule::string("abc")],
separators: vec![],
examples: vec![
("abcd", Some((0, "abc"))),
("ab", None)
],
examples: vec![("abcd", Some((0, "abc"))), ("ab", None)],
},
// complex rule containing strings and regexes
Row {
rules: vec![
Rule::repeat(Rule::seq(vec![
Rule::string("{"),
Rule::pattern("[a-f]+"),
Rule::string("}"),
])),
],
rules: vec![Rule::repeat(Rule::seq(vec![
Rule::string("{"),
Rule::pattern("[a-f]+"),
Rule::string("}"),
]))],
separators: vec![],
examples: vec![
("{a}{", Some((0, "{a}"))),
@ -366,7 +347,6 @@ mod tests {
("ab", None),
],
},
// longest match rule
Row {
rules: vec![
@ -384,8 +364,7 @@ mod tests {
("c.", None),
],
},
// regexes with alternatives including the empty string
// regex with an alternative including the empty string
Row {
rules: vec![Rule::pattern("a(b|)+c")],
separators: vec![],
@ -395,16 +374,10 @@ mod tests {
("abbc.", Some((0, "abbc"))),
],
},
// separators
Row {
rules: vec![
Rule::pattern("[a-f]+"),
],
separators: vec![
Rule::string("\\\n"),
Rule::pattern("\\s"),
],
rules: vec![Rule::pattern("[a-f]+")],
separators: vec![Rule::string("\\\n"), Rule::pattern("\\s")],
examples: vec![
(" a", Some((0, "a"))),
(" \nb", Some((0, "b"))),
@ -414,14 +387,20 @@ mod tests {
},
];
for Row { rules, separators, examples } in &table {
for Row {
rules,
separators,
examples,
} in &table
{
let grammar = expand_tokens(ExtractedLexicalGrammar {
separators: separators.clone(),
variables: rules
.into_iter()
.map(|rule| Variable::named("", rule.clone()))
.collect(),
}).unwrap();
})
.unwrap();
for (haystack, needle) in examples.iter() {
assert_eq!(simulate_nfa(&grammar, haystack), *needle);