Format expand_tokens file
This commit is contained in:
parent
842421633c
commit
5fa586f7c9
1 changed file with 130 additions and 151 deletions
|
|
@ -5,37 +5,98 @@ use crate::nfa::{CharacterSet, Nfa, NfaState};
|
|||
use crate::rules::Rule;
|
||||
use regex_syntax::ast::{parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, RepetitionKind};
|
||||
|
||||
fn expand_perl_character_class(item: &ClassPerlKind) -> CharacterSet {
|
||||
match item {
|
||||
ClassPerlKind::Digit => CharacterSet::empty().add_range('0', '9'),
|
||||
ClassPerlKind::Space => CharacterSet::empty()
|
||||
.add_char(' ')
|
||||
.add_char('\t')
|
||||
.add_char('\r')
|
||||
.add_char('\n'),
|
||||
ClassPerlKind::Word => CharacterSet::empty()
|
||||
.add_char('_')
|
||||
.add_range('A', 'Z')
|
||||
.add_range('a', 'z')
|
||||
.add_range('0', '9'),
|
||||
pub(super) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result<LexicalGrammar> {
|
||||
let mut nfa = Nfa::new();
|
||||
|
||||
let separator_rule = if grammar.separators.len() > 0 {
|
||||
grammar.separators.push(Rule::Blank);
|
||||
Rule::repeat(Rule::choice(grammar.separators))
|
||||
} else {
|
||||
Rule::Blank
|
||||
};
|
||||
|
||||
let mut variables = Vec::new();
|
||||
for (i, variable) in grammar.variables.into_iter().enumerate() {
|
||||
let is_immediate_token = match &variable.rule {
|
||||
Rule::Metadata { params, .. } => params.is_main_token,
|
||||
_ => false,
|
||||
};
|
||||
|
||||
nfa.states.push(NfaState::Accept(i));
|
||||
let last_state_id = nfa.last_state_id();
|
||||
expand_rule(&variable.rule, &mut nfa, last_state_id, false)?;
|
||||
|
||||
if !is_immediate_token {
|
||||
let last_state_id = nfa.last_state_id();
|
||||
expand_rule(&separator_rule, &mut nfa, last_state_id, true)?;
|
||||
}
|
||||
|
||||
variables.push(LexicalVariable {
|
||||
name: variable.name,
|
||||
kind: variable.kind,
|
||||
start_state: nfa.last_state_id(),
|
||||
});
|
||||
}
|
||||
|
||||
Ok(LexicalGrammar { nfa, variables })
|
||||
}
|
||||
|
||||
fn expand_character_class(item: &ClassSetItem) -> Result<CharacterSet> {
|
||||
match item {
|
||||
ClassSetItem::Empty(_) => Ok(CharacterSet::Include(Vec::new())),
|
||||
ClassSetItem::Literal(literal) => Ok(CharacterSet::Include(vec![literal.c])),
|
||||
ClassSetItem::Range(range) => {
|
||||
Ok(CharacterSet::empty().add_range(range.start.c, range.end.c))
|
||||
fn expand_rule(rule: &Rule, nfa: &mut Nfa, mut next_state_id: u32, is_sep: bool) -> Result<bool> {
|
||||
match rule {
|
||||
Rule::Pattern(s) => {
|
||||
let ast = parse::Parser::new()
|
||||
.parse(&s)
|
||||
.map_err(|e| Error::GrammarError(e.to_string()))?;
|
||||
expand_regex(&ast, nfa, next_state_id, is_sep)
|
||||
}
|
||||
ClassSetItem::Union(union) => {
|
||||
let mut result = CharacterSet::empty();
|
||||
for item in &union.items {
|
||||
result = result.add(expand_character_class(&item)?);
|
||||
Rule::String(s) => {
|
||||
for c in s.chars().rev() {
|
||||
nfa.prepend(|last_state_id| NfaState::Advance {
|
||||
chars: CharacterSet::empty().add_char(c),
|
||||
state_id: last_state_id,
|
||||
is_sep,
|
||||
});
|
||||
}
|
||||
Ok(s.len() > 0)
|
||||
}
|
||||
Rule::Choice(elements) => {
|
||||
let mut alternative_state_ids = Vec::new();
|
||||
for element in elements {
|
||||
if expand_rule(element, nfa, next_state_id, is_sep)? {
|
||||
alternative_state_ids.push(nfa.last_state_id());
|
||||
} else {
|
||||
alternative_state_ids.push(next_state_id);
|
||||
}
|
||||
}
|
||||
alternative_state_ids.retain(|i| *i != nfa.last_state_id());
|
||||
for alternative_state_id in alternative_state_ids {
|
||||
nfa.prepend(|last_state_id| NfaState::Split(last_state_id, alternative_state_id));
|
||||
}
|
||||
Ok(true)
|
||||
}
|
||||
Rule::Seq(elements) => {
|
||||
let mut result = false;
|
||||
for element in elements.into_iter().rev() {
|
||||
if expand_rule(element, nfa, next_state_id, is_sep)? {
|
||||
result = true;
|
||||
}
|
||||
next_state_id = nfa.last_state_id();
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
_ => Err(Error::regex("Unsupported character class syntax")),
|
||||
Rule::Repeat(rule) => {
|
||||
nfa.states.push(NfaState::Accept(0)); // Placeholder for split
|
||||
let split_state_id = nfa.last_state_id();
|
||||
if expand_rule(rule, nfa, split_state_id, is_sep)? {
|
||||
nfa.states[split_state_id as usize] =
|
||||
NfaState::Split(nfa.last_state_id(), next_state_id);
|
||||
Ok(true)
|
||||
} else {
|
||||
Ok(false)
|
||||
}
|
||||
}
|
||||
Rule::Blank => Ok(false),
|
||||
_ => Err(Error::grammar(&format!("Unexpected rule {:?}", rule))),
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -149,107 +210,45 @@ fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_id: u32, is_sep: bool)
|
|||
}
|
||||
}
|
||||
|
||||
fn expand_rule(rule: &Rule, nfa: &mut Nfa, mut next_state_id: u32, is_sep: bool) -> Result<bool> {
|
||||
match rule {
|
||||
Rule::Pattern(s) => {
|
||||
let ast = parse::Parser::new()
|
||||
.parse(&s)
|
||||
.map_err(|e| Error::GrammarError(e.to_string()))?;
|
||||
expand_regex(&ast, nfa, next_state_id, is_sep)
|
||||
fn expand_character_class(item: &ClassSetItem) -> Result<CharacterSet> {
|
||||
match item {
|
||||
ClassSetItem::Empty(_) => Ok(CharacterSet::Include(Vec::new())),
|
||||
ClassSetItem::Literal(literal) => Ok(CharacterSet::Include(vec![literal.c])),
|
||||
ClassSetItem::Range(range) => {
|
||||
Ok(CharacterSet::empty().add_range(range.start.c, range.end.c))
|
||||
}
|
||||
Rule::String(s) => {
|
||||
for c in s.chars().rev() {
|
||||
nfa.prepend(|last_state_id| {
|
||||
NfaState::Advance {
|
||||
chars: CharacterSet::empty().add_char(c),
|
||||
state_id: last_state_id,
|
||||
is_sep,
|
||||
}
|
||||
});
|
||||
}
|
||||
Ok(s.len() > 0)
|
||||
}
|
||||
Rule::Choice(elements) => {
|
||||
let mut alternative_state_ids = Vec::new();
|
||||
for element in elements {
|
||||
if expand_rule(element, nfa, next_state_id, is_sep)? {
|
||||
alternative_state_ids.push(nfa.last_state_id());
|
||||
} else {
|
||||
alternative_state_ids.push(next_state_id);
|
||||
}
|
||||
}
|
||||
alternative_state_ids.retain(|i| *i != nfa.last_state_id());
|
||||
for alternative_state_id in alternative_state_ids {
|
||||
nfa.prepend(|last_state_id| NfaState::Split(last_state_id, alternative_state_id));
|
||||
}
|
||||
Ok(true)
|
||||
}
|
||||
Rule::Seq(elements) => {
|
||||
let mut result = false;
|
||||
for element in elements.into_iter().rev() {
|
||||
if expand_rule(element, nfa, next_state_id, is_sep)? {
|
||||
result = true;
|
||||
}
|
||||
next_state_id = nfa.last_state_id();
|
||||
ClassSetItem::Union(union) => {
|
||||
let mut result = CharacterSet::empty();
|
||||
for item in &union.items {
|
||||
result = result.add(expand_character_class(&item)?);
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
Rule::Repeat(rule) => {
|
||||
nfa.states.push(NfaState::Accept(0)); // Placeholder for split
|
||||
let split_state_id = nfa.last_state_id();
|
||||
if expand_rule(rule, nfa, split_state_id, is_sep)? {
|
||||
nfa.states[split_state_id as usize] = NfaState::Split(nfa.last_state_id(), next_state_id);
|
||||
Ok(true)
|
||||
} else {
|
||||
Ok(false)
|
||||
}
|
||||
}
|
||||
Rule::Blank => Ok(false),
|
||||
_ => Err(Error::grammar(&format!("Unexpected rule {:?}", rule))),
|
||||
_ => Err(Error::regex("Unsupported character class syntax")),
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result<LexicalGrammar> {
|
||||
let mut nfa = Nfa::new();
|
||||
|
||||
let separator_rule = if grammar.separators.len() > 0 {
|
||||
grammar.separators.push(Rule::Blank);
|
||||
Rule::repeat(Rule::choice(grammar.separators))
|
||||
} else {
|
||||
Rule::Blank
|
||||
};
|
||||
|
||||
let mut variables = Vec::new();
|
||||
for (i, variable) in grammar.variables.into_iter().enumerate() {
|
||||
let is_immediate_token = match &variable.rule {
|
||||
Rule::Metadata { params, .. } => params.is_main_token,
|
||||
_ => false,
|
||||
};
|
||||
|
||||
nfa.states.push(NfaState::Accept(i));
|
||||
let last_state_id = nfa.last_state_id();
|
||||
expand_rule(&variable.rule, &mut nfa, last_state_id, false)?;
|
||||
|
||||
if !is_immediate_token {
|
||||
let last_state_id = nfa.last_state_id();
|
||||
expand_rule(&separator_rule, &mut nfa, last_state_id, true)?;
|
||||
}
|
||||
|
||||
variables.push(LexicalVariable {
|
||||
name: variable.name,
|
||||
kind: variable.kind,
|
||||
start_state: nfa.last_state_id(),
|
||||
});
|
||||
fn expand_perl_character_class(item: &ClassPerlKind) -> CharacterSet {
|
||||
match item {
|
||||
ClassPerlKind::Digit => CharacterSet::empty().add_range('0', '9'),
|
||||
ClassPerlKind::Space => CharacterSet::empty()
|
||||
.add_char(' ')
|
||||
.add_char('\t')
|
||||
.add_char('\r')
|
||||
.add_char('\n'),
|
||||
ClassPerlKind::Word => CharacterSet::empty()
|
||||
.add_char('_')
|
||||
.add_range('A', 'Z')
|
||||
.add_range('a', 'z')
|
||||
.add_range('0', '9'),
|
||||
}
|
||||
|
||||
Ok(LexicalGrammar { nfa, variables })
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::nfa::NfaCursor;
|
||||
use crate::grammars::Variable;
|
||||
use crate::nfa::NfaCursor;
|
||||
|
||||
fn simulate_nfa<'a>(grammar: &'a LexicalGrammar, s: &'a str) -> Option<(usize, &'a str)> {
|
||||
let start_states = grammar.variables.iter().map(|v| v.start_state).collect();
|
||||
|
|
@ -299,17 +298,12 @@ mod tests {
|
|||
("ad1", None),
|
||||
],
|
||||
},
|
||||
|
||||
// regex with repeats
|
||||
Row {
|
||||
rules: vec![Rule::pattern("a*")],
|
||||
separators: vec![],
|
||||
examples: vec![
|
||||
("aaa1", Some((0, "aaa"))),
|
||||
("b", Some((0, ""))),
|
||||
],
|
||||
examples: vec![("aaa1", Some((0, "aaa"))), ("b", Some((0, "")))],
|
||||
},
|
||||
|
||||
// regex with repeats in sequences
|
||||
Row {
|
||||
rules: vec![Rule::pattern("a((bc)+|(de)*)f")],
|
||||
|
|
@ -321,44 +315,31 @@ mod tests {
|
|||
("a", None),
|
||||
],
|
||||
},
|
||||
|
||||
// regex with character ranges
|
||||
Row {
|
||||
rules: vec![Rule::pattern("[a-fA-F0-9]+")],
|
||||
separators: vec![],
|
||||
examples: vec![
|
||||
("A1ff0.", Some((0, "A1ff0"))),
|
||||
],
|
||||
examples: vec![("A1ff0.", Some((0, "A1ff0")))],
|
||||
},
|
||||
|
||||
// regex with perl character classes
|
||||
Row {
|
||||
rules: vec![Rule::pattern("\\w\\d\\s")],
|
||||
separators: vec![],
|
||||
examples: vec![
|
||||
("_0 ", Some((0, "_0 "))),
|
||||
],
|
||||
examples: vec![("_0 ", Some((0, "_0 ")))],
|
||||
},
|
||||
|
||||
// string
|
||||
Row {
|
||||
rules: vec![Rule::string("abc")],
|
||||
separators: vec![],
|
||||
examples: vec![
|
||||
("abcd", Some((0, "abc"))),
|
||||
("ab", None)
|
||||
],
|
||||
examples: vec![("abcd", Some((0, "abc"))), ("ab", None)],
|
||||
},
|
||||
|
||||
// complex rule containing strings and regexes
|
||||
Row {
|
||||
rules: vec![
|
||||
Rule::repeat(Rule::seq(vec![
|
||||
Rule::string("{"),
|
||||
Rule::pattern("[a-f]+"),
|
||||
Rule::string("}"),
|
||||
])),
|
||||
],
|
||||
rules: vec![Rule::repeat(Rule::seq(vec![
|
||||
Rule::string("{"),
|
||||
Rule::pattern("[a-f]+"),
|
||||
Rule::string("}"),
|
||||
]))],
|
||||
separators: vec![],
|
||||
examples: vec![
|
||||
("{a}{", Some((0, "{a}"))),
|
||||
|
|
@ -366,7 +347,6 @@ mod tests {
|
|||
("ab", None),
|
||||
],
|
||||
},
|
||||
|
||||
// longest match rule
|
||||
Row {
|
||||
rules: vec![
|
||||
|
|
@ -384,8 +364,7 @@ mod tests {
|
|||
("c.", None),
|
||||
],
|
||||
},
|
||||
|
||||
// regexes with alternatives including the empty string
|
||||
// regex with an alternative including the empty string
|
||||
Row {
|
||||
rules: vec![Rule::pattern("a(b|)+c")],
|
||||
separators: vec![],
|
||||
|
|
@ -395,16 +374,10 @@ mod tests {
|
|||
("abbc.", Some((0, "abbc"))),
|
||||
],
|
||||
},
|
||||
|
||||
// separators
|
||||
Row {
|
||||
rules: vec![
|
||||
Rule::pattern("[a-f]+"),
|
||||
],
|
||||
separators: vec![
|
||||
Rule::string("\\\n"),
|
||||
Rule::pattern("\\s"),
|
||||
],
|
||||
rules: vec![Rule::pattern("[a-f]+")],
|
||||
separators: vec![Rule::string("\\\n"), Rule::pattern("\\s")],
|
||||
examples: vec![
|
||||
(" a", Some((0, "a"))),
|
||||
(" \nb", Some((0, "b"))),
|
||||
|
|
@ -414,14 +387,20 @@ mod tests {
|
|||
},
|
||||
];
|
||||
|
||||
for Row { rules, separators, examples } in &table {
|
||||
for Row {
|
||||
rules,
|
||||
separators,
|
||||
examples,
|
||||
} in &table
|
||||
{
|
||||
let grammar = expand_tokens(ExtractedLexicalGrammar {
|
||||
separators: separators.clone(),
|
||||
variables: rules
|
||||
.into_iter()
|
||||
.map(|rule| Variable::named("", rule.clone()))
|
||||
.collect(),
|
||||
}).unwrap();
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
for (haystack, needle) in examples.iter() {
|
||||
assert_eq!(simulate_nfa(&grammar, haystack), *needle);
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue