Integrate separator rules into lexer nfa
This commit is contained in:
parent
40d24097ec
commit
0103a83f3f
5 changed files with 199 additions and 123 deletions
|
|
@ -35,13 +35,13 @@ pub(crate) struct InputGrammar {
|
|||
pub(crate) struct LexicalVariable {
|
||||
pub name: String,
|
||||
pub kind: VariableType,
|
||||
pub nfa: Nfa,
|
||||
pub start_state: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
pub(crate) struct LexicalGrammar {
|
||||
pub nfa: Nfa,
|
||||
pub variables: Vec<LexicalVariable>,
|
||||
pub separators: Vec<Nfa>,
|
||||
}
|
||||
|
||||
// Extracted syntax grammar
|
||||
|
|
|
|||
44
src/nfa.rs
44
src/nfa.rs
|
|
@ -9,9 +9,13 @@ pub enum CharacterSet {
|
|||
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
pub enum NfaState {
|
||||
Advance(CharacterSet, u32),
|
||||
Advance {
|
||||
chars: CharacterSet,
|
||||
state: u32,
|
||||
is_sep: bool,
|
||||
},
|
||||
Split(u32, u32),
|
||||
Accept,
|
||||
Accept(usize),
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Eq)]
|
||||
|
|
@ -23,6 +27,7 @@ pub struct Nfa {
|
|||
pub struct NfaCursor<'a> {
|
||||
indices: Vec<u32>,
|
||||
nfa: &'a Nfa,
|
||||
in_sep: bool,
|
||||
}
|
||||
|
||||
impl CharacterSet {
|
||||
|
|
@ -88,15 +93,15 @@ impl CharacterSet {
|
|||
|
||||
impl Nfa {
|
||||
pub fn new() -> Self {
|
||||
Nfa { states: vec![NfaState::Accept] }
|
||||
Nfa { states: Vec::new() }
|
||||
}
|
||||
|
||||
pub fn start_index(&self) -> u32 {
|
||||
pub fn last_state(&self) -> u32 {
|
||||
self.states.len() as u32 - 1
|
||||
}
|
||||
|
||||
pub fn prepend(&mut self, f: impl Fn(u32) -> NfaState) {
|
||||
self.states.push(f(self.start_index()));
|
||||
self.states.push(f(self.last_state()));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -116,38 +121,45 @@ impl fmt::Debug for Nfa {
|
|||
|
||||
impl<'a> NfaCursor<'a> {
|
||||
pub fn new(nfa: &'a Nfa) -> Self {
|
||||
let mut result = Self { nfa, indices: Vec::new() };
|
||||
result.add_indices(&mut vec![nfa.start_index()]);
|
||||
let mut result = Self { nfa, indices: Vec::new(), in_sep: true };
|
||||
result.add_states(&mut vec![nfa.last_state()]);
|
||||
result
|
||||
}
|
||||
|
||||
pub fn advance(&mut self, c: char) -> bool {
|
||||
let mut result = false;
|
||||
let mut new_indices = Vec::new();
|
||||
let mut any_sep_transitions = false;
|
||||
for index in &self.indices {
|
||||
if let NfaState::Advance(chars, next_index) = &self.nfa.states[*index as usize] {
|
||||
if let NfaState::Advance { chars, state, is_sep } = &self.nfa.states[*index as usize] {
|
||||
if *is_sep {
|
||||
any_sep_transitions = true;
|
||||
}
|
||||
if chars.contains(c) {
|
||||
new_indices.push(*next_index);
|
||||
new_indices.push(*state);
|
||||
result = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
if !any_sep_transitions {
|
||||
self.in_sep = false;
|
||||
}
|
||||
self.indices.clear();
|
||||
self.add_indices(&mut new_indices);
|
||||
self.add_states(&mut new_indices);
|
||||
result
|
||||
}
|
||||
|
||||
pub fn is_done(&self) -> bool {
|
||||
self.indices.iter().any(|index| {
|
||||
if let NfaState::Accept = self.nfa.states[*index as usize] {
|
||||
true
|
||||
pub fn finished_ids<'b>(&'b self) -> impl Iterator<Item = usize> + 'b {
|
||||
self.indices.iter().filter_map(move |index| {
|
||||
if let NfaState::Accept(i) = self.nfa.states[*index as usize] {
|
||||
Some(i)
|
||||
} else {
|
||||
false
|
||||
None
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
pub fn add_indices(&mut self, new_indices: &mut Vec<u32>) {
|
||||
pub fn add_states(&mut self, new_indices: &mut Vec<u32>) {
|
||||
while let Some(index) = new_indices.pop() {
|
||||
let state = &self.nfa.states[index as usize];
|
||||
if let NfaState::Split(left, right) = state {
|
||||
|
|
|
|||
|
|
@ -39,40 +39,46 @@ fn expand_character_class(item: &ClassSetItem) -> Result<CharacterSet> {
|
|||
}
|
||||
}
|
||||
|
||||
fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32) -> Result<()> {
|
||||
fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32, is_sep: bool) -> Result<bool> {
|
||||
match ast {
|
||||
Ast::Empty(_) => Ok(()),
|
||||
Ast::Empty(_) => Ok(false),
|
||||
Ast::Flags(_) => Err(Error::regex("Flags are not supported")),
|
||||
Ast::Literal(literal) => {
|
||||
nfa.states.push(NfaState::Advance(
|
||||
CharacterSet::Include(vec![literal.c]),
|
||||
next_state_index,
|
||||
));
|
||||
Ok(())
|
||||
nfa.states.push(NfaState::Advance {
|
||||
chars: CharacterSet::Include(vec![literal.c]),
|
||||
state: next_state_index,
|
||||
is_sep,
|
||||
});
|
||||
Ok(true)
|
||||
}
|
||||
Ast::Dot(_) => {
|
||||
nfa.states.push(NfaState::Advance(
|
||||
CharacterSet::Exclude(vec!['\n']),
|
||||
next_state_index,
|
||||
));
|
||||
Ok(())
|
||||
nfa.states.push(NfaState::Advance {
|
||||
chars: CharacterSet::Exclude(vec!['\n']),
|
||||
state: next_state_index,
|
||||
is_sep,
|
||||
});
|
||||
Ok(true)
|
||||
}
|
||||
Ast::Assertion(_) => Err(Error::regex("Assertions are not supported")),
|
||||
Ast::Class(class) => match class {
|
||||
Class::Unicode(_) => Err(Error::regex("Unicode character classes are not supported")),
|
||||
Class::Perl(class) => {
|
||||
nfa.states.push(NfaState::Advance(
|
||||
expand_perl_character_class(&class.kind),
|
||||
next_state_index,
|
||||
));
|
||||
Ok(())
|
||||
nfa.states.push(NfaState::Advance {
|
||||
chars: expand_perl_character_class(&class.kind),
|
||||
state: next_state_index,
|
||||
is_sep,
|
||||
});
|
||||
Ok(true)
|
||||
}
|
||||
Class::Bracketed(class) => match &class.kind {
|
||||
ClassSet::Item(item) => {
|
||||
let character_set = expand_character_class(&item)?;
|
||||
nfa.states
|
||||
.push(NfaState::Advance(character_set, next_state_index));
|
||||
Ok(())
|
||||
nfa.states.push(NfaState::Advance {
|
||||
chars: character_set,
|
||||
state: next_state_index,
|
||||
is_sep,
|
||||
});
|
||||
Ok(true)
|
||||
}
|
||||
ClassSet::BinaryOp(_) => Err(Error::regex(
|
||||
"Binary operators in character classes aren't supported",
|
||||
|
|
@ -81,134 +87,171 @@ fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_index: u32) -> Result<(
|
|||
},
|
||||
Ast::Repetition(repetition) => match repetition.op.kind {
|
||||
RepetitionKind::ZeroOrOne => {
|
||||
expand_regex(&repetition.ast, nfa, next_state_index)?;
|
||||
nfa.prepend(|start_index| NfaState::Split(next_state_index, start_index));
|
||||
Ok(())
|
||||
if expand_regex(&repetition.ast, nfa, next_state_index, is_sep)? {
|
||||
nfa.prepend(|last_state| NfaState::Split(next_state_index, last_state));
|
||||
Ok(true)
|
||||
} else {
|
||||
Ok(false)
|
||||
}
|
||||
}
|
||||
RepetitionKind::OneOrMore => {
|
||||
nfa.states.push(NfaState::Accept); // Placeholder for split
|
||||
let split_index = nfa.start_index();
|
||||
expand_regex(&repetition.ast, nfa, split_index)?;
|
||||
nfa.states[split_index as usize] =
|
||||
NfaState::Split(nfa.start_index(), next_state_index);
|
||||
Ok(())
|
||||
nfa.states.push(NfaState::Accept(0)); // Placeholder for split
|
||||
let split_index = nfa.last_state();
|
||||
if expand_regex(&repetition.ast, nfa, split_index, is_sep)? {
|
||||
nfa.states[split_index as usize] =
|
||||
NfaState::Split(nfa.last_state(), next_state_index);
|
||||
Ok(true)
|
||||
} else {
|
||||
nfa.states.pop();
|
||||
Ok(false)
|
||||
}
|
||||
}
|
||||
RepetitionKind::ZeroOrMore => {
|
||||
nfa.states.push(NfaState::Accept); // Placeholder for split
|
||||
let split_index = nfa.start_index();
|
||||
expand_regex(&repetition.ast, nfa, split_index)?;
|
||||
nfa.states[split_index as usize] =
|
||||
NfaState::Split(nfa.start_index(), next_state_index);
|
||||
nfa.prepend(|start_index| NfaState::Split(start_index, next_state_index));
|
||||
Ok(())
|
||||
nfa.states.push(NfaState::Accept(0)); // Placeholder for split
|
||||
let split_index = nfa.last_state();
|
||||
if expand_regex(&repetition.ast, nfa, split_index, is_sep)? {
|
||||
nfa.states[split_index as usize] =
|
||||
NfaState::Split(nfa.last_state(), next_state_index);
|
||||
nfa.prepend(|last_state| NfaState::Split(last_state, next_state_index));
|
||||
Ok(true)
|
||||
} else {
|
||||
Ok(false)
|
||||
}
|
||||
}
|
||||
RepetitionKind::Range(_) => unimplemented!(),
|
||||
},
|
||||
Ast::Group(group) => expand_regex(&group.ast, nfa, nfa.start_index()),
|
||||
Ast::Group(group) => expand_regex(&group.ast, nfa, nfa.last_state(), is_sep),
|
||||
Ast::Alternation(alternation) => {
|
||||
let mut alternative_start_indices = Vec::new();
|
||||
for ast in alternation.asts.iter() {
|
||||
expand_regex(&ast, nfa, next_state_index)?;
|
||||
alternative_start_indices.push(nfa.start_index());
|
||||
if expand_regex(&ast, nfa, next_state_index, is_sep)? {
|
||||
alternative_start_indices.push(nfa.last_state());
|
||||
}
|
||||
}
|
||||
alternative_start_indices.pop();
|
||||
for alternative_start_index in alternative_start_indices {
|
||||
nfa.prepend(|start_index| NfaState::Split(start_index, alternative_start_index));
|
||||
nfa.prepend(|last_state| NfaState::Split(last_state, alternative_start_index));
|
||||
}
|
||||
Ok(())
|
||||
Ok(true)
|
||||
}
|
||||
Ast::Concat(concat) => {
|
||||
let mut result = false;
|
||||
for ast in concat.asts.iter().rev() {
|
||||
expand_regex(&ast, nfa, next_state_index)?;
|
||||
next_state_index = nfa.start_index();
|
||||
if expand_regex(&ast, nfa, next_state_index, is_sep)? {
|
||||
result = true;
|
||||
}
|
||||
next_state_index = nfa.last_state();
|
||||
}
|
||||
Ok(())
|
||||
Ok(result)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn expand_rule(rule: Rule, nfa: &mut Nfa, mut next_state_index: u32) -> Result<()> {
|
||||
fn expand_rule(rule: &Rule, nfa: &mut Nfa, mut next_state_index: u32, is_sep: bool) -> Result<bool> {
|
||||
match rule {
|
||||
Rule::Pattern(s) => {
|
||||
let ast = parse::Parser::new()
|
||||
.parse(&s)
|
||||
.map_err(|e| Error::GrammarError(e.to_string()))?;
|
||||
expand_regex(&ast, nfa, next_state_index)?;
|
||||
Ok(())
|
||||
expand_regex(&ast, nfa, next_state_index, is_sep)
|
||||
}
|
||||
Rule::String(s) => {
|
||||
for c in s.chars().rev() {
|
||||
nfa.prepend(|start_index| {
|
||||
NfaState::Advance(CharacterSet::empty().add_char(c), start_index)
|
||||
nfa.prepend(|last_state| {
|
||||
NfaState::Advance {
|
||||
chars: CharacterSet::empty().add_char(c),
|
||||
state: last_state,
|
||||
is_sep,
|
||||
}
|
||||
});
|
||||
}
|
||||
Ok(())
|
||||
Ok(s.len() > 0)
|
||||
}
|
||||
Rule::Choice(elements) => {
|
||||
let mut alternative_start_indices = Vec::new();
|
||||
for element in elements {
|
||||
expand_rule(element, nfa, next_state_index)?;
|
||||
alternative_start_indices.push(nfa.start_index());
|
||||
if expand_rule(element, nfa, next_state_index, is_sep)? {
|
||||
alternative_start_indices.push(nfa.last_state());
|
||||
}
|
||||
}
|
||||
alternative_start_indices.pop();
|
||||
for alternative_start_index in alternative_start_indices {
|
||||
nfa.prepend(|start_index| NfaState::Split(start_index, alternative_start_index));
|
||||
nfa.prepend(|last_state| NfaState::Split(last_state, alternative_start_index));
|
||||
}
|
||||
Ok(())
|
||||
Ok(true)
|
||||
}
|
||||
Rule::Seq(elements) => {
|
||||
let mut result = false;
|
||||
for element in elements.into_iter().rev() {
|
||||
expand_rule(element, nfa, next_state_index)?;
|
||||
next_state_index = nfa.start_index();
|
||||
if expand_rule(element, nfa, next_state_index, is_sep)? {
|
||||
result = true;
|
||||
}
|
||||
next_state_index = nfa.last_state();
|
||||
}
|
||||
Ok(())
|
||||
Ok(result)
|
||||
}
|
||||
Rule::Repeat(rule) => {
|
||||
nfa.states.push(NfaState::Accept); // Placeholder for split
|
||||
let split_index = nfa.start_index();
|
||||
expand_rule(*rule, nfa, split_index)?;
|
||||
nfa.states[split_index as usize] = NfaState::Split(nfa.start_index(), next_state_index);
|
||||
Ok(())
|
||||
nfa.states.push(NfaState::Accept(0)); // Placeholder for split
|
||||
let split_index = nfa.last_state();
|
||||
if expand_rule(rule, nfa, split_index, is_sep)? {
|
||||
nfa.states[split_index as usize] = NfaState::Split(nfa.last_state(), next_state_index);
|
||||
Ok(true)
|
||||
} else {
|
||||
Ok(false)
|
||||
}
|
||||
}
|
||||
_ => Err(Error::grammar("Unexpected rule type")),
|
||||
Rule::Blank => Ok(false),
|
||||
_ => Err(Error::grammar(&format!("Unexpected rule {:?}", rule))),
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) fn expand_tokens(grammar: ExtractedLexicalGrammar) -> Result<LexicalGrammar> {
|
||||
let mut nfa = Nfa::new();
|
||||
|
||||
let separator_rule = if grammar.separators.len() > 0 {
|
||||
Rule::repeat(Rule::choice(grammar.separators))
|
||||
} else {
|
||||
Rule::Blank
|
||||
};
|
||||
|
||||
let mut variables = Vec::new();
|
||||
for variable in grammar.variables {
|
||||
let mut nfa = Nfa::new();
|
||||
expand_rule(variable.rule, &mut nfa, 0)?;
|
||||
for (i, variable) in grammar.variables.into_iter().enumerate() {
|
||||
let is_immediate_token = match &variable.rule {
|
||||
Rule::Metadata { params, .. } => params.is_main_token,
|
||||
_ => false,
|
||||
};
|
||||
|
||||
nfa.states.push(NfaState::Accept(i));
|
||||
let last_state = nfa.last_state();
|
||||
expand_rule(&variable.rule, &mut nfa, last_state, false)?;
|
||||
|
||||
if !is_immediate_token {
|
||||
let last_state = nfa.last_state();
|
||||
expand_rule(&separator_rule, &mut nfa, last_state, true)?;
|
||||
}
|
||||
|
||||
variables.push(LexicalVariable {
|
||||
name: variable.name,
|
||||
kind: variable.kind,
|
||||
nfa,
|
||||
start_state: nfa.last_state(),
|
||||
});
|
||||
}
|
||||
let mut separators = Vec::new();
|
||||
for separator in grammar.separators {
|
||||
let mut nfa = Nfa::new();
|
||||
expand_rule(separator, &mut nfa, 0)?;
|
||||
separators.push(nfa);
|
||||
}
|
||||
|
||||
Ok(LexicalGrammar {
|
||||
variables,
|
||||
separators,
|
||||
})
|
||||
Ok(LexicalGrammar { nfa, variables })
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::nfa::NfaCursor;
|
||||
use crate::grammars::Variable;
|
||||
|
||||
fn simulate_nfa<'a>(nfa: &'a Nfa, s: &'a str) -> Option<&'a str> {
|
||||
let mut result = None;
|
||||
let mut char_count = 0;
|
||||
let mut cursor = NfaCursor::new(nfa);
|
||||
for c in s.chars() {
|
||||
if cursor.is_done() {
|
||||
if cursor.finished_ids().count() > 0 {
|
||||
result = Some(&s[0..char_count]);
|
||||
}
|
||||
if cursor.advance(c) {
|
||||
|
|
@ -223,13 +266,13 @@ mod tests {
|
|||
#[test]
|
||||
fn test_rule_expansion() {
|
||||
struct Row {
|
||||
rule: Rule,
|
||||
rules: Vec<Rule>,
|
||||
examples: Vec<(&'static str, Option<&'static str>)>,
|
||||
}
|
||||
|
||||
let table = [
|
||||
Row {
|
||||
rule: Rule::pattern("a|bc"),
|
||||
rules: vec![Rule::pattern("a|bc")],
|
||||
examples: vec![
|
||||
("a12", Some("a")),
|
||||
("bc12", Some("bc")),
|
||||
|
|
@ -238,7 +281,7 @@ mod tests {
|
|||
],
|
||||
},
|
||||
Row {
|
||||
rule: Rule::pattern("(a|b|c)d(e|f|g)h?"),
|
||||
rules: vec![Rule::pattern("(a|b|c)d(e|f|g)h?")],
|
||||
examples: vec![
|
||||
("ade1", Some("ade")),
|
||||
("bdf1", Some("bdf")),
|
||||
|
|
@ -247,11 +290,14 @@ mod tests {
|
|||
],
|
||||
},
|
||||
Row {
|
||||
rule: Rule::pattern("a*"),
|
||||
examples: vec![("aaa1", Some("aaa")), ("b", Some(""))],
|
||||
rules: vec![Rule::pattern("a*")],
|
||||
examples: vec![
|
||||
("aaa1", Some("aaa")),
|
||||
("b", Some("")),
|
||||
],
|
||||
},
|
||||
Row {
|
||||
rule: Rule::pattern("a((bc)+|(de)*)f"),
|
||||
rules: vec![Rule::pattern("a((bc)+|(de)*)f")],
|
||||
examples: vec![
|
||||
("af1", Some("af")),
|
||||
("adedef1", Some("adedef")),
|
||||
|
|
@ -260,32 +306,51 @@ mod tests {
|
|||
],
|
||||
},
|
||||
Row {
|
||||
rule: Rule::pattern("[a-fA-F0-9]+"),
|
||||
examples: vec![("A1ff0", Some("A1ff"))],
|
||||
rules: vec![Rule::pattern("[a-fA-F0-9]+")],
|
||||
examples: vec![
|
||||
("A1ff0", Some("A1ff")),
|
||||
],
|
||||
},
|
||||
Row {
|
||||
rule: Rule::pattern("\\w\\d\\s"),
|
||||
examples: vec![("_0 ", Some("_0 "))],
|
||||
rules: vec![Rule::pattern("\\w\\d\\s")],
|
||||
examples: vec![
|
||||
("_0 ", Some("_0 ")),
|
||||
],
|
||||
},
|
||||
Row {
|
||||
rule: Rule::string("abc"),
|
||||
examples: vec![("abcd", Some("abc")), ("ab", None)],
|
||||
rules: vec![Rule::string("abc")],
|
||||
examples: vec![
|
||||
("abcd", Some("abc")),
|
||||
("ab", None)
|
||||
],
|
||||
},
|
||||
Row {
|
||||
rule: Rule::repeat(Rule::seq(vec![
|
||||
Rule::string("{"),
|
||||
Rule::pattern("[a-f]+"),
|
||||
Rule::string("}"),
|
||||
])),
|
||||
examples: vec![("{a}{", Some("{a}")), ("{a}{d", Some("{a}")), ("ab", None)],
|
||||
rules: vec![
|
||||
Rule::repeat(Rule::seq(vec![
|
||||
Rule::string("{"),
|
||||
Rule::pattern("[a-f]+"),
|
||||
Rule::string("}"),
|
||||
])),
|
||||
],
|
||||
examples: vec![
|
||||
("{a}{", Some("{a}")),
|
||||
("{a}{d", Some("{a}")),
|
||||
("ab", None),
|
||||
],
|
||||
},
|
||||
];
|
||||
|
||||
for Row { rule, examples } in table.iter() {
|
||||
let mut nfa = Nfa::new();
|
||||
expand_rule(rule.clone(), &mut nfa, 0).unwrap();
|
||||
for Row { rules, examples } in &table {
|
||||
let grammar = expand_tokens(ExtractedLexicalGrammar {
|
||||
separators: vec![],
|
||||
variables: rules
|
||||
.into_iter()
|
||||
.map(|rule| Variable::named("", rule.clone()))
|
||||
.collect(),
|
||||
}).unwrap();
|
||||
|
||||
for (haystack, needle) in examples.iter() {
|
||||
assert_eq!(simulate_nfa(&nfa, haystack), *needle);
|
||||
assert_eq!(simulate_nfa(&grammar.nfa, haystack), *needle);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -130,24 +130,24 @@ mod tests {
|
|||
};
|
||||
|
||||
let lexical_grammar = LexicalGrammar {
|
||||
nfa: Nfa::new(),
|
||||
variables: vec![
|
||||
LexicalVariable {
|
||||
name: "t1".to_string(),
|
||||
kind: VariableType::Anonymous,
|
||||
nfa: Nfa::new(),
|
||||
start_state: 0,
|
||||
},
|
||||
LexicalVariable {
|
||||
name: "t2".to_string(),
|
||||
kind: VariableType::Anonymous,
|
||||
nfa: Nfa::new(),
|
||||
start_state: 0,
|
||||
},
|
||||
LexicalVariable {
|
||||
name: "t3".to_string(),
|
||||
kind: VariableType::Anonymous,
|
||||
nfa: Nfa::new(),
|
||||
start_state: 0,
|
||||
}
|
||||
],
|
||||
separators: Vec::new(),
|
||||
};
|
||||
|
||||
let simple_aliases = extract_simple_aliases(&mut syntax_grammar, &lexical_grammar);
|
||||
|
|
|
|||
|
|
@ -30,7 +30,6 @@ pub(crate) struct MetadataParams {
|
|||
pub is_string: bool,
|
||||
pub is_active: bool,
|
||||
pub is_main_token: bool,
|
||||
pub is_excluded: bool,
|
||||
pub alias: Option<Alias>,
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue