Add handling of precedence within tokens
This commit is contained in:
parent
5258ee2e6a
commit
479400e5d3
3 changed files with 670 additions and 267 deletions
366
src/nfa.rs
366
src/nfa.rs
|
|
@ -1,5 +1,8 @@
|
|||
use std::fmt;
|
||||
use std::char;
|
||||
use std::cmp::max;
|
||||
use std::cmp::Ordering;
|
||||
use std::fmt;
|
||||
use std::mem::swap;
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
|
||||
pub enum CharacterSet {
|
||||
|
|
@ -13,14 +16,18 @@ pub enum NfaState {
|
|||
chars: CharacterSet,
|
||||
state_id: u32,
|
||||
is_sep: bool,
|
||||
precedence: i32,
|
||||
},
|
||||
Split(u32, u32),
|
||||
Accept(usize),
|
||||
Accept {
|
||||
variable_index: usize,
|
||||
precedence: i32,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Eq)]
|
||||
pub struct Nfa {
|
||||
pub states: Vec<NfaState>
|
||||
pub states: Vec<NfaState>,
|
||||
}
|
||||
|
||||
impl Default for Nfa {
|
||||
|
|
@ -78,14 +85,57 @@ impl CharacterSet {
|
|||
}
|
||||
}
|
||||
|
||||
pub fn add(self, other: CharacterSet) -> Self {
|
||||
if let (CharacterSet::Include(mut chars), CharacterSet::Include(other_chars)) = (self, other) {
|
||||
chars.extend(other_chars);
|
||||
chars.sort_unstable();
|
||||
chars.dedup();
|
||||
CharacterSet::Include(chars)
|
||||
pub fn add(self, other: &CharacterSet) -> Self {
|
||||
if let CharacterSet::Include(other_chars) = other {
|
||||
if let CharacterSet::Include(mut chars) = self {
|
||||
chars.extend(other_chars);
|
||||
chars.sort_unstable();
|
||||
chars.dedup();
|
||||
return CharacterSet::Include(chars);
|
||||
}
|
||||
}
|
||||
panic!("Called add with a negated character set");
|
||||
}
|
||||
|
||||
pub fn remove_intersection(&mut self, other: &mut CharacterSet) -> CharacterSet {
|
||||
match self {
|
||||
CharacterSet::Include(chars) => match other {
|
||||
CharacterSet::Include(other_chars) => {
|
||||
CharacterSet::Include(remove_chars(chars, other_chars, true))
|
||||
}
|
||||
CharacterSet::Exclude(other_chars) => {
|
||||
let mut removed = remove_chars(chars, other_chars, false);
|
||||
add_chars(other_chars, chars);
|
||||
swap(&mut removed, chars);
|
||||
CharacterSet::Include(removed)
|
||||
}
|
||||
},
|
||||
CharacterSet::Exclude(chars) => match other {
|
||||
CharacterSet::Include(other_chars) => {
|
||||
let mut removed = remove_chars(other_chars, chars, false);
|
||||
add_chars(chars, other_chars);
|
||||
swap(&mut removed, other_chars);
|
||||
CharacterSet::Include(removed)
|
||||
}
|
||||
CharacterSet::Exclude(other_chars) => {
|
||||
let removed = remove_chars(chars, other_chars, true);
|
||||
let mut included_characters = Vec::new();
|
||||
let mut other_included_characters = Vec::new();
|
||||
swap(&mut included_characters, other_chars);
|
||||
swap(&mut other_included_characters, chars);
|
||||
*self = CharacterSet::Include(included_characters);
|
||||
*other = CharacterSet::Include(other_included_characters);
|
||||
CharacterSet::Exclude(removed)
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
if let CharacterSet::Include(c) = self {
|
||||
c.is_empty()
|
||||
} else {
|
||||
panic!("Called add with a negated character set");
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -97,6 +147,84 @@ impl CharacterSet {
|
|||
}
|
||||
}
|
||||
|
||||
impl Ord for CharacterSet {
|
||||
fn cmp(&self, other: &CharacterSet) -> Ordering {
|
||||
match self {
|
||||
CharacterSet::Include(chars) => {
|
||||
if let CharacterSet::Include(other_chars) = other {
|
||||
compare_chars(chars, other_chars)
|
||||
} else {
|
||||
Ordering::Less
|
||||
}
|
||||
}
|
||||
CharacterSet::Exclude(chars) => {
|
||||
if let CharacterSet::Exclude(other_chars) = other {
|
||||
compare_chars(chars, other_chars)
|
||||
} else {
|
||||
Ordering::Greater
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialOrd for CharacterSet {
|
||||
fn partial_cmp(&self, other: &CharacterSet) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
/// Inserts every character of `right` that is missing from `left`.
///
/// Each insertion position is found with a binary search, so `left` is
/// expected to be sorted on entry; it stays sorted afterwards.
fn add_chars(left: &mut Vec<char>, right: &Vec<char>) {
    for &candidate in right.iter() {
        // `Err` carries the index at which the missing character belongs.
        if let Err(insert_at) = left.binary_search(&candidate) {
            left.insert(insert_at, candidate);
        }
    }
}
|
||||
|
||||
/// Removes from `left` every character that also occurs in `right` and
/// returns the removed characters, in the order they appear in `right`.
///
/// When `mutate_right` is `true` the shared characters are dropped from
/// `right` too; otherwise `right` is left exactly as it was.
fn remove_chars(left: &mut Vec<char>, right: &mut Vec<char>, mutate_right: bool) -> Vec<char> {
    let mut shared = Vec::new();
    right.retain(|&candidate| {
        match left.iter().position(|&existing| existing == candidate) {
            Some(index) => {
                left.remove(index);
                shared.push(candidate);
                // Keep the shared character in `right` only when the caller
                // asked for `right` to stay untouched.
                !mutate_right
            }
            None => true,
        }
    });
    shared
}
|
||||
|
||||
/// Orders two character lists lexicographically.
///
/// Elements are compared pairwise from the front; if one list is a prefix of
/// the other, the shorter list sorts first, and two empty lists are equal.
/// This is the ordering the standard library already implements for slices,
/// so the comparison is delegated to it instead of being written by hand.
fn compare_chars(chars: &Vec<char>, other_chars: &Vec<char>) -> Ordering {
    chars.as_slice().cmp(other_chars.as_slice())
}
|
||||
|
||||
impl Nfa {
|
||||
pub fn new() -> Self {
|
||||
Nfa { states: Vec::new() }
|
||||
|
|
@ -124,17 +252,32 @@ impl fmt::Debug for Nfa {
|
|||
|
||||
impl<'a> NfaCursor<'a> {
|
||||
pub fn new(nfa: &'a Nfa, mut states: Vec<u32>) -> Self {
|
||||
let mut result = Self { nfa, state_ids: Vec::new(), in_sep: true };
|
||||
let mut result = Self {
|
||||
nfa,
|
||||
state_ids: Vec::new(),
|
||||
in_sep: true,
|
||||
};
|
||||
result.add_states(&mut states);
|
||||
result
|
||||
}
|
||||
|
||||
pub fn reset(&mut self, mut states: Vec<u32>) {
|
||||
self.state_ids.clear();
|
||||
self.add_states(&mut states);
|
||||
}
|
||||
|
||||
pub fn advance(&mut self, c: char) -> bool {
|
||||
let mut result = false;
|
||||
let mut new_state_ids = Vec::new();
|
||||
let mut any_sep_transitions = false;
|
||||
for current_state_id in &self.state_ids {
|
||||
if let NfaState::Advance { chars, state_id, is_sep } = &self.nfa.states[*current_state_id as usize] {
|
||||
if let NfaState::Advance {
|
||||
chars,
|
||||
state_id,
|
||||
is_sep,
|
||||
..
|
||||
} = &self.nfa.states[*current_state_id as usize]
|
||||
{
|
||||
if chars.contains(c) {
|
||||
if *is_sep {
|
||||
any_sep_transitions = true;
|
||||
|
|
@ -152,16 +295,68 @@ impl<'a> NfaCursor<'a> {
|
|||
result
|
||||
}
|
||||
|
||||
pub fn finished_id(&self) -> Option<usize> {
|
||||
pub fn successors(&self) -> impl Iterator<Item = (&CharacterSet, i32, u32)> {
|
||||
self.state_ids.iter().filter_map(move |id| {
|
||||
if let NfaState::Advance {
|
||||
chars,
|
||||
state_id,
|
||||
precedence,
|
||||
..
|
||||
} = &self.nfa.states[*id as usize]
|
||||
{
|
||||
Some((chars, *precedence, *state_id))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
pub fn grouped_successors(&self) -> Vec<(CharacterSet, i32, Vec<u32>)> {
|
||||
Self::group_successors(self.successors())
|
||||
}
|
||||
|
||||
fn group_successors<'b>(
|
||||
iter: impl Iterator<Item = (&'b CharacterSet, i32, u32)>,
|
||||
) -> Vec<(CharacterSet, i32, Vec<u32>)> {
|
||||
let mut result: Vec<(CharacterSet, i32, Vec<u32>)> = Vec::new();
|
||||
for (chars, prec, state) in iter {
|
||||
let mut chars = chars.clone();
|
||||
let mut i = 0;
|
||||
while i < result.len() {
|
||||
let intersection = result[i].0.remove_intersection(&mut chars);
|
||||
if !intersection.is_empty() {
|
||||
let mut states = result[i].2.clone();
|
||||
let mut precedence = result[i].1;
|
||||
states.push(state);
|
||||
result.insert(i, (intersection, max(precedence, prec), states));
|
||||
i += 1;
|
||||
}
|
||||
i += 1;
|
||||
}
|
||||
if !chars.is_empty() {
|
||||
result.push((chars, prec, vec![state]));
|
||||
}
|
||||
}
|
||||
result.sort_unstable_by(|a, b| a.0.cmp(&b.0));
|
||||
result
|
||||
}
|
||||
|
||||
pub fn finished_id(&self) -> Option<(usize, i32)> {
|
||||
let mut result = None;
|
||||
for state_id in self.state_ids.iter() {
|
||||
if let NfaState::Accept(id) = self.nfa.states[*state_id as usize] {
|
||||
if let NfaState::Accept {
|
||||
variable_index,
|
||||
precedence,
|
||||
} = self.nfa.states[*state_id as usize]
|
||||
{
|
||||
match result {
|
||||
None => {
|
||||
result = Some(id)
|
||||
},
|
||||
Some(existing_id) => if id < existing_id {
|
||||
result = Some(id)
|
||||
None => result = Some((variable_index, precedence)),
|
||||
Some((existing_id, existing_precedence)) => {
|
||||
if precedence > existing_precedence
|
||||
|| (precedence == existing_precedence && variable_index < existing_id)
|
||||
{
|
||||
result = Some((variable_index, precedence))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -202,3 +397,136 @@ impl<'a> NfaCursor<'a> {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Table-driven check of `NfaCursor::group_successors`: each row pairs a
    /// list of `(characters, precedence, state)` transitions with the
    /// disjoint groups expected from it.
    #[test]
    fn test_group_successors() {
        let table = [
            (
                vec![
                    (CharacterSet::empty().add_range('a', 'f'), 0, 1),
                    (CharacterSet::empty().add_range('d', 'i'), 1, 2),
                ],
                vec![
                    (CharacterSet::empty().add_range('a', 'c'), 0, vec![1]),
                    (CharacterSet::empty().add_range('d', 'f'), 1, vec![1, 2]),
                    (CharacterSet::empty().add_range('g', 'i'), 1, vec![2]),
                ],
            ),
            (
                vec![
                    (CharacterSet::empty().add_range('a', 'z'), 0, 1),
                    (CharacterSet::empty().add_char('d'), 0, 2),
                    (CharacterSet::empty().add_char('i'), 0, 3),
                    (CharacterSet::empty().add_char('f'), 0, 4),
                ],
                vec![
                    (
                        CharacterSet::empty()
                            .add_range('a', 'c')
                            .add_char('e')
                            .add_range('g', 'h')
                            .add_range('j', 'z'),
                        0,
                        vec![1],
                    ),
                    (CharacterSet::empty().add_char('d'), 0, vec![1, 2]),
                    (CharacterSet::empty().add_char('f'), 0, vec![1, 4]),
                    (CharacterSet::empty().add_char('i'), 0, vec![1, 3]),
                ],
            ),
        ];

        for row in table.iter() {
            assert_eq!(
                NfaCursor::group_successors(row.0.iter().map(|(c, p, s)| (c, *p, *s))),
                row.1
            );
        }
    }

    /// Exercises `CharacterSet::remove_intersection` for every combination of
    /// included ("whitelist") and negated ("blacklist") sets, in both call
    /// orders where the combination is asymmetric.
    #[test]
    fn test_character_set_intersection() {
        // whitelist - whitelist
        // both sets contain 'c', 'd', 'e', and 'f'
        let mut a = CharacterSet::empty().add_range('a', 'f');
        let mut b = CharacterSet::empty().add_range('c', 'h');
        assert_eq!(
            a.remove_intersection(&mut b),
            CharacterSet::empty().add_range('c', 'f')
        );
        assert_eq!(a, CharacterSet::empty().add_range('a', 'b'));
        assert_eq!(b, CharacterSet::empty().add_range('g', 'h'));

        let mut a = CharacterSet::empty().add_range('a', 'f');
        let mut b = CharacterSet::empty().add_range('c', 'h');
        assert_eq!(
            b.remove_intersection(&mut a),
            CharacterSet::empty().add_range('c', 'f')
        );
        assert_eq!(a, CharacterSet::empty().add_range('a', 'b'));
        assert_eq!(b, CharacterSet::empty().add_range('g', 'h'));

        // whitelist - blacklist
        // both sets contain 'e', 'f', and 'm'
        let mut a = CharacterSet::empty()
            .add_range('c', 'h')
            .add_range('k', 'm');
        let mut b = CharacterSet::empty()
            .add_range('a', 'd')
            .add_range('g', 'l')
            .negate();
        assert_eq!(
            a.remove_intersection(&mut b),
            CharacterSet::Include(vec!['e', 'f', 'm'])
        );
        assert_eq!(a, CharacterSet::Include(vec!['c', 'd', 'g', 'h', 'k', 'l']));
        assert_eq!(b, CharacterSet::empty().add_range('a', 'm').negate());

        let mut a = CharacterSet::empty()
            .add_range('c', 'h')
            .add_range('k', 'm');
        let mut b = CharacterSet::empty()
            .add_range('a', 'd')
            .add_range('g', 'l')
            .negate();
        assert_eq!(
            b.remove_intersection(&mut a),
            CharacterSet::Include(vec!['e', 'f', 'm'])
        );
        assert_eq!(a, CharacterSet::Include(vec!['c', 'd', 'g', 'h', 'k', 'l']));
        assert_eq!(b, CharacterSet::empty().add_range('a', 'm').negate());

        // blacklist - blacklist
        // both sets exclude 'c', 'd', and 'e'
        let mut a = CharacterSet::empty().add_range('a', 'e').negate();
        let mut b = CharacterSet::empty().add_range('c', 'h').negate();
        assert_eq!(
            a.remove_intersection(&mut b),
            CharacterSet::Exclude(vec!['c', 'd', 'e'])
        );
        assert_eq!(a, CharacterSet::Include(vec!['f', 'g', 'h']));
        assert_eq!(b, CharacterSet::Include(vec!['a', 'b']));
    }
}
|
||||
|
|
|
|||
|
|
@ -7,8 +7,18 @@ use regex_syntax::ast::{
|
|||
parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, RepetitionKind, RepetitionRange,
|
||||
};
|
||||
|
||||
pub(super) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result<LexicalGrammar> {
|
||||
let mut nfa = Nfa::new();
|
||||
struct NfaBuilder {
|
||||
nfa: Nfa,
|
||||
is_sep: bool,
|
||||
precedence_stack: Vec<i32>,
|
||||
}
|
||||
|
||||
pub(crate) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result<LexicalGrammar> {
|
||||
let mut builder = NfaBuilder {
|
||||
nfa: Nfa::new(),
|
||||
is_sep: true,
|
||||
precedence_stack: vec![0],
|
||||
};
|
||||
|
||||
let separator_rule = if grammar.separators.len() > 0 {
|
||||
grammar.separators.push(Rule::Blank);
|
||||
|
|
@ -24,281 +34,325 @@ pub(super) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result<Lexi
|
|||
_ => false,
|
||||
};
|
||||
|
||||
nfa.states.push(NfaState::Accept(i));
|
||||
let last_state_id = nfa.last_state_id();
|
||||
expand_rule(&variable.rule, &mut nfa, last_state_id, false).map_err(|e| match e {
|
||||
Error::RegexError(msg) => Error::RegexError(format!("Rule {} {}", variable.name, msg)),
|
||||
_ => e,
|
||||
})?;
|
||||
builder.is_sep = false;
|
||||
builder.nfa.states.push(NfaState::Accept {
|
||||
variable_index: i,
|
||||
precedence: 0,
|
||||
});
|
||||
let last_state_id = builder.nfa.last_state_id();
|
||||
builder
|
||||
.expand_rule(&variable.rule, last_state_id)
|
||||
.map_err(|e| match e {
|
||||
Error::RegexError(msg) => {
|
||||
Error::RegexError(format!("Rule {} {}", variable.name, msg))
|
||||
}
|
||||
_ => e,
|
||||
})?;
|
||||
|
||||
if !is_immediate_token {
|
||||
let last_state_id = nfa.last_state_id();
|
||||
expand_rule(&separator_rule, &mut nfa, last_state_id, true)?;
|
||||
builder.is_sep = true;
|
||||
let last_state_id = builder.nfa.last_state_id();
|
||||
builder.expand_rule(&separator_rule, last_state_id)?;
|
||||
}
|
||||
|
||||
variables.push(LexicalVariable {
|
||||
name: variable.name,
|
||||
kind: variable.kind,
|
||||
start_state: nfa.last_state_id(),
|
||||
start_state: builder.nfa.last_state_id(),
|
||||
});
|
||||
}
|
||||
|
||||
Ok(LexicalGrammar { nfa, variables })
|
||||
Ok(LexicalGrammar {
|
||||
nfa: builder.nfa,
|
||||
variables,
|
||||
})
|
||||
}
|
||||
|
||||
fn expand_rule(rule: &Rule, nfa: &mut Nfa, mut next_state_id: u32, is_sep: bool) -> Result<bool> {
|
||||
match rule {
|
||||
Rule::Pattern(s) => {
|
||||
let ast = parse::Parser::new()
|
||||
.parse(&s)
|
||||
.map_err(|e| Error::GrammarError(e.to_string()))?;
|
||||
expand_regex(&ast, nfa, next_state_id, is_sep)
|
||||
}
|
||||
Rule::String(s) => {
|
||||
for c in s.chars().rev() {
|
||||
nfa.prepend(|last_state_id| NfaState::Advance {
|
||||
chars: CharacterSet::empty().add_char(c),
|
||||
state_id: last_state_id,
|
||||
is_sep,
|
||||
});
|
||||
impl NfaBuilder {
|
||||
fn expand_rule(&mut self, rule: &Rule, mut next_state_id: u32) -> Result<bool> {
|
||||
match rule {
|
||||
Rule::Pattern(s) => {
|
||||
let ast = parse::Parser::new()
|
||||
.parse(&s)
|
||||
.map_err(|e| Error::GrammarError(e.to_string()))?;
|
||||
self.expand_regex(&ast, next_state_id)
|
||||
}
|
||||
Ok(s.len() > 0)
|
||||
}
|
||||
Rule::Choice(elements) => {
|
||||
let mut alternative_state_ids = Vec::new();
|
||||
for element in elements {
|
||||
if expand_rule(element, nfa, next_state_id, is_sep)? {
|
||||
alternative_state_ids.push(nfa.last_state_id());
|
||||
} else {
|
||||
alternative_state_ids.push(next_state_id);
|
||||
Rule::String(s) => {
|
||||
for c in s.chars().rev() {
|
||||
self.push_advance(CharacterSet::empty().add_char(c), self.nfa.last_state_id());
|
||||
}
|
||||
Ok(s.len() > 0)
|
||||
}
|
||||
alternative_state_ids.retain(|i| *i != nfa.last_state_id());
|
||||
for alternative_state_id in alternative_state_ids {
|
||||
nfa.prepend(|last_state_id| NfaState::Split(last_state_id, alternative_state_id));
|
||||
}
|
||||
Ok(true)
|
||||
}
|
||||
Rule::Seq(elements) => {
|
||||
let mut result = false;
|
||||
for element in elements.into_iter().rev() {
|
||||
if expand_rule(element, nfa, next_state_id, is_sep)? {
|
||||
result = true;
|
||||
Rule::Choice(elements) => {
|
||||
let mut alternative_state_ids = Vec::new();
|
||||
for element in elements {
|
||||
if self.expand_rule(element, next_state_id)? {
|
||||
alternative_state_ids.push(self.nfa.last_state_id());
|
||||
} else {
|
||||
alternative_state_ids.push(next_state_id);
|
||||
}
|
||||
}
|
||||
next_state_id = nfa.last_state_id();
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
Rule::Repeat(rule) => {
|
||||
nfa.states.push(NfaState::Accept(0)); // Placeholder for split
|
||||
let split_state_id = nfa.last_state_id();
|
||||
if expand_rule(rule, nfa, split_state_id, is_sep)? {
|
||||
nfa.states[split_state_id as usize] =
|
||||
NfaState::Split(nfa.last_state_id(), next_state_id);
|
||||
Ok(true)
|
||||
} else {
|
||||
Ok(false)
|
||||
}
|
||||
}
|
||||
Rule::Metadata { rule, .. } => {
|
||||
// TODO - implement precedence
|
||||
expand_rule(rule, nfa, next_state_id, is_sep)
|
||||
}
|
||||
Rule::Blank => Ok(false),
|
||||
_ => Err(Error::grammar(&format!("Unexpected rule {:?}", rule))),
|
||||
}
|
||||
}
|
||||
|
||||
fn expand_one_or_more(ast: &Ast, nfa: &mut Nfa, next_state_id: u32, is_sep: bool) -> Result<bool> {
|
||||
nfa.states.push(NfaState::Accept(0)); // Placeholder for split
|
||||
let split_state_id = nfa.last_state_id();
|
||||
if expand_regex(&ast, nfa, split_state_id, is_sep)? {
|
||||
nfa.states[split_state_id as usize] = NfaState::Split(nfa.last_state_id(), next_state_id);
|
||||
Ok(true)
|
||||
} else {
|
||||
nfa.states.pop();
|
||||
Ok(false)
|
||||
}
|
||||
}
|
||||
|
||||
fn expand_zero_or_one(ast: &Ast, nfa: &mut Nfa, next_state_id: u32, is_sep: bool) -> Result<bool> {
|
||||
if expand_regex(ast, nfa, next_state_id, is_sep)? {
|
||||
nfa.prepend(|last_state_id| NfaState::Split(next_state_id, last_state_id));
|
||||
Ok(true)
|
||||
} else {
|
||||
Ok(false)
|
||||
}
|
||||
}
|
||||
|
||||
fn expand_zero_or_more(ast: &Ast, nfa: &mut Nfa, next_state_id: u32, is_sep: bool) -> Result<bool> {
|
||||
if expand_one_or_more(&ast, nfa, next_state_id, is_sep)? {
|
||||
nfa.prepend(|last_state_id| NfaState::Split(last_state_id, next_state_id));
|
||||
Ok(true)
|
||||
} else {
|
||||
Ok(false)
|
||||
}
|
||||
}
|
||||
|
||||
fn expand_count(
|
||||
ast: &Ast,
|
||||
count: u32,
|
||||
nfa: &mut Nfa,
|
||||
mut next_state_id: u32,
|
||||
is_sep: bool,
|
||||
) -> Result<bool> {
|
||||
let mut result = false;
|
||||
for _ in 0..count {
|
||||
if expand_regex(ast, nfa, next_state_id, is_sep)? {
|
||||
result = true;
|
||||
next_state_id = nfa.last_state_id();
|
||||
}
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_id: u32, is_sep: bool) -> Result<bool> {
|
||||
match ast {
|
||||
Ast::Empty(_) => Ok(false),
|
||||
Ast::Flags(_) => Err(Error::regex("Flags are not supported")),
|
||||
Ast::Literal(literal) => {
|
||||
nfa.states.push(NfaState::Advance {
|
||||
chars: CharacterSet::Include(vec![literal.c]),
|
||||
state_id: next_state_id,
|
||||
is_sep,
|
||||
});
|
||||
Ok(true)
|
||||
}
|
||||
Ast::Dot(_) => {
|
||||
nfa.states.push(NfaState::Advance {
|
||||
chars: CharacterSet::Exclude(vec!['\n']),
|
||||
state_id: next_state_id,
|
||||
is_sep,
|
||||
});
|
||||
Ok(true)
|
||||
}
|
||||
Ast::Assertion(_) => Err(Error::regex("Assertions are not supported")),
|
||||
Ast::Class(class) => match class {
|
||||
Class::Unicode(_) => Err(Error::regex("Unicode character classes are not supported")),
|
||||
Class::Perl(class) => {
|
||||
nfa.states.push(NfaState::Advance {
|
||||
chars: expand_perl_character_class(&class.kind),
|
||||
state_id: next_state_id,
|
||||
is_sep,
|
||||
});
|
||||
Ok(true)
|
||||
}
|
||||
Class::Bracketed(class) => match &class.kind {
|
||||
ClassSet::Item(item) => {
|
||||
let character_set = expand_character_class(&item)?;
|
||||
nfa.states.push(NfaState::Advance {
|
||||
chars: character_set,
|
||||
state_id: next_state_id,
|
||||
is_sep,
|
||||
alternative_state_ids.retain(|i| *i != self.nfa.last_state_id());
|
||||
for alternative_state_id in alternative_state_ids {
|
||||
self.nfa.prepend(|last_state_id| {
|
||||
NfaState::Split(last_state_id, alternative_state_id)
|
||||
});
|
||||
Ok(true)
|
||||
}
|
||||
ClassSet::BinaryOp(_) => Err(Error::regex(
|
||||
"Binary operators in character classes aren't supported",
|
||||
)),
|
||||
},
|
||||
},
|
||||
Ast::Repetition(repetition) => match repetition.op.kind {
|
||||
RepetitionKind::ZeroOrOne => {
|
||||
expand_zero_or_one(&repetition.ast, nfa, next_state_id, is_sep)
|
||||
Ok(true)
|
||||
}
|
||||
RepetitionKind::OneOrMore => {
|
||||
expand_one_or_more(&repetition.ast, nfa, next_state_id, is_sep)
|
||||
Rule::Seq(elements) => {
|
||||
let mut result = false;
|
||||
for element in elements.into_iter().rev() {
|
||||
if self.expand_rule(element, next_state_id)? {
|
||||
result = true;
|
||||
}
|
||||
next_state_id = self.nfa.last_state_id();
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
RepetitionKind::ZeroOrMore => {
|
||||
expand_zero_or_more(&repetition.ast, nfa, next_state_id, is_sep)
|
||||
}
|
||||
RepetitionKind::Range(RepetitionRange::Exactly(count)) => {
|
||||
expand_count(&repetition.ast, count, nfa, next_state_id, is_sep)
|
||||
}
|
||||
RepetitionKind::Range(RepetitionRange::AtLeast(min)) => {
|
||||
if expand_zero_or_more(&repetition.ast, nfa, next_state_id, is_sep)? {
|
||||
expand_count(&repetition.ast, min, nfa, next_state_id, is_sep)
|
||||
Rule::Repeat(rule) => {
|
||||
self.nfa.states.push(NfaState::Accept {
|
||||
variable_index: 0,
|
||||
precedence: 0,
|
||||
}); // Placeholder for split
|
||||
let split_state_id = self.nfa.last_state_id();
|
||||
if self.expand_rule(rule, split_state_id)? {
|
||||
self.nfa.states[split_state_id as usize] =
|
||||
NfaState::Split(self.nfa.last_state_id(), next_state_id);
|
||||
Ok(true)
|
||||
} else {
|
||||
Ok(false)
|
||||
}
|
||||
}
|
||||
RepetitionKind::Range(RepetitionRange::Bounded(min, max)) => {
|
||||
let mut result = expand_count(&repetition.ast, min, nfa, next_state_id, is_sep)?;
|
||||
for _ in min..max {
|
||||
if result {
|
||||
next_state_id = nfa.last_state_id();
|
||||
Rule::Metadata { rule, params } => {
|
||||
if let Some(precedence) = params.precedence {
|
||||
self.precedence_stack.push(precedence);
|
||||
}
|
||||
let result = self.expand_rule(rule, next_state_id);
|
||||
if params.precedence.is_some() {
|
||||
self.precedence_stack.pop();
|
||||
}
|
||||
result
|
||||
}
|
||||
Rule::Blank => Ok(false),
|
||||
_ => Err(Error::grammar(&format!("Unexpected rule {:?}", rule))),
|
||||
}
|
||||
}
|
||||
|
||||
fn expand_regex(&mut self, ast: &Ast, mut next_state_id: u32) -> Result<bool> {
|
||||
match ast {
|
||||
Ast::Empty(_) => Ok(false),
|
||||
Ast::Flags(_) => Err(Error::regex("Flags are not supported")),
|
||||
Ast::Literal(literal) => {
|
||||
self.push_advance(CharacterSet::Include(vec![literal.c]), next_state_id);
|
||||
Ok(true)
|
||||
}
|
||||
Ast::Dot(_) => {
|
||||
self.push_advance(CharacterSet::Exclude(vec!['\n']), next_state_id);
|
||||
Ok(true)
|
||||
}
|
||||
Ast::Assertion(_) => Err(Error::regex("Assertions are not supported")),
|
||||
Ast::Class(class) => match class {
|
||||
Class::Unicode(_) => {
|
||||
Err(Error::regex("Unicode character classes are not supported"))
|
||||
}
|
||||
Class::Perl(class) => {
|
||||
self.push_advance(self.expand_perl_character_class(&class.kind), next_state_id);
|
||||
Ok(true)
|
||||
}
|
||||
Class::Bracketed(class) => match &class.kind {
|
||||
ClassSet::Item(item) => {
|
||||
self.push_advance(self.expand_character_class(&item)?, next_state_id);
|
||||
Ok(true)
|
||||
}
|
||||
if expand_zero_or_one(&repetition.ast, nfa, next_state_id, is_sep)? {
|
||||
ClassSet::BinaryOp(_) => Err(Error::regex(
|
||||
"Binary operators in character classes aren't supported",
|
||||
)),
|
||||
},
|
||||
},
|
||||
Ast::Repetition(repetition) => match repetition.op.kind {
|
||||
RepetitionKind::ZeroOrOne => {
|
||||
self.expand_zero_or_one(&repetition.ast, next_state_id)
|
||||
}
|
||||
RepetitionKind::OneOrMore => {
|
||||
self.expand_one_or_more(&repetition.ast, next_state_id)
|
||||
}
|
||||
RepetitionKind::ZeroOrMore => {
|
||||
self.expand_zero_or_more(&repetition.ast, next_state_id)
|
||||
}
|
||||
RepetitionKind::Range(RepetitionRange::Exactly(count)) => {
|
||||
self.expand_count(&repetition.ast, count, next_state_id)
|
||||
}
|
||||
RepetitionKind::Range(RepetitionRange::AtLeast(min)) => {
|
||||
if self.expand_zero_or_more(&repetition.ast, next_state_id)? {
|
||||
self.expand_count(&repetition.ast, min, next_state_id)
|
||||
} else {
|
||||
Ok(false)
|
||||
}
|
||||
}
|
||||
RepetitionKind::Range(RepetitionRange::Bounded(min, max)) => {
|
||||
let mut result = self.expand_count(&repetition.ast, min, next_state_id)?;
|
||||
for _ in min..max {
|
||||
if result {
|
||||
next_state_id = self.nfa.last_state_id();
|
||||
}
|
||||
if self.expand_zero_or_one(&repetition.ast, next_state_id)? {
|
||||
result = true;
|
||||
}
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
},
|
||||
Ast::Group(group) => self.expand_regex(&group.ast, self.nfa.last_state_id()),
|
||||
Ast::Alternation(alternation) => {
|
||||
let mut alternative_state_ids = Vec::new();
|
||||
for ast in alternation.asts.iter() {
|
||||
if self.expand_regex(&ast, next_state_id)? {
|
||||
alternative_state_ids.push(self.nfa.last_state_id());
|
||||
} else {
|
||||
alternative_state_ids.push(next_state_id);
|
||||
}
|
||||
}
|
||||
alternative_state_ids.sort_unstable();
|
||||
alternative_state_ids.dedup();
|
||||
alternative_state_ids.retain(|i| *i != self.nfa.last_state_id());
|
||||
|
||||
for alternative_state_id in alternative_state_ids {
|
||||
self.nfa.prepend(|last_state_id| {
|
||||
NfaState::Split(last_state_id, alternative_state_id)
|
||||
});
|
||||
}
|
||||
Ok(true)
|
||||
}
|
||||
Ast::Concat(concat) => {
|
||||
let mut result = false;
|
||||
for ast in concat.asts.iter().rev() {
|
||||
if self.expand_regex(&ast, next_state_id)? {
|
||||
result = true;
|
||||
next_state_id = self.nfa.last_state_id();
|
||||
}
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
},
|
||||
Ast::Group(group) => expand_regex(&group.ast, nfa, nfa.last_state_id(), is_sep),
|
||||
Ast::Alternation(alternation) => {
|
||||
let mut alternative_state_ids = Vec::new();
|
||||
for ast in alternation.asts.iter() {
|
||||
if expand_regex(&ast, nfa, next_state_id, is_sep)? {
|
||||
alternative_state_ids.push(nfa.last_state_id());
|
||||
} else {
|
||||
alternative_state_ids.push(next_state_id);
|
||||
}
|
||||
}
|
||||
alternative_state_ids.retain(|i| *i != nfa.last_state_id());
|
||||
for alternative_state_id in alternative_state_ids {
|
||||
nfa.prepend(|last_state_id| NfaState::Split(last_state_id, alternative_state_id));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn expand_one_or_more(&mut self, ast: &Ast, next_state_id: u32) -> Result<bool> {
|
||||
self.nfa.states.push(NfaState::Accept {
|
||||
variable_index: 0,
|
||||
precedence: 0,
|
||||
}); // Placeholder for split
|
||||
let split_state_id = self.nfa.last_state_id();
|
||||
if self.expand_regex(&ast, split_state_id)? {
|
||||
self.nfa.states[split_state_id as usize] =
|
||||
NfaState::Split(self.nfa.last_state_id(), next_state_id);
|
||||
Ok(true)
|
||||
} else {
|
||||
self.nfa.states.pop();
|
||||
Ok(false)
|
||||
}
|
||||
Ast::Concat(concat) => {
|
||||
let mut result = false;
|
||||
for ast in concat.asts.iter().rev() {
|
||||
if expand_regex(&ast, nfa, next_state_id, is_sep)? {
|
||||
result = true;
|
||||
next_state_id = nfa.last_state_id();
|
||||
}
|
||||
|
||||
fn expand_zero_or_one(&mut self, ast: &Ast, next_state_id: u32) -> Result<bool> {
|
||||
if self.expand_regex(ast, next_state_id)? {
|
||||
self.nfa
|
||||
.prepend(|last_state_id| NfaState::Split(next_state_id, last_state_id));
|
||||
Ok(true)
|
||||
} else {
|
||||
Ok(false)
|
||||
}
|
||||
}
|
||||
|
||||
fn expand_zero_or_more(&mut self, ast: &Ast, next_state_id: u32) -> Result<bool> {
|
||||
if self.expand_one_or_more(&ast, next_state_id)? {
|
||||
self.nfa
|
||||
.prepend(|last_state_id| NfaState::Split(last_state_id, next_state_id));
|
||||
Ok(true)
|
||||
} else {
|
||||
Ok(false)
|
||||
}
|
||||
}
|
||||
|
||||
/// Expands `ast` `count` times in a row, ending at `next_state_id`.
///
/// After every repetition that reports added states, the following
/// repetition is chained onto the NFA's most recent state. Returns
/// `Ok(true)` if any repetition reported added states.
fn expand_count(&mut self, ast: &Ast, count: u32, mut next_state_id: u32) -> Result<bool> {
    let mut any_added = false;
    for _ in 0..count {
        let added = self.expand_regex(ast, next_state_id)?;
        if added {
            next_state_id = self.nfa.last_state_id();
        }
        any_added |= added;
    }
    Ok(any_added)
}
|
||||
|
||||
fn expand_character_class(&self, item: &ClassSetItem) -> Result<CharacterSet> {
|
||||
match item {
|
||||
ClassSetItem::Empty(_) => Ok(CharacterSet::Include(Vec::new())),
|
||||
ClassSetItem::Literal(literal) => Ok(CharacterSet::Include(vec![literal.c])),
|
||||
ClassSetItem::Range(range) => {
|
||||
Ok(CharacterSet::empty().add_range(range.start.c, range.end.c))
|
||||
}
|
||||
ClassSetItem::Union(union) => {
|
||||
let mut result = CharacterSet::empty();
|
||||
for item in &union.items {
|
||||
result = result.add(&self.expand_character_class(&item)?);
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
Ok(result)
|
||||
ClassSetItem::Perl(class) => Ok(self.expand_perl_character_class(&class.kind)),
|
||||
_ => Err(Error::regex(&format!(
|
||||
"Unsupported character class syntax {:?}",
|
||||
item
|
||||
))),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn expand_character_class(item: &ClassSetItem) -> Result<CharacterSet> {
|
||||
match item {
|
||||
ClassSetItem::Empty(_) => Ok(CharacterSet::Include(Vec::new())),
|
||||
ClassSetItem::Literal(literal) => Ok(CharacterSet::Include(vec![literal.c])),
|
||||
ClassSetItem::Range(range) => {
|
||||
Ok(CharacterSet::empty().add_range(range.start.c, range.end.c))
|
||||
fn expand_perl_character_class(&self, item: &ClassPerlKind) -> CharacterSet {
|
||||
match item {
|
||||
ClassPerlKind::Digit => CharacterSet::empty().add_range('0', '9'),
|
||||
ClassPerlKind::Space => CharacterSet::empty()
|
||||
.add_char(' ')
|
||||
.add_char('\t')
|
||||
.add_char('\r')
|
||||
.add_char('\n'),
|
||||
ClassPerlKind::Word => CharacterSet::empty()
|
||||
.add_char('_')
|
||||
.add_range('A', 'Z')
|
||||
.add_range('a', 'z')
|
||||
.add_range('0', '9'),
|
||||
}
|
||||
ClassSetItem::Union(union) => {
|
||||
let mut result = CharacterSet::empty();
|
||||
for item in &union.items {
|
||||
result = result.add(expand_character_class(&item)?);
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
ClassSetItem::Perl(class) => Ok(expand_perl_character_class(&class.kind)),
|
||||
_ => Err(Error::regex(&format!(
|
||||
"Unsupported character class syntax {:?}",
|
||||
item
|
||||
))),
|
||||
}
|
||||
}
|
||||
|
||||
fn expand_perl_character_class(item: &ClassPerlKind) -> CharacterSet {
|
||||
match item {
|
||||
ClassPerlKind::Digit => CharacterSet::empty().add_range('0', '9'),
|
||||
ClassPerlKind::Space => CharacterSet::empty()
|
||||
.add_char(' ')
|
||||
.add_char('\t')
|
||||
.add_char('\r')
|
||||
.add_char('\n'),
|
||||
ClassPerlKind::Word => CharacterSet::empty()
|
||||
.add_char('_')
|
||||
.add_range('A', 'Z')
|
||||
.add_range('a', 'z')
|
||||
.add_range('0', '9'),
|
||||
fn push_advance(&mut self, chars: CharacterSet, state_id: u32) {
|
||||
let precedence = *self.precedence_stack.last().unwrap();
|
||||
self.add_precedence(precedence, vec![state_id]);
|
||||
self.nfa.states.push(NfaState::Advance {
|
||||
chars,
|
||||
state_id,
|
||||
precedence,
|
||||
is_sep: self.is_sep,
|
||||
});
|
||||
}
|
||||
|
||||
fn add_precedence(&mut self, prec: i32, mut state_ids: Vec<u32>) {
|
||||
let mut i = 0;
|
||||
while i < state_ids.len() {
|
||||
let state_id = state_ids[i];
|
||||
let (left, right) = match &mut self.nfa.states[state_id as usize] {
|
||||
NfaState::Accept {precedence, ..} => {
|
||||
*precedence = prec;
|
||||
return;
|
||||
},
|
||||
NfaState::Split(left, right) => (*left, *right),
|
||||
_ => return
|
||||
};
|
||||
if !state_ids.contains(&left) {
|
||||
state_ids.push(left);
|
||||
}
|
||||
if !state_ids.contains(&right) {
|
||||
state_ids.push(right);
|
||||
}
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -313,11 +367,15 @@ mod tests {
|
|||
let mut cursor = NfaCursor::new(&grammar.nfa, start_states);
|
||||
|
||||
let mut result = None;
|
||||
let mut result_precedence = 0;
|
||||
let mut start_char = 0;
|
||||
let mut end_char = 0;
|
||||
for c in s.chars() {
|
||||
if let Some(id) = cursor.finished_id() {
|
||||
result = Some((id, &s[start_char..end_char]));
|
||||
if let Some((id, finished_precedence)) = cursor.finished_id() {
|
||||
if result.is_none() || result_precedence <= finished_precedence {
|
||||
result = Some((id, &s[start_char..end_char]));
|
||||
result_precedence = finished_precedence;
|
||||
}
|
||||
}
|
||||
if cursor.advance(c) {
|
||||
end_char += 1;
|
||||
|
|
@ -329,8 +387,11 @@ mod tests {
|
|||
}
|
||||
}
|
||||
|
||||
if let Some(id) = cursor.finished_id() {
|
||||
result = Some((id, &s[start_char..end_char]));
|
||||
if let Some((id, finished_precedence)) = cursor.finished_id() {
|
||||
if result.is_none() || result_precedence <= finished_precedence {
|
||||
result = Some((id, &s[start_char..end_char]));
|
||||
result_precedence = finished_precedence;
|
||||
}
|
||||
}
|
||||
|
||||
result
|
||||
|
|
@ -443,6 +504,20 @@ mod tests {
|
|||
(" \\\na", Some((0, "a"))),
|
||||
],
|
||||
},
|
||||
// shorter tokens with higher precedence
|
||||
Row {
|
||||
rules: vec![
|
||||
Rule::prec(2, Rule::pattern("abc")),
|
||||
Rule::prec(1, Rule::pattern("ab[cd]e")),
|
||||
Rule::pattern("[a-e]+"),
|
||||
],
|
||||
separators: vec![Rule::string("\\\n"), Rule::pattern("\\s")],
|
||||
examples: vec![
|
||||
("abceef", Some((0, "abc"))),
|
||||
("abdeef", Some((1, "abde"))),
|
||||
("aeeeef", Some((2, "aeeee"))),
|
||||
],
|
||||
},
|
||||
];
|
||||
|
||||
for Row {
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ mod intern_symbols;
|
|||
mod process_inlines;
|
||||
|
||||
use self::expand_repeats::expand_repeats;
|
||||
use self::expand_tokens::expand_tokens;
|
||||
pub(crate) use self::expand_tokens::expand_tokens;
|
||||
use self::extract_simple_aliases::extract_simple_aliases;
|
||||
use self::extract_tokens::extract_tokens;
|
||||
use self::flatten_grammar::flatten_grammar;
|
||||
|
|
@ -19,7 +19,7 @@ use crate::grammars::{
|
|||
};
|
||||
use crate::rules::{AliasMap, Rule, Symbol};
|
||||
|
||||
pub(self) struct IntermediateGrammar<T, U> {
|
||||
pub(crate) struct IntermediateGrammar<T, U> {
|
||||
variables: Vec<Variable>,
|
||||
extra_tokens: Vec<T>,
|
||||
expected_conflicts: Vec<Vec<Symbol>>,
|
||||
|
|
@ -28,14 +28,14 @@ pub(self) struct IntermediateGrammar<T, U> {
|
|||
word_token: Option<Symbol>,
|
||||
}
|
||||
|
||||
pub(self) type InternedGrammar = IntermediateGrammar<Rule, Variable>;
|
||||
pub(crate) type InternedGrammar = IntermediateGrammar<Rule, Variable>;
|
||||
|
||||
pub(self) type ExtractedSyntaxGrammar = IntermediateGrammar<Symbol, ExternalToken>;
|
||||
pub(crate) type ExtractedSyntaxGrammar = IntermediateGrammar<Symbol, ExternalToken>;
|
||||
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
pub(self) struct ExtractedLexicalGrammar {
|
||||
variables: Vec<Variable>,
|
||||
separators: Vec<Rule>,
|
||||
pub(crate) struct ExtractedLexicalGrammar {
|
||||
pub variables: Vec<Variable>,
|
||||
pub separators: Vec<Rule>,
|
||||
}
|
||||
|
||||
pub(crate) fn prepare_grammar(
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue