Add handling of precedence within tokens

This commit is contained in:
Max Brunsfeld 2018-12-29 13:56:00 -08:00
parent 5258ee2e6a
commit 479400e5d3
3 changed files with 670 additions and 267 deletions

View file

@ -1,5 +1,8 @@
use std::fmt;
use std::char;
use std::cmp::max;
use std::cmp::Ordering;
use std::fmt;
use std::mem::swap;
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum CharacterSet {
@ -13,14 +16,18 @@ pub enum NfaState {
chars: CharacterSet,
state_id: u32,
is_sep: bool,
precedence: i32,
},
Split(u32, u32),
Accept(usize),
Accept {
variable_index: usize,
precedence: i32,
},
}
#[derive(PartialEq, Eq)]
pub struct Nfa {
pub states: Vec<NfaState>
pub states: Vec<NfaState>,
}
impl Default for Nfa {
@ -78,14 +85,57 @@ impl CharacterSet {
}
}
pub fn add(self, other: CharacterSet) -> Self {
if let (CharacterSet::Include(mut chars), CharacterSet::Include(other_chars)) = (self, other) {
chars.extend(other_chars);
chars.sort_unstable();
chars.dedup();
CharacterSet::Include(chars)
pub fn add(self, other: &CharacterSet) -> Self {
if let CharacterSet::Include(other_chars) = other {
if let CharacterSet::Include(mut chars) = self {
chars.extend(other_chars);
chars.sort_unstable();
chars.dedup();
return CharacterSet::Include(chars);
}
}
panic!("Called add with a negated character set");
}
/// Splits out the characters that `self` and `other` have in common.
///
/// The shared characters are removed from both operands (both are mutated in
/// place) and returned as a new set. Each of the four combinations of
/// included / negated operands is handled separately.
pub fn remove_intersection(&mut self, other: &mut CharacterSet) -> CharacterSet {
match self {
CharacterSet::Include(chars) => match other {
// Include / Include: the chars listed in both vectors are stripped from
// both vectors and returned.
CharacterSet::Include(other_chars) => {
CharacterSet::Include(remove_chars(chars, other_chars, true))
}
// Include / Exclude: after `remove_chars`, `chars` holds the listed chars that
// `other` does not exclude (the intersection) and `removed` holds the listed
// chars that `other` excludes.
CharacterSet::Exclude(other_chars) => {
let mut removed = remove_chars(chars, other_chars, false);
// Extend `other`'s exclusion list with the intersection so that `other`
// no longer contains it.
add_chars(other_chars, chars);
// `self` keeps the chars `other` excluded; the intersection is returned.
swap(&mut removed, chars);
CharacterSet::Include(removed)
}
},
CharacterSet::Exclude(chars) => match other {
// Exclude / Include: mirror image of the arm above, with the operand roles
// reversed.
CharacterSet::Include(other_chars) => {
let mut removed = remove_chars(other_chars, chars, false);
add_chars(chars, other_chars);
swap(&mut removed, other_chars);
CharacterSet::Include(removed)
}
// Exclude / Exclude: both operands are rewritten as `Include` sets. `self`
// receives the chars that only `other` excluded and `other` receives the chars
// that only `self` excluded.
// NOTE(review): the returned set excludes only the chars that BOTH operands
// excluded, so it still contains the chars just moved into `self` and `other`.
// The intersection of two negated sets would exclude the union of both lists —
// confirm whether this is intended.
CharacterSet::Exclude(other_chars) => {
let removed = remove_chars(chars, other_chars, true);
let mut included_characters = Vec::new();
let mut other_included_characters = Vec::new();
swap(&mut included_characters, other_chars);
swap(&mut other_included_characters, chars);
*self = CharacterSet::Include(included_characters);
*other = CharacterSet::Include(other_included_characters);
CharacterSet::Exclude(removed)
}
},
}
}
pub fn is_empty(&self) -> bool {
if let CharacterSet::Include(c) = self {
c.is_empty()
} else {
panic!("Called add with a negated character set");
false
}
}
@ -97,6 +147,84 @@ impl CharacterSet {
}
}
/// Total order over character sets: every `Include` set sorts before every
/// `Exclude` set, and two sets of the same kind are ordered by `compare_chars`
/// on their character vectors.
impl Ord for CharacterSet {
    fn cmp(&self, other: &CharacterSet) -> Ordering {
        match (self, other) {
            (CharacterSet::Include(lhs), CharacterSet::Include(rhs)) => compare_chars(lhs, rhs),
            (CharacterSet::Exclude(lhs), CharacterSet::Exclude(rhs)) => compare_chars(lhs, rhs),
            (CharacterSet::Include(_), CharacterSet::Exclude(_)) => Ordering::Less,
            (CharacterSet::Exclude(_), CharacterSet::Include(_)) => Ordering::Greater,
        }
    }
}
/// Partial ordering that always defers to the total order defined by `Ord`.
impl PartialOrd for CharacterSet {
    fn partial_cmp(&self, other: &CharacterSet) -> Option<Ordering> {
        let ordering = Ord::cmp(self, other);
        Some(ordering)
    }
}
/// Inserts every char of `right` that is missing from `left` into `left`.
///
/// Each char is located with a binary search and inserted at the position the
/// search reports, so `left` must already be sorted; it stays sorted afterwards.
fn add_chars(left: &mut Vec<char>, right: &Vec<char>) {
    for &ch in right.iter() {
        if let Err(slot) = left.binary_search(&ch) {
            left.insert(slot, ch);
        }
    }
}
/// Removes from `left` every char that also appears in `right` and returns the
/// removed chars in the order they occur in `right`.
///
/// When `mutate_right` is true the shared chars are dropped from `right` as
/// well; otherwise `right` is left unchanged.
fn remove_chars(left: &mut Vec<char>, right: &mut Vec<char>, mutate_right: bool) -> Vec<char> {
    let mut shared = Vec::new();
    right.retain(|candidate| match left.iter().position(|c| *c == *candidate) {
        Some(at) => {
            left.remove(at);
            shared.push(*candidate);
            // Keep the char in `right` only when the caller asked us not to touch it.
            !mutate_right
        }
        None => true,
    });
    shared
}
/// Compares two character vectors lexicographically.
///
/// Elements are compared pairwise from the front; the first unequal pair
/// decides the result. If one vector is a prefix of the other, the shorter one
/// is `Less`. Two empty vectors are `Equal`.
///
/// This is exactly the ordering the standard library implements for `Vec<T>`,
/// so the comparison is delegated to it rather than hand-rolled.
fn compare_chars(chars: &Vec<char>, other_chars: &Vec<char>) -> Ordering {
    chars.cmp(other_chars)
}
impl Nfa {
pub fn new() -> Self {
Nfa { states: Vec::new() }
@ -124,17 +252,32 @@ impl fmt::Debug for Nfa {
impl<'a> NfaCursor<'a> {
pub fn new(nfa: &'a Nfa, mut states: Vec<u32>) -> Self {
let mut result = Self { nfa, state_ids: Vec::new(), in_sep: true };
let mut result = Self {
nfa,
state_ids: Vec::new(),
in_sep: true,
};
result.add_states(&mut states);
result
}
/// Replaces the cursor's current set of NFA states: `state_ids` is cleared and
/// then repopulated by passing `states` to `add_states`.
// NOTE(review): `in_sep` is not touched here, whereas `new` starts it as `true` —
// confirm that a reset cursor is meant to keep its previous separator flag.
pub fn reset(&mut self, mut states: Vec<u32>) {
self.state_ids.clear();
self.add_states(&mut states);
}
pub fn advance(&mut self, c: char) -> bool {
let mut result = false;
let mut new_state_ids = Vec::new();
let mut any_sep_transitions = false;
for current_state_id in &self.state_ids {
if let NfaState::Advance { chars, state_id, is_sep } = &self.nfa.states[*current_state_id as usize] {
if let NfaState::Advance {
chars,
state_id,
is_sep,
..
} = &self.nfa.states[*current_state_id as usize]
{
if chars.contains(c) {
if *is_sep {
any_sep_transitions = true;
@ -152,16 +295,68 @@ impl<'a> NfaCursor<'a> {
result
}
pub fn finished_id(&self) -> Option<usize> {
/// Iterates over the outgoing transitions of the cursor's current states.
///
/// Only `Advance` states produce an item; every other kind of state is
/// skipped. Each item is `(character set, precedence, target state id)`.
pub fn successors(&self) -> impl Iterator<Item = (&CharacterSet, i32, u32)> {
    self.state_ids
        .iter()
        .filter_map(move |id| match &self.nfa.states[*id as usize] {
            NfaState::Advance {
                chars,
                state_id,
                precedence,
                ..
            } => Some((chars, *precedence, *state_id)),
            _ => None,
        })
}
/// Returns this cursor's successor transitions (see `successors`) after they
/// have been grouped by `group_successors`.
pub fn grouped_successors(&self) -> Vec<(CharacterSet, i32, Vec<u32>)> {
Self::group_successors(self.successors())
}
/// Groups transitions so that the character sets in the result do not share
/// characters with one another.
///
/// Each input item is `(character set, precedence, target state)`. Whenever an
/// incoming set intersects an existing group, the shared characters are split
/// off into their own group, which carries the states of both and the larger
/// of the two precedences. The result is sorted by character set.
fn group_successors<'b>(
    iter: impl Iterator<Item = (&'b CharacterSet, i32, u32)>,
) -> Vec<(CharacterSet, i32, Vec<u32>)> {
    let mut result: Vec<(CharacterSet, i32, Vec<u32>)> = Vec::new();
    for (chars, prec, state) in iter {
        let mut chars = chars.clone();
        let mut i = 0;
        while i < result.len() {
            // Strips the shared chars out of both `result[i].0` and `chars`.
            let intersection = result[i].0.remove_intersection(&mut chars);
            if !intersection.is_empty() {
                if result[i].0.is_empty() {
                    // The existing group was entirely contained in the incoming set.
                    // Reuse its slot for the intersection instead of leaving an
                    // empty group behind in the result.
                    result[i].0 = intersection;
                    result[i].1 = max(result[i].1, prec);
                    result[i].2.push(state);
                } else {
                    let mut states = result[i].2.clone();
                    states.push(state);
                    let precedence = max(result[i].1, prec);
                    result.insert(i, (intersection, precedence, states));
                    // Skip past the entry that was just shifted one slot right.
                    i += 1;
                }
            }
            i += 1;
        }
        // Whatever is left of the incoming set overlaps no existing group.
        if !chars.is_empty() {
            result.push((chars, prec, vec![state]));
        }
    }
    result.sort_unstable_by(|a, b| a.0.cmp(&b.0));
    result
}
pub fn finished_id(&self) -> Option<(usize, i32)> {
let mut result = None;
for state_id in self.state_ids.iter() {
if let NfaState::Accept(id) = self.nfa.states[*state_id as usize] {
if let NfaState::Accept {
variable_index,
precedence,
} = self.nfa.states[*state_id as usize]
{
match result {
None => {
result = Some(id)
},
Some(existing_id) => if id < existing_id {
result = Some(id)
None => result = Some((variable_index, precedence)),
Some((existing_id, existing_precedence)) => {
if precedence > existing_precedence
|| (precedence == existing_precedence && variable_index < existing_id)
{
result = Some((variable_index, precedence))
}
}
}
}
@ -202,3 +397,136 @@ impl<'a> NfaCursor<'a> {
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_group_successors() {
    // Each row pairs the input transitions `(chars, precedence, state)` with the
    // groups that `group_successors` is expected to return for them.
    let table = [
        (
            vec![
                (CharacterSet::empty().add_range('a', 'f'), 0, 1),
                (CharacterSet::empty().add_range('d', 'i'), 1, 2),
            ],
            vec![
                (CharacterSet::empty().add_range('a', 'c'), 0, vec![1]),
                (CharacterSet::empty().add_range('d', 'f'), 1, vec![1, 2]),
                (CharacterSet::empty().add_range('g', 'i'), 1, vec![2]),
            ],
        ),
        (
            vec![
                (CharacterSet::empty().add_range('a', 'z'), 0, 1),
                (CharacterSet::empty().add_char('d'), 0, 2),
                (CharacterSet::empty().add_char('i'), 0, 3),
                (CharacterSet::empty().add_char('f'), 0, 4),
            ],
            vec![
                (
                    CharacterSet::empty()
                        .add_range('a', 'c')
                        .add_char('e')
                        .add_range('g', 'h')
                        .add_range('j', 'z'),
                    0,
                    vec![1],
                ),
                (CharacterSet::empty().add_char('d'), 0, vec![1, 2]),
                (CharacterSet::empty().add_char('f'), 0, vec![1, 4]),
                (CharacterSet::empty().add_char('i'), 0, vec![1, 3]),
            ],
        ),
    ];

    for row in table.iter() {
        assert_eq!(
            NfaCursor::group_successors(row.0.iter().map(|(c, p, s)| (c, *p, *s))),
            row.1
        );
    }
}
#[test]
// Exercises `remove_intersection` for each pairing of included ("whitelist")
// and negated ("blacklist") sets. Where the operands differ, the call is made
// in both directions and must leave `a` and `b` in the same final state.
fn test_character_set_intersection() {
// whitelist - whitelist
// both sets contain 'c', 'd', 'e', and 'f'
let mut a = CharacterSet::empty().add_range('a', 'f');
let mut b = CharacterSet::empty().add_range('c', 'h');
assert_eq!(
a.remove_intersection(&mut b),
CharacterSet::empty().add_range('c', 'f')
);
assert_eq!(a, CharacterSet::empty().add_range('a', 'b'));
assert_eq!(b, CharacterSet::empty().add_range('g', 'h'));
// same operands, called in the opposite direction
let mut a = CharacterSet::empty().add_range('a', 'f');
let mut b = CharacterSet::empty().add_range('c', 'h');
assert_eq!(
b.remove_intersection(&mut a),
CharacterSet::empty().add_range('c', 'f')
);
assert_eq!(a, CharacterSet::empty().add_range('a', 'b'));
assert_eq!(b, CharacterSet::empty().add_range('g', 'h'));
// whitelist - blacklist
// both sets contain 'e', 'f', and 'm'
let mut a = CharacterSet::empty()
.add_range('c', 'h')
.add_range('k', 'm');
let mut b = CharacterSet::empty()
.add_range('a', 'd')
.add_range('g', 'l')
.negate();
assert_eq!(
a.remove_intersection(&mut b),
CharacterSet::Include(vec!['e', 'f', 'm'])
);
assert_eq!(a, CharacterSet::Include(vec!['c', 'd', 'g', 'h', 'k', 'l']));
assert_eq!(b, CharacterSet::empty().add_range('a', 'm').negate());
// same operands, called in the opposite direction
let mut a = CharacterSet::empty()
.add_range('c', 'h')
.add_range('k', 'm');
let mut b = CharacterSet::empty()
.add_range('a', 'd')
.add_range('g', 'l')
.negate();
assert_eq!(
b.remove_intersection(&mut a),
CharacterSet::Include(vec!['e', 'f', 'm'])
);
assert_eq!(a, CharacterSet::Include(vec!['c', 'd', 'g', 'h', 'k', 'l']));
assert_eq!(b, CharacterSet::empty().add_range('a', 'm').negate());
// blacklist - blacklist
// both sets exclude 'c', 'd', and 'e'
// NOTE(review): the expected return value excludes only the shared chars, so it
// still contains the chars asserted for the new `a` ('f'..'h') and `b` ('a', 'b')
// below — confirm that this overlap is the intended result.
let mut a = CharacterSet::empty().add_range('a', 'e').negate();
let mut b = CharacterSet::empty().add_range('c', 'h').negate();
assert_eq!(
a.remove_intersection(&mut b),
CharacterSet::Exclude(vec!['c', 'd', 'e'])
);
assert_eq!(a, CharacterSet::Include(vec!['f', 'g', 'h']));
assert_eq!(b, CharacterSet::Include(vec!['a', 'b']));
}
}

View file

@ -7,8 +7,18 @@ use regex_syntax::ast::{
parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, RepetitionKind, RepetitionRange,
};
pub(super) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result<LexicalGrammar> {
let mut nfa = Nfa::new();
/// Mutable state used while expanding token rules into NFA states.
struct NfaBuilder {
// The NFA being built; new states are appended to `nfa.states`.
nfa: Nfa,
// Copied into the `is_sep` field of every `Advance` state created by
// `push_advance`.
is_sep: bool,
// Stack of precedences. `push_advance` applies the top entry, and a
// `Rule::Metadata` that carries a precedence pushes it for the duration of its
// nested rule. `expand_tokens` seeds the stack with a single `0`.
precedence_stack: Vec<i32>,
}
pub(crate) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result<LexicalGrammar> {
let mut builder = NfaBuilder {
nfa: Nfa::new(),
is_sep: true,
precedence_stack: vec![0],
};
let separator_rule = if grammar.separators.len() > 0 {
grammar.separators.push(Rule::Blank);
@ -24,281 +34,325 @@ pub(super) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result<Lexi
_ => false,
};
nfa.states.push(NfaState::Accept(i));
let last_state_id = nfa.last_state_id();
expand_rule(&variable.rule, &mut nfa, last_state_id, false).map_err(|e| match e {
Error::RegexError(msg) => Error::RegexError(format!("Rule {} {}", variable.name, msg)),
_ => e,
})?;
builder.is_sep = false;
builder.nfa.states.push(NfaState::Accept {
variable_index: i,
precedence: 0,
});
let last_state_id = builder.nfa.last_state_id();
builder
.expand_rule(&variable.rule, last_state_id)
.map_err(|e| match e {
Error::RegexError(msg) => {
Error::RegexError(format!("Rule {} {}", variable.name, msg))
}
_ => e,
})?;
if !is_immediate_token {
let last_state_id = nfa.last_state_id();
expand_rule(&separator_rule, &mut nfa, last_state_id, true)?;
builder.is_sep = true;
let last_state_id = builder.nfa.last_state_id();
builder.expand_rule(&separator_rule, last_state_id)?;
}
variables.push(LexicalVariable {
name: variable.name,
kind: variable.kind,
start_state: nfa.last_state_id(),
start_state: builder.nfa.last_state_id(),
});
}
Ok(LexicalGrammar { nfa, variables })
Ok(LexicalGrammar {
nfa: builder.nfa,
variables,
})
}
fn expand_rule(rule: &Rule, nfa: &mut Nfa, mut next_state_id: u32, is_sep: bool) -> Result<bool> {
match rule {
Rule::Pattern(s) => {
let ast = parse::Parser::new()
.parse(&s)
.map_err(|e| Error::GrammarError(e.to_string()))?;
expand_regex(&ast, nfa, next_state_id, is_sep)
}
Rule::String(s) => {
for c in s.chars().rev() {
nfa.prepend(|last_state_id| NfaState::Advance {
chars: CharacterSet::empty().add_char(c),
state_id: last_state_id,
is_sep,
});
impl NfaBuilder {
fn expand_rule(&mut self, rule: &Rule, mut next_state_id: u32) -> Result<bool> {
match rule {
Rule::Pattern(s) => {
let ast = parse::Parser::new()
.parse(&s)
.map_err(|e| Error::GrammarError(e.to_string()))?;
self.expand_regex(&ast, next_state_id)
}
Ok(s.len() > 0)
}
Rule::Choice(elements) => {
let mut alternative_state_ids = Vec::new();
for element in elements {
if expand_rule(element, nfa, next_state_id, is_sep)? {
alternative_state_ids.push(nfa.last_state_id());
} else {
alternative_state_ids.push(next_state_id);
Rule::String(s) => {
for c in s.chars().rev() {
self.push_advance(CharacterSet::empty().add_char(c), self.nfa.last_state_id());
}
Ok(s.len() > 0)
}
alternative_state_ids.retain(|i| *i != nfa.last_state_id());
for alternative_state_id in alternative_state_ids {
nfa.prepend(|last_state_id| NfaState::Split(last_state_id, alternative_state_id));
}
Ok(true)
}
Rule::Seq(elements) => {
let mut result = false;
for element in elements.into_iter().rev() {
if expand_rule(element, nfa, next_state_id, is_sep)? {
result = true;
Rule::Choice(elements) => {
let mut alternative_state_ids = Vec::new();
for element in elements {
if self.expand_rule(element, next_state_id)? {
alternative_state_ids.push(self.nfa.last_state_id());
} else {
alternative_state_ids.push(next_state_id);
}
}
next_state_id = nfa.last_state_id();
}
Ok(result)
}
Rule::Repeat(rule) => {
nfa.states.push(NfaState::Accept(0)); // Placeholder for split
let split_state_id = nfa.last_state_id();
if expand_rule(rule, nfa, split_state_id, is_sep)? {
nfa.states[split_state_id as usize] =
NfaState::Split(nfa.last_state_id(), next_state_id);
Ok(true)
} else {
Ok(false)
}
}
Rule::Metadata { rule, .. } => {
// TODO - implement precedence
expand_rule(rule, nfa, next_state_id, is_sep)
}
Rule::Blank => Ok(false),
_ => Err(Error::grammar(&format!("Unexpected rule {:?}", rule))),
}
}
fn expand_one_or_more(ast: &Ast, nfa: &mut Nfa, next_state_id: u32, is_sep: bool) -> Result<bool> {
nfa.states.push(NfaState::Accept(0)); // Placeholder for split
let split_state_id = nfa.last_state_id();
if expand_regex(&ast, nfa, split_state_id, is_sep)? {
nfa.states[split_state_id as usize] = NfaState::Split(nfa.last_state_id(), next_state_id);
Ok(true)
} else {
nfa.states.pop();
Ok(false)
}
}
fn expand_zero_or_one(ast: &Ast, nfa: &mut Nfa, next_state_id: u32, is_sep: bool) -> Result<bool> {
if expand_regex(ast, nfa, next_state_id, is_sep)? {
nfa.prepend(|last_state_id| NfaState::Split(next_state_id, last_state_id));
Ok(true)
} else {
Ok(false)
}
}
fn expand_zero_or_more(ast: &Ast, nfa: &mut Nfa, next_state_id: u32, is_sep: bool) -> Result<bool> {
if expand_one_or_more(&ast, nfa, next_state_id, is_sep)? {
nfa.prepend(|last_state_id| NfaState::Split(last_state_id, next_state_id));
Ok(true)
} else {
Ok(false)
}
}
fn expand_count(
ast: &Ast,
count: u32,
nfa: &mut Nfa,
mut next_state_id: u32,
is_sep: bool,
) -> Result<bool> {
let mut result = false;
for _ in 0..count {
if expand_regex(ast, nfa, next_state_id, is_sep)? {
result = true;
next_state_id = nfa.last_state_id();
}
}
Ok(result)
}
fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_id: u32, is_sep: bool) -> Result<bool> {
match ast {
Ast::Empty(_) => Ok(false),
Ast::Flags(_) => Err(Error::regex("Flags are not supported")),
Ast::Literal(literal) => {
nfa.states.push(NfaState::Advance {
chars: CharacterSet::Include(vec![literal.c]),
state_id: next_state_id,
is_sep,
});
Ok(true)
}
Ast::Dot(_) => {
nfa.states.push(NfaState::Advance {
chars: CharacterSet::Exclude(vec!['\n']),
state_id: next_state_id,
is_sep,
});
Ok(true)
}
Ast::Assertion(_) => Err(Error::regex("Assertions are not supported")),
Ast::Class(class) => match class {
Class::Unicode(_) => Err(Error::regex("Unicode character classes are not supported")),
Class::Perl(class) => {
nfa.states.push(NfaState::Advance {
chars: expand_perl_character_class(&class.kind),
state_id: next_state_id,
is_sep,
});
Ok(true)
}
Class::Bracketed(class) => match &class.kind {
ClassSet::Item(item) => {
let character_set = expand_character_class(&item)?;
nfa.states.push(NfaState::Advance {
chars: character_set,
state_id: next_state_id,
is_sep,
alternative_state_ids.retain(|i| *i != self.nfa.last_state_id());
for alternative_state_id in alternative_state_ids {
self.nfa.prepend(|last_state_id| {
NfaState::Split(last_state_id, alternative_state_id)
});
Ok(true)
}
ClassSet::BinaryOp(_) => Err(Error::regex(
"Binary operators in character classes aren't supported",
)),
},
},
Ast::Repetition(repetition) => match repetition.op.kind {
RepetitionKind::ZeroOrOne => {
expand_zero_or_one(&repetition.ast, nfa, next_state_id, is_sep)
Ok(true)
}
RepetitionKind::OneOrMore => {
expand_one_or_more(&repetition.ast, nfa, next_state_id, is_sep)
Rule::Seq(elements) => {
let mut result = false;
for element in elements.into_iter().rev() {
if self.expand_rule(element, next_state_id)? {
result = true;
}
next_state_id = self.nfa.last_state_id();
}
Ok(result)
}
RepetitionKind::ZeroOrMore => {
expand_zero_or_more(&repetition.ast, nfa, next_state_id, is_sep)
}
RepetitionKind::Range(RepetitionRange::Exactly(count)) => {
expand_count(&repetition.ast, count, nfa, next_state_id, is_sep)
}
RepetitionKind::Range(RepetitionRange::AtLeast(min)) => {
if expand_zero_or_more(&repetition.ast, nfa, next_state_id, is_sep)? {
expand_count(&repetition.ast, min, nfa, next_state_id, is_sep)
Rule::Repeat(rule) => {
self.nfa.states.push(NfaState::Accept {
variable_index: 0,
precedence: 0,
}); // Placeholder for split
let split_state_id = self.nfa.last_state_id();
if self.expand_rule(rule, split_state_id)? {
self.nfa.states[split_state_id as usize] =
NfaState::Split(self.nfa.last_state_id(), next_state_id);
Ok(true)
} else {
Ok(false)
}
}
RepetitionKind::Range(RepetitionRange::Bounded(min, max)) => {
let mut result = expand_count(&repetition.ast, min, nfa, next_state_id, is_sep)?;
for _ in min..max {
if result {
next_state_id = nfa.last_state_id();
Rule::Metadata { rule, params } => {
if let Some(precedence) = params.precedence {
self.precedence_stack.push(precedence);
}
let result = self.expand_rule(rule, next_state_id);
if params.precedence.is_some() {
self.precedence_stack.pop();
}
result
}
Rule::Blank => Ok(false),
_ => Err(Error::grammar(&format!("Unexpected rule {:?}", rule))),
}
}
fn expand_regex(&mut self, ast: &Ast, mut next_state_id: u32) -> Result<bool> {
match ast {
Ast::Empty(_) => Ok(false),
Ast::Flags(_) => Err(Error::regex("Flags are not supported")),
Ast::Literal(literal) => {
self.push_advance(CharacterSet::Include(vec![literal.c]), next_state_id);
Ok(true)
}
Ast::Dot(_) => {
self.push_advance(CharacterSet::Exclude(vec!['\n']), next_state_id);
Ok(true)
}
Ast::Assertion(_) => Err(Error::regex("Assertions are not supported")),
Ast::Class(class) => match class {
Class::Unicode(_) => {
Err(Error::regex("Unicode character classes are not supported"))
}
Class::Perl(class) => {
self.push_advance(self.expand_perl_character_class(&class.kind), next_state_id);
Ok(true)
}
Class::Bracketed(class) => match &class.kind {
ClassSet::Item(item) => {
self.push_advance(self.expand_character_class(&item)?, next_state_id);
Ok(true)
}
if expand_zero_or_one(&repetition.ast, nfa, next_state_id, is_sep)? {
ClassSet::BinaryOp(_) => Err(Error::regex(
"Binary operators in character classes aren't supported",
)),
},
},
Ast::Repetition(repetition) => match repetition.op.kind {
RepetitionKind::ZeroOrOne => {
self.expand_zero_or_one(&repetition.ast, next_state_id)
}
RepetitionKind::OneOrMore => {
self.expand_one_or_more(&repetition.ast, next_state_id)
}
RepetitionKind::ZeroOrMore => {
self.expand_zero_or_more(&repetition.ast, next_state_id)
}
RepetitionKind::Range(RepetitionRange::Exactly(count)) => {
self.expand_count(&repetition.ast, count, next_state_id)
}
RepetitionKind::Range(RepetitionRange::AtLeast(min)) => {
if self.expand_zero_or_more(&repetition.ast, next_state_id)? {
self.expand_count(&repetition.ast, min, next_state_id)
} else {
Ok(false)
}
}
RepetitionKind::Range(RepetitionRange::Bounded(min, max)) => {
let mut result = self.expand_count(&repetition.ast, min, next_state_id)?;
for _ in min..max {
if result {
next_state_id = self.nfa.last_state_id();
}
if self.expand_zero_or_one(&repetition.ast, next_state_id)? {
result = true;
}
}
Ok(result)
}
},
Ast::Group(group) => self.expand_regex(&group.ast, self.nfa.last_state_id()),
Ast::Alternation(alternation) => {
let mut alternative_state_ids = Vec::new();
for ast in alternation.asts.iter() {
if self.expand_regex(&ast, next_state_id)? {
alternative_state_ids.push(self.nfa.last_state_id());
} else {
alternative_state_ids.push(next_state_id);
}
}
alternative_state_ids.sort_unstable();
alternative_state_ids.dedup();
alternative_state_ids.retain(|i| *i != self.nfa.last_state_id());
for alternative_state_id in alternative_state_ids {
self.nfa.prepend(|last_state_id| {
NfaState::Split(last_state_id, alternative_state_id)
});
}
Ok(true)
}
Ast::Concat(concat) => {
let mut result = false;
for ast in concat.asts.iter().rev() {
if self.expand_regex(&ast, next_state_id)? {
result = true;
next_state_id = self.nfa.last_state_id();
}
}
Ok(result)
}
},
Ast::Group(group) => expand_regex(&group.ast, nfa, nfa.last_state_id(), is_sep),
Ast::Alternation(alternation) => {
let mut alternative_state_ids = Vec::new();
for ast in alternation.asts.iter() {
if expand_regex(&ast, nfa, next_state_id, is_sep)? {
alternative_state_ids.push(nfa.last_state_id());
} else {
alternative_state_ids.push(next_state_id);
}
}
alternative_state_ids.retain(|i| *i != nfa.last_state_id());
for alternative_state_id in alternative_state_ids {
nfa.prepend(|last_state_id| NfaState::Split(last_state_id, alternative_state_id));
}
}
}
fn expand_one_or_more(&mut self, ast: &Ast, next_state_id: u32) -> Result<bool> {
self.nfa.states.push(NfaState::Accept {
variable_index: 0,
precedence: 0,
}); // Placeholder for split
let split_state_id = self.nfa.last_state_id();
if self.expand_regex(&ast, split_state_id)? {
self.nfa.states[split_state_id as usize] =
NfaState::Split(self.nfa.last_state_id(), next_state_id);
Ok(true)
} else {
self.nfa.states.pop();
Ok(false)
}
Ast::Concat(concat) => {
let mut result = false;
for ast in concat.asts.iter().rev() {
if expand_regex(&ast, nfa, next_state_id, is_sep)? {
result = true;
next_state_id = nfa.last_state_id();
}
fn expand_zero_or_one(&mut self, ast: &Ast, next_state_id: u32) -> Result<bool> {
if self.expand_regex(ast, next_state_id)? {
self.nfa
.prepend(|last_state_id| NfaState::Split(next_state_id, last_state_id));
Ok(true)
} else {
Ok(false)
}
}
fn expand_zero_or_more(&mut self, ast: &Ast, next_state_id: u32) -> Result<bool> {
if self.expand_one_or_more(&ast, next_state_id)? {
self.nfa
.prepend(|last_state_id| NfaState::Split(last_state_id, next_state_id));
Ok(true)
} else {
Ok(false)
}
}
fn expand_count(&mut self, ast: &Ast, count: u32, mut next_state_id: u32) -> Result<bool> {
let mut result = false;
for _ in 0..count {
if self.expand_regex(ast, next_state_id)? {
result = true;
next_state_id = self.nfa.last_state_id();
}
}
Ok(result)
}
fn expand_character_class(&self, item: &ClassSetItem) -> Result<CharacterSet> {
match item {
ClassSetItem::Empty(_) => Ok(CharacterSet::Include(Vec::new())),
ClassSetItem::Literal(literal) => Ok(CharacterSet::Include(vec![literal.c])),
ClassSetItem::Range(range) => {
Ok(CharacterSet::empty().add_range(range.start.c, range.end.c))
}
ClassSetItem::Union(union) => {
let mut result = CharacterSet::empty();
for item in &union.items {
result = result.add(&self.expand_character_class(&item)?);
}
Ok(result)
}
Ok(result)
ClassSetItem::Perl(class) => Ok(self.expand_perl_character_class(&class.kind)),
_ => Err(Error::regex(&format!(
"Unsupported character class syntax {:?}",
item
))),
}
}
}
fn expand_character_class(item: &ClassSetItem) -> Result<CharacterSet> {
match item {
ClassSetItem::Empty(_) => Ok(CharacterSet::Include(Vec::new())),
ClassSetItem::Literal(literal) => Ok(CharacterSet::Include(vec![literal.c])),
ClassSetItem::Range(range) => {
Ok(CharacterSet::empty().add_range(range.start.c, range.end.c))
fn expand_perl_character_class(&self, item: &ClassPerlKind) -> CharacterSet {
match item {
ClassPerlKind::Digit => CharacterSet::empty().add_range('0', '9'),
ClassPerlKind::Space => CharacterSet::empty()
.add_char(' ')
.add_char('\t')
.add_char('\r')
.add_char('\n'),
ClassPerlKind::Word => CharacterSet::empty()
.add_char('_')
.add_range('A', 'Z')
.add_range('a', 'z')
.add_range('0', '9'),
}
ClassSetItem::Union(union) => {
let mut result = CharacterSet::empty();
for item in &union.items {
result = result.add(expand_character_class(&item)?);
}
Ok(result)
}
ClassSetItem::Perl(class) => Ok(expand_perl_character_class(&class.kind)),
_ => Err(Error::regex(&format!(
"Unsupported character class syntax {:?}",
item
))),
}
}
fn expand_perl_character_class(item: &ClassPerlKind) -> CharacterSet {
match item {
ClassPerlKind::Digit => CharacterSet::empty().add_range('0', '9'),
ClassPerlKind::Space => CharacterSet::empty()
.add_char(' ')
.add_char('\t')
.add_char('\r')
.add_char('\n'),
ClassPerlKind::Word => CharacterSet::empty()
.add_char('_')
.add_range('A', 'Z')
.add_range('a', 'z')
.add_range('0', '9'),
/// Appends an `Advance` state that consumes `chars` and then moves to
/// `state_id`.
///
/// The new state carries the precedence on top of `precedence_stack` and the
/// builder's current `is_sep` flag. Before it is pushed, the same precedence is
/// handed to `add_precedence`, starting from `state_id`.
///
/// Panics if `precedence_stack` is empty.
fn push_advance(&mut self, chars: CharacterSet, state_id: u32) {
    let current_precedence = *self.precedence_stack.last().unwrap();
    self.add_precedence(current_precedence, vec![state_id]);
    let advance = NfaState::Advance {
        chars,
        state_id,
        is_sep: self.is_sep,
        precedence: current_precedence,
    };
    self.nfa.states.push(advance);
}
/// Walks the NFA from the given `state_ids`, following `Split` states, and
/// stores `prec` in the first `Accept` state it reaches.
///
/// `state_ids` doubles as the work list: both targets of every visited `Split`
/// are appended unless they are already in the list, so each state is visited
/// at most once.
fn add_precedence(&mut self, prec: i32, mut state_ids: Vec<u32>) {
let mut i = 0;
while i < state_ids.len() {
let state_id = state_ids[i];
let (left, right) = match &mut self.nfa.states[state_id as usize] {
// The first `Accept` state reached gets the precedence; the walk ends there.
NfaState::Accept {precedence, ..} => {
*precedence = prec;
return;
},
NfaState::Split(left, right) => (*left, *right),
// Any other state (i.e. an `Advance`) stops the whole walk, including
// states that are still queued.
// NOTE(review): confirm that dropping the queued states is intended.
_ => return
};
if !state_ids.contains(&left) {
state_ids.push(left);
}
if !state_ids.contains(&right) {
state_ids.push(right);
}
i += 1;
}
}
}
@ -313,11 +367,15 @@ mod tests {
let mut cursor = NfaCursor::new(&grammar.nfa, start_states);
let mut result = None;
let mut result_precedence = 0;
let mut start_char = 0;
let mut end_char = 0;
for c in s.chars() {
if let Some(id) = cursor.finished_id() {
result = Some((id, &s[start_char..end_char]));
if let Some((id, finished_precedence)) = cursor.finished_id() {
if result.is_none() || result_precedence <= finished_precedence {
result = Some((id, &s[start_char..end_char]));
result_precedence = finished_precedence;
}
}
if cursor.advance(c) {
end_char += 1;
@ -329,8 +387,11 @@ mod tests {
}
}
if let Some(id) = cursor.finished_id() {
result = Some((id, &s[start_char..end_char]));
if let Some((id, finished_precedence)) = cursor.finished_id() {
if result.is_none() || result_precedence <= finished_precedence {
result = Some((id, &s[start_char..end_char]));
result_precedence = finished_precedence;
}
}
result
@ -443,6 +504,20 @@ mod tests {
(" \\\na", Some((0, "a"))),
],
},
// shorter tokens with higher precedence
Row {
rules: vec![
Rule::prec(2, Rule::pattern("abc")),
Rule::prec(1, Rule::pattern("ab[cd]e")),
Rule::pattern("[a-e]+"),
],
separators: vec![Rule::string("\\\n"), Rule::pattern("\\s")],
examples: vec![
("abceef", Some((0, "abc"))),
("abdeef", Some((1, "abde"))),
("aeeeef", Some((2, "aeeee"))),
],
},
];
for Row {

View file

@ -7,7 +7,7 @@ mod intern_symbols;
mod process_inlines;
use self::expand_repeats::expand_repeats;
use self::expand_tokens::expand_tokens;
pub(crate) use self::expand_tokens::expand_tokens;
use self::extract_simple_aliases::extract_simple_aliases;
use self::extract_tokens::extract_tokens;
use self::flatten_grammar::flatten_grammar;
@ -19,7 +19,7 @@ use crate::grammars::{
};
use crate::rules::{AliasMap, Rule, Symbol};
pub(self) struct IntermediateGrammar<T, U> {
pub(crate) struct IntermediateGrammar<T, U> {
variables: Vec<Variable>,
extra_tokens: Vec<T>,
expected_conflicts: Vec<Vec<Symbol>>,
@ -28,14 +28,14 @@ pub(self) struct IntermediateGrammar<T, U> {
word_token: Option<Symbol>,
}
pub(self) type InternedGrammar = IntermediateGrammar<Rule, Variable>;
pub(crate) type InternedGrammar = IntermediateGrammar<Rule, Variable>;
pub(self) type ExtractedSyntaxGrammar = IntermediateGrammar<Symbol, ExternalToken>;
pub(crate) type ExtractedSyntaxGrammar = IntermediateGrammar<Symbol, ExternalToken>;
#[derive(Debug, PartialEq, Eq)]
pub(self) struct ExtractedLexicalGrammar {
variables: Vec<Variable>,
separators: Vec<Rule>,
pub(crate) struct ExtractedLexicalGrammar {
pub variables: Vec<Variable>,
pub separators: Vec<Rule>,
}
pub(crate) fn prepare_grammar(