From 479400e5d3e7fdc1395868c0f19fe6415cb68bda Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sat, 29 Dec 2018 13:56:00 -0800 Subject: [PATCH] Add handling of precedence within tokens --- src/nfa.rs | 366 +++++++++++++++++- src/prepare_grammar/expand_tokens.rs | 557 +++++++++++++++------------ src/prepare_grammar/mod.rs | 14 +- 3 files changed, 670 insertions(+), 267 deletions(-) diff --git a/src/nfa.rs b/src/nfa.rs index f6acb67a..4a4fa17b 100644 --- a/src/nfa.rs +++ b/src/nfa.rs @@ -1,5 +1,8 @@ -use std::fmt; use std::char; +use std::cmp::max; +use std::cmp::Ordering; +use std::fmt; +use std::mem::swap; #[derive(Clone, Debug, PartialEq, Eq, Hash)] pub enum CharacterSet { @@ -13,14 +16,18 @@ pub enum NfaState { chars: CharacterSet, state_id: u32, is_sep: bool, + precedence: i32, }, Split(u32, u32), - Accept(usize), + Accept { + variable_index: usize, + precedence: i32, + }, } #[derive(PartialEq, Eq)] pub struct Nfa { - pub states: Vec + pub states: Vec, } impl Default for Nfa { @@ -78,14 +85,57 @@ impl CharacterSet { } } - pub fn add(self, other: CharacterSet) -> Self { - if let (CharacterSet::Include(mut chars), CharacterSet::Include(other_chars)) = (self, other) { - chars.extend(other_chars); - chars.sort_unstable(); - chars.dedup(); - CharacterSet::Include(chars) + pub fn add(self, other: &CharacterSet) -> Self { + if let CharacterSet::Include(other_chars) = other { + if let CharacterSet::Include(mut chars) = self { + chars.extend(other_chars); + chars.sort_unstable(); + chars.dedup(); + return CharacterSet::Include(chars); + } + } + panic!("Called add with a negated character set"); + } + + pub fn remove_intersection(&mut self, other: &mut CharacterSet) -> CharacterSet { + match self { + CharacterSet::Include(chars) => match other { + CharacterSet::Include(other_chars) => { + CharacterSet::Include(remove_chars(chars, other_chars, true)) + } + CharacterSet::Exclude(other_chars) => { + let mut removed = remove_chars(chars, other_chars, false); + 
add_chars(other_chars, chars); + swap(&mut removed, chars); + CharacterSet::Include(removed) + } + }, + CharacterSet::Exclude(chars) => match other { + CharacterSet::Include(other_chars) => { + let mut removed = remove_chars(other_chars, chars, false); + add_chars(chars, other_chars); + swap(&mut removed, other_chars); + CharacterSet::Include(removed) + } + CharacterSet::Exclude(other_chars) => { + let removed = remove_chars(chars, other_chars, true); + let mut included_characters = Vec::new(); + let mut other_included_characters = Vec::new(); + swap(&mut included_characters, other_chars); + swap(&mut other_included_characters, chars); + *self = CharacterSet::Include(included_characters); + *other = CharacterSet::Include(other_included_characters); + CharacterSet::Exclude(removed) + } + }, + } + } + + pub fn is_empty(&self) -> bool { + if let CharacterSet::Include(c) = self { + c.is_empty() } else { - panic!("Called add with a negated character set"); + false } } @@ -97,6 +147,84 @@ impl CharacterSet { } } +impl Ord for CharacterSet { + fn cmp(&self, other: &CharacterSet) -> Ordering { + match self { + CharacterSet::Include(chars) => { + if let CharacterSet::Include(other_chars) = other { + compare_chars(chars, other_chars) + } else { + Ordering::Less + } + } + CharacterSet::Exclude(chars) => { + if let CharacterSet::Exclude(other_chars) = other { + compare_chars(chars, other_chars) + } else { + Ordering::Greater + } + } + } + } +} + +impl PartialOrd for CharacterSet { + fn partial_cmp(&self, other: &CharacterSet) -> Option { + Some(self.cmp(other)) + } +} + +fn add_chars(left: &mut Vec, right: &Vec) { + for c in right { + match left.binary_search(c) { + Err(i) => left.insert(i, *c), + _ => {} + } + } +} + +fn remove_chars(left: &mut Vec, right: &mut Vec, mutate_right: bool) -> Vec { + let mut result = Vec::new(); + right.retain(|right_char| { + if let Some(index) = left.iter().position(|left_char| *left_char == *right_char) { + left.remove(index); + 
result.push(*right_char); + false || !mutate_right + } else { + true + } + }); + result +} + +fn compare_chars(chars: &Vec, other_chars: &Vec) -> Ordering { + if chars.is_empty() { + if other_chars.is_empty() { + Ordering::Equal + } else { + Ordering::Less + } + } else if other_chars.is_empty() { + Ordering::Greater + } else { + let mut other_c = other_chars.iter(); + for c in chars.iter() { + if let Some(other_c) = other_c.next() { + let cmp = c.cmp(other_c); + if cmp != Ordering::Equal { + return cmp; + } + } else { + return Ordering::Greater; + } + } + if other_c.next().is_some() { + return Ordering::Less; + } + Ordering::Equal + } +} + impl Nfa { pub fn new() -> Self { Nfa { states: Vec::new() } @@ -124,17 +252,32 @@ impl fmt::Debug for Nfa { impl<'a> NfaCursor<'a> { pub fn new(nfa: &'a Nfa, mut states: Vec) -> Self { - let mut result = Self { nfa, state_ids: Vec::new(), in_sep: true }; + let mut result = Self { + nfa, + state_ids: Vec::new(), + in_sep: true, + }; result.add_states(&mut states); result } + pub fn reset(&mut self, mut states: Vec) { + self.state_ids.clear(); + self.add_states(&mut states); + } + pub fn advance(&mut self, c: char) -> bool { let mut result = false; let mut new_state_ids = Vec::new(); let mut any_sep_transitions = false; for current_state_id in &self.state_ids { - if let NfaState::Advance { chars, state_id, is_sep } = &self.nfa.states[*current_state_id as usize] { + if let NfaState::Advance { + chars, + state_id, + is_sep, + .. + } = &self.nfa.states[*current_state_id as usize] + { if chars.contains(c) { if *is_sep { any_sep_transitions = true; @@ -152,16 +295,68 @@ impl<'a> NfaCursor<'a> { result } - pub fn finished_id(&self) -> Option { + pub fn successors(&self) -> impl Iterator { + self.state_ids.iter().filter_map(move |id| { + if let NfaState::Advance { + chars, + state_id, + precedence, + .. 
+ } = &self.nfa.states[*id as usize] + { + Some((chars, *precedence, *state_id)) + } else { + None + } + }) + } + + pub fn grouped_successors(&self) -> Vec<(CharacterSet, i32, Vec)> { + Self::group_successors(self.successors()) + } + + fn group_successors<'b>( + iter: impl Iterator, + ) -> Vec<(CharacterSet, i32, Vec)> { + let mut result: Vec<(CharacterSet, i32, Vec)> = Vec::new(); + for (chars, prec, state) in iter { + let mut chars = chars.clone(); + let mut i = 0; + while i < result.len() { + let intersection = result[i].0.remove_intersection(&mut chars); + if !intersection.is_empty() { + let mut states = result[i].2.clone(); + let mut precedence = result[i].1; + states.push(state); + result.insert(i, (intersection, max(precedence, prec), states)); + i += 1; + } + i += 1; + } + if !chars.is_empty() { + result.push((chars, prec, vec![state])); + } + } + result.sort_unstable_by(|a, b| a.0.cmp(&b.0)); + result + } + + pub fn finished_id(&self) -> Option<(usize, i32)> { let mut result = None; for state_id in self.state_ids.iter() { - if let NfaState::Accept(id) = self.nfa.states[*state_id as usize] { + if let NfaState::Accept { + variable_index, + precedence, + } = self.nfa.states[*state_id as usize] + { match result { - None => { - result = Some(id) - }, - Some(existing_id) => if id < existing_id { - result = Some(id) + None => result = Some((variable_index, precedence)), + Some((existing_id, existing_precedence)) => { + if precedence > existing_precedence + || (precedence == existing_precedence && variable_index < existing_id) + { + result = Some((variable_index, precedence)) + } } } } @@ -202,3 +397,136 @@ impl<'a> NfaCursor<'a> { } } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_group_successors() { + let table = [ + ( + vec![ + (CharacterSet::empty().add_range('a', 'f'), 0, 1), + (CharacterSet::empty().add_range('d', 'i'), 1, 2), + ], + vec![ + (CharacterSet::empty().add_range('a', 'c'), 0, vec![1]), + 
(CharacterSet::empty().add_range('d', 'f'), 1, vec![1, 2]), + (CharacterSet::empty().add_range('g', 'i'), 1, vec![2]), + ], + ), + ( + vec![ + (CharacterSet::empty().add_range('a', 'z'), 0, 1), + (CharacterSet::empty().add_char('d'), 0, 2), + (CharacterSet::empty().add_char('i'), 0, 3), + (CharacterSet::empty().add_char('f'), 0, 4), + ], + vec![ + ( + CharacterSet::empty() + .add_range('a', 'c') + .add_char('e') + .add_range('g', 'h') + .add_range('j', 'z'), + 0, + vec![1], + ), + (CharacterSet::empty().add_char('d'), 0, vec![1, 2]), + (CharacterSet::empty().add_char('f'), 0, vec![1, 4]), + (CharacterSet::empty().add_char('i'), 0, vec![1, 3]), + ], + ), + ]; + + for row in table.iter() { + assert_eq!( + NfaCursor::group_successors(row.0.iter().map(|(c, p, s)| (c, *p, *s))), + row.1 + ); + } + + // let successors = NfaCursor::group_successors( + // [ + // (&CharacterSet::empty().add_range('a', 'f'), 1), + // (&CharacterSet::empty().add_range('d', 'i'), 2), + // ] + // .iter() + // .cloned(), + // ); + // + // assert_eq!( + // successors, + // vec![ + // (CharacterSet::empty().add_range('a', 'c'), vec![1],), + // (CharacterSet::empty().add_range('d', 'f'), vec![1, 2],), + // (CharacterSet::empty().add_range('g', 'i'), vec![2],), + // ] + // ); + } + + #[test] + fn test_character_set_intersection() { + // whitelist - whitelist + // both sets contain 'c', 'd', 'e', and 'f' + let mut a = CharacterSet::empty().add_range('a', 'f'); + let mut b = CharacterSet::empty().add_range('c', 'h'); + assert_eq!( + a.remove_intersection(&mut b), + CharacterSet::empty().add_range('c', 'f') + ); + assert_eq!(a, CharacterSet::empty().add_range('a', 'b')); + assert_eq!(b, CharacterSet::empty().add_range('g', 'h')); + + let mut a = CharacterSet::empty().add_range('a', 'f'); + let mut b = CharacterSet::empty().add_range('c', 'h'); + assert_eq!( + b.remove_intersection(&mut a), + CharacterSet::empty().add_range('c', 'f') + ); + assert_eq!(a, CharacterSet::empty().add_range('a', 'b')); + 
assert_eq!(b, CharacterSet::empty().add_range('g', 'h')); + + // whitelist - blacklist + // both sets contain 'e', 'f', and 'm' + let mut a = CharacterSet::empty() + .add_range('c', 'h') + .add_range('k', 'm'); + let mut b = CharacterSet::empty() + .add_range('a', 'd') + .add_range('g', 'l') + .negate(); + assert_eq!( + a.remove_intersection(&mut b), + CharacterSet::Include(vec!['e', 'f', 'm']) + ); + assert_eq!(a, CharacterSet::Include(vec!['c', 'd', 'g', 'h', 'k', 'l'])); + assert_eq!(b, CharacterSet::empty().add_range('a', 'm').negate()); + + let mut a = CharacterSet::empty() + .add_range('c', 'h') + .add_range('k', 'm'); + let mut b = CharacterSet::empty() + .add_range('a', 'd') + .add_range('g', 'l') + .negate(); + assert_eq!( + b.remove_intersection(&mut a), + CharacterSet::Include(vec!['e', 'f', 'm']) + ); + assert_eq!(a, CharacterSet::Include(vec!['c', 'd', 'g', 'h', 'k', 'l'])); + assert_eq!(b, CharacterSet::empty().add_range('a', 'm').negate()); + + // blacklist - blacklist + // both sets exclude 'c', 'd', and 'e' + let mut a = CharacterSet::empty().add_range('a', 'e').negate(); + let mut b = CharacterSet::empty().add_range('c', 'h').negate(); + assert_eq!( + a.remove_intersection(&mut b), + CharacterSet::Exclude(vec!['c', 'd', 'e']) + ); + assert_eq!(a, CharacterSet::Include(vec!['f', 'g', 'h'])); + assert_eq!(b, CharacterSet::Include(vec!['a', 'b'])); + } +} diff --git a/src/prepare_grammar/expand_tokens.rs b/src/prepare_grammar/expand_tokens.rs index 5ee9861f..b0d2ae04 100644 --- a/src/prepare_grammar/expand_tokens.rs +++ b/src/prepare_grammar/expand_tokens.rs @@ -7,8 +7,18 @@ use regex_syntax::ast::{ parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, RepetitionKind, RepetitionRange, }; -pub(super) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result { - let mut nfa = Nfa::new(); +struct NfaBuilder { + nfa: Nfa, + is_sep: bool, + precedence_stack: Vec, +} + +pub(crate) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result 
{ + let mut builder = NfaBuilder { + nfa: Nfa::new(), + is_sep: true, + precedence_stack: vec![0], + }; let separator_rule = if grammar.separators.len() > 0 { grammar.separators.push(Rule::Blank); @@ -24,281 +34,325 @@ pub(super) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result false, }; - nfa.states.push(NfaState::Accept(i)); - let last_state_id = nfa.last_state_id(); - expand_rule(&variable.rule, &mut nfa, last_state_id, false).map_err(|e| match e { - Error::RegexError(msg) => Error::RegexError(format!("Rule {} {}", variable.name, msg)), - _ => e, - })?; + builder.is_sep = false; + builder.nfa.states.push(NfaState::Accept { + variable_index: i, + precedence: 0, + }); + let last_state_id = builder.nfa.last_state_id(); + builder + .expand_rule(&variable.rule, last_state_id) + .map_err(|e| match e { + Error::RegexError(msg) => { + Error::RegexError(format!("Rule {} {}", variable.name, msg)) + } + _ => e, + })?; if !is_immediate_token { - let last_state_id = nfa.last_state_id(); - expand_rule(&separator_rule, &mut nfa, last_state_id, true)?; + builder.is_sep = true; + let last_state_id = builder.nfa.last_state_id(); + builder.expand_rule(&separator_rule, last_state_id)?; } variables.push(LexicalVariable { name: variable.name, kind: variable.kind, - start_state: nfa.last_state_id(), + start_state: builder.nfa.last_state_id(), }); } - Ok(LexicalGrammar { nfa, variables }) + Ok(LexicalGrammar { + nfa: builder.nfa, + variables, + }) } -fn expand_rule(rule: &Rule, nfa: &mut Nfa, mut next_state_id: u32, is_sep: bool) -> Result { - match rule { - Rule::Pattern(s) => { - let ast = parse::Parser::new() - .parse(&s) - .map_err(|e| Error::GrammarError(e.to_string()))?; - expand_regex(&ast, nfa, next_state_id, is_sep) - } - Rule::String(s) => { - for c in s.chars().rev() { - nfa.prepend(|last_state_id| NfaState::Advance { - chars: CharacterSet::empty().add_char(c), - state_id: last_state_id, - is_sep, - }); +impl NfaBuilder { + fn expand_rule(&mut self, rule: 
&Rule, mut next_state_id: u32) -> Result { + match rule { + Rule::Pattern(s) => { + let ast = parse::Parser::new() + .parse(&s) + .map_err(|e| Error::GrammarError(e.to_string()))?; + self.expand_regex(&ast, next_state_id) } - Ok(s.len() > 0) - } - Rule::Choice(elements) => { - let mut alternative_state_ids = Vec::new(); - for element in elements { - if expand_rule(element, nfa, next_state_id, is_sep)? { - alternative_state_ids.push(nfa.last_state_id()); - } else { - alternative_state_ids.push(next_state_id); + Rule::String(s) => { + for c in s.chars().rev() { + self.push_advance(CharacterSet::empty().add_char(c), self.nfa.last_state_id()); } + Ok(s.len() > 0) } - alternative_state_ids.retain(|i| *i != nfa.last_state_id()); - for alternative_state_id in alternative_state_ids { - nfa.prepend(|last_state_id| NfaState::Split(last_state_id, alternative_state_id)); - } - Ok(true) - } - Rule::Seq(elements) => { - let mut result = false; - for element in elements.into_iter().rev() { - if expand_rule(element, nfa, next_state_id, is_sep)? { - result = true; + Rule::Choice(elements) => { + let mut alternative_state_ids = Vec::new(); + for element in elements { + if self.expand_rule(element, next_state_id)? { + alternative_state_ids.push(self.nfa.last_state_id()); + } else { + alternative_state_ids.push(next_state_id); + } } - next_state_id = nfa.last_state_id(); - } - Ok(result) - } - Rule::Repeat(rule) => { - nfa.states.push(NfaState::Accept(0)); // Placeholder for split - let split_state_id = nfa.last_state_id(); - if expand_rule(rule, nfa, split_state_id, is_sep)? { - nfa.states[split_state_id as usize] = - NfaState::Split(nfa.last_state_id(), next_state_id); - Ok(true) - } else { - Ok(false) - } - } - Rule::Metadata { rule, .. 
} => { - // TODO - implement precedence - expand_rule(rule, nfa, next_state_id, is_sep) - } - Rule::Blank => Ok(false), - _ => Err(Error::grammar(&format!("Unexpected rule {:?}", rule))), - } -} - -fn expand_one_or_more(ast: &Ast, nfa: &mut Nfa, next_state_id: u32, is_sep: bool) -> Result { - nfa.states.push(NfaState::Accept(0)); // Placeholder for split - let split_state_id = nfa.last_state_id(); - if expand_regex(&ast, nfa, split_state_id, is_sep)? { - nfa.states[split_state_id as usize] = NfaState::Split(nfa.last_state_id(), next_state_id); - Ok(true) - } else { - nfa.states.pop(); - Ok(false) - } -} - -fn expand_zero_or_one(ast: &Ast, nfa: &mut Nfa, next_state_id: u32, is_sep: bool) -> Result { - if expand_regex(ast, nfa, next_state_id, is_sep)? { - nfa.prepend(|last_state_id| NfaState::Split(next_state_id, last_state_id)); - Ok(true) - } else { - Ok(false) - } -} - -fn expand_zero_or_more(ast: &Ast, nfa: &mut Nfa, next_state_id: u32, is_sep: bool) -> Result { - if expand_one_or_more(&ast, nfa, next_state_id, is_sep)? { - nfa.prepend(|last_state_id| NfaState::Split(last_state_id, next_state_id)); - Ok(true) - } else { - Ok(false) - } -} - -fn expand_count( - ast: &Ast, - count: u32, - nfa: &mut Nfa, - mut next_state_id: u32, - is_sep: bool, -) -> Result { - let mut result = false; - for _ in 0..count { - if expand_regex(ast, nfa, next_state_id, is_sep)? 
{ - result = true; - next_state_id = nfa.last_state_id(); - } - } - Ok(result) -} - -fn expand_regex(ast: &Ast, nfa: &mut Nfa, mut next_state_id: u32, is_sep: bool) -> Result { - match ast { - Ast::Empty(_) => Ok(false), - Ast::Flags(_) => Err(Error::regex("Flags are not supported")), - Ast::Literal(literal) => { - nfa.states.push(NfaState::Advance { - chars: CharacterSet::Include(vec![literal.c]), - state_id: next_state_id, - is_sep, - }); - Ok(true) - } - Ast::Dot(_) => { - nfa.states.push(NfaState::Advance { - chars: CharacterSet::Exclude(vec!['\n']), - state_id: next_state_id, - is_sep, - }); - Ok(true) - } - Ast::Assertion(_) => Err(Error::regex("Assertions are not supported")), - Ast::Class(class) => match class { - Class::Unicode(_) => Err(Error::regex("Unicode character classes are not supported")), - Class::Perl(class) => { - nfa.states.push(NfaState::Advance { - chars: expand_perl_character_class(&class.kind), - state_id: next_state_id, - is_sep, - }); - Ok(true) - } - Class::Bracketed(class) => match &class.kind { - ClassSet::Item(item) => { - let character_set = expand_character_class(&item)?; - nfa.states.push(NfaState::Advance { - chars: character_set, - state_id: next_state_id, - is_sep, + alternative_state_ids.retain(|i| *i != self.nfa.last_state_id()); + for alternative_state_id in alternative_state_ids { + self.nfa.prepend(|last_state_id| { + NfaState::Split(last_state_id, alternative_state_id) }); - Ok(true) } - ClassSet::BinaryOp(_) => Err(Error::regex( - "Binary operators in character classes aren't supported", - )), - }, - }, - Ast::Repetition(repetition) => match repetition.op.kind { - RepetitionKind::ZeroOrOne => { - expand_zero_or_one(&repetition.ast, nfa, next_state_id, is_sep) + Ok(true) } - RepetitionKind::OneOrMore => { - expand_one_or_more(&repetition.ast, nfa, next_state_id, is_sep) + Rule::Seq(elements) => { + let mut result = false; + for element in elements.into_iter().rev() { + if self.expand_rule(element, next_state_id)? 
{ + result = true; + } + next_state_id = self.nfa.last_state_id(); + } + Ok(result) } - RepetitionKind::ZeroOrMore => { - expand_zero_or_more(&repetition.ast, nfa, next_state_id, is_sep) - } - RepetitionKind::Range(RepetitionRange::Exactly(count)) => { - expand_count(&repetition.ast, count, nfa, next_state_id, is_sep) - } - RepetitionKind::Range(RepetitionRange::AtLeast(min)) => { - if expand_zero_or_more(&repetition.ast, nfa, next_state_id, is_sep)? { - expand_count(&repetition.ast, min, nfa, next_state_id, is_sep) + Rule::Repeat(rule) => { + self.nfa.states.push(NfaState::Accept { + variable_index: 0, + precedence: 0, + }); // Placeholder for split + let split_state_id = self.nfa.last_state_id(); + if self.expand_rule(rule, split_state_id)? { + self.nfa.states[split_state_id as usize] = + NfaState::Split(self.nfa.last_state_id(), next_state_id); + Ok(true) } else { Ok(false) } } - RepetitionKind::Range(RepetitionRange::Bounded(min, max)) => { - let mut result = expand_count(&repetition.ast, min, nfa, next_state_id, is_sep)?; - for _ in min..max { - if result { - next_state_id = nfa.last_state_id(); + Rule::Metadata { rule, params } => { + if let Some(precedence) = params.precedence { + self.precedence_stack.push(precedence); + } + let result = self.expand_rule(rule, next_state_id); + if params.precedence.is_some() { + self.precedence_stack.pop(); + } + result + } + Rule::Blank => Ok(false), + _ => Err(Error::grammar(&format!("Unexpected rule {:?}", rule))), + } + } + + fn expand_regex(&mut self, ast: &Ast, mut next_state_id: u32) -> Result { + match ast { + Ast::Empty(_) => Ok(false), + Ast::Flags(_) => Err(Error::regex("Flags are not supported")), + Ast::Literal(literal) => { + self.push_advance(CharacterSet::Include(vec![literal.c]), next_state_id); + Ok(true) + } + Ast::Dot(_) => { + self.push_advance(CharacterSet::Exclude(vec!['\n']), next_state_id); + Ok(true) + } + Ast::Assertion(_) => Err(Error::regex("Assertions are not supported")), + Ast::Class(class) 
=> match class { + Class::Unicode(_) => { + Err(Error::regex("Unicode character classes are not supported")) + } + Class::Perl(class) => { + self.push_advance(self.expand_perl_character_class(&class.kind), next_state_id); + Ok(true) + } + Class::Bracketed(class) => match &class.kind { + ClassSet::Item(item) => { + self.push_advance(self.expand_character_class(&item)?, next_state_id); + Ok(true) } - if expand_zero_or_one(&repetition.ast, nfa, next_state_id, is_sep)? { + ClassSet::BinaryOp(_) => Err(Error::regex( + "Binary operators in character classes aren't supported", + )), + }, + }, + Ast::Repetition(repetition) => match repetition.op.kind { + RepetitionKind::ZeroOrOne => { + self.expand_zero_or_one(&repetition.ast, next_state_id) + } + RepetitionKind::OneOrMore => { + self.expand_one_or_more(&repetition.ast, next_state_id) + } + RepetitionKind::ZeroOrMore => { + self.expand_zero_or_more(&repetition.ast, next_state_id) + } + RepetitionKind::Range(RepetitionRange::Exactly(count)) => { + self.expand_count(&repetition.ast, count, next_state_id) + } + RepetitionKind::Range(RepetitionRange::AtLeast(min)) => { + if self.expand_zero_or_more(&repetition.ast, next_state_id)? { + self.expand_count(&repetition.ast, min, next_state_id) + } else { + Ok(false) + } + } + RepetitionKind::Range(RepetitionRange::Bounded(min, max)) => { + let mut result = self.expand_count(&repetition.ast, min, next_state_id)?; + for _ in min..max { + if result { + next_state_id = self.nfa.last_state_id(); + } + if self.expand_zero_or_one(&repetition.ast, next_state_id)? { + result = true; + } + } + Ok(result) + } + }, + Ast::Group(group) => self.expand_regex(&group.ast, self.nfa.last_state_id()), + Ast::Alternation(alternation) => { + let mut alternative_state_ids = Vec::new(); + for ast in alternation.asts.iter() { + if self.expand_regex(&ast, next_state_id)? 
{ + alternative_state_ids.push(self.nfa.last_state_id()); + } else { + alternative_state_ids.push(next_state_id); + } + } + alternative_state_ids.sort_unstable(); + alternative_state_ids.dedup(); + alternative_state_ids.retain(|i| *i != self.nfa.last_state_id()); + + for alternative_state_id in alternative_state_ids { + self.nfa.prepend(|last_state_id| { + NfaState::Split(last_state_id, alternative_state_id) + }); + } + Ok(true) + } + Ast::Concat(concat) => { + let mut result = false; + for ast in concat.asts.iter().rev() { + if self.expand_regex(&ast, next_state_id)? { result = true; + next_state_id = self.nfa.last_state_id(); } } Ok(result) } - }, - Ast::Group(group) => expand_regex(&group.ast, nfa, nfa.last_state_id(), is_sep), - Ast::Alternation(alternation) => { - let mut alternative_state_ids = Vec::new(); - for ast in alternation.asts.iter() { - if expand_regex(&ast, nfa, next_state_id, is_sep)? { - alternative_state_ids.push(nfa.last_state_id()); - } else { - alternative_state_ids.push(next_state_id); - } - } - alternative_state_ids.retain(|i| *i != nfa.last_state_id()); - for alternative_state_id in alternative_state_ids { - nfa.prepend(|last_state_id| NfaState::Split(last_state_id, alternative_state_id)); - } + } + } + + fn expand_one_or_more(&mut self, ast: &Ast, next_state_id: u32) -> Result { + self.nfa.states.push(NfaState::Accept { + variable_index: 0, + precedence: 0, + }); // Placeholder for split + let split_state_id = self.nfa.last_state_id(); + if self.expand_regex(&ast, split_state_id)? { + self.nfa.states[split_state_id as usize] = + NfaState::Split(self.nfa.last_state_id(), next_state_id); Ok(true) + } else { + self.nfa.states.pop(); + Ok(false) } - Ast::Concat(concat) => { - let mut result = false; - for ast in concat.asts.iter().rev() { - if expand_regex(&ast, nfa, next_state_id, is_sep)? 
{ - result = true; - next_state_id = nfa.last_state_id(); + } + + fn expand_zero_or_one(&mut self, ast: &Ast, next_state_id: u32) -> Result { + if self.expand_regex(ast, next_state_id)? { + self.nfa + .prepend(|last_state_id| NfaState::Split(next_state_id, last_state_id)); + Ok(true) + } else { + Ok(false) + } + } + + fn expand_zero_or_more(&mut self, ast: &Ast, next_state_id: u32) -> Result { + if self.expand_one_or_more(&ast, next_state_id)? { + self.nfa + .prepend(|last_state_id| NfaState::Split(last_state_id, next_state_id)); + Ok(true) + } else { + Ok(false) + } + } + + fn expand_count(&mut self, ast: &Ast, count: u32, mut next_state_id: u32) -> Result { + let mut result = false; + for _ in 0..count { + if self.expand_regex(ast, next_state_id)? { + result = true; + next_state_id = self.nfa.last_state_id(); + } + } + Ok(result) + } + + fn expand_character_class(&self, item: &ClassSetItem) -> Result { + match item { + ClassSetItem::Empty(_) => Ok(CharacterSet::Include(Vec::new())), + ClassSetItem::Literal(literal) => Ok(CharacterSet::Include(vec![literal.c])), + ClassSetItem::Range(range) => { + Ok(CharacterSet::empty().add_range(range.start.c, range.end.c)) + } + ClassSetItem::Union(union) => { + let mut result = CharacterSet::empty(); + for item in &union.items { + result = result.add(&self.expand_character_class(&item)?); } + Ok(result) } - Ok(result) + ClassSetItem::Perl(class) => Ok(self.expand_perl_character_class(&class.kind)), + _ => Err(Error::regex(&format!( + "Unsupported character class syntax {:?}", + item + ))), } } -} -fn expand_character_class(item: &ClassSetItem) -> Result { - match item { - ClassSetItem::Empty(_) => Ok(CharacterSet::Include(Vec::new())), - ClassSetItem::Literal(literal) => Ok(CharacterSet::Include(vec![literal.c])), - ClassSetItem::Range(range) => { - Ok(CharacterSet::empty().add_range(range.start.c, range.end.c)) + fn expand_perl_character_class(&self, item: &ClassPerlKind) -> CharacterSet { + match item { + 
ClassPerlKind::Digit => CharacterSet::empty().add_range('0', '9'), + ClassPerlKind::Space => CharacterSet::empty() + .add_char(' ') + .add_char('\t') + .add_char('\r') + .add_char('\n'), + ClassPerlKind::Word => CharacterSet::empty() + .add_char('_') + .add_range('A', 'Z') + .add_range('a', 'z') + .add_range('0', '9'), } - ClassSetItem::Union(union) => { - let mut result = CharacterSet::empty(); - for item in &union.items { - result = result.add(expand_character_class(&item)?); - } - Ok(result) - } - ClassSetItem::Perl(class) => Ok(expand_perl_character_class(&class.kind)), - _ => Err(Error::regex(&format!( - "Unsupported character class syntax {:?}", - item - ))), } -} -fn expand_perl_character_class(item: &ClassPerlKind) -> CharacterSet { - match item { - ClassPerlKind::Digit => CharacterSet::empty().add_range('0', '9'), - ClassPerlKind::Space => CharacterSet::empty() - .add_char(' ') - .add_char('\t') - .add_char('\r') - .add_char('\n'), - ClassPerlKind::Word => CharacterSet::empty() - .add_char('_') - .add_range('A', 'Z') - .add_range('a', 'z') - .add_range('0', '9'), + fn push_advance(&mut self, chars: CharacterSet, state_id: u32) { + let precedence = *self.precedence_stack.last().unwrap(); + self.add_precedence(precedence, vec![state_id]); + self.nfa.states.push(NfaState::Advance { + chars, + state_id, + precedence, + is_sep: self.is_sep, + }); + } + + fn add_precedence(&mut self, prec: i32, mut state_ids: Vec) { + let mut i = 0; + while i < state_ids.len() { + let state_id = state_ids[i]; + let (left, right) = match &mut self.nfa.states[state_id as usize] { + NfaState::Accept {precedence, ..} => { + *precedence = prec; + return; + }, + NfaState::Split(left, right) => (*left, *right), + _ => return + }; + if !state_ids.contains(&left) { + state_ids.push(left); + } + if !state_ids.contains(&right) { + state_ids.push(right); + } + i += 1; + } } } @@ -313,11 +367,15 @@ mod tests { let mut cursor = NfaCursor::new(&grammar.nfa, start_states); let mut result = 
None; + let mut result_precedence = 0; let mut start_char = 0; let mut end_char = 0; for c in s.chars() { - if let Some(id) = cursor.finished_id() { - result = Some((id, &s[start_char..end_char])); + if let Some((id, finished_precedence)) = cursor.finished_id() { + if result.is_none() || result_precedence <= finished_precedence { + result = Some((id, &s[start_char..end_char])); + result_precedence = finished_precedence; + } } if cursor.advance(c) { end_char += 1; @@ -329,8 +387,11 @@ mod tests { } } - if let Some(id) = cursor.finished_id() { - result = Some((id, &s[start_char..end_char])); + if let Some((id, finished_precedence)) = cursor.finished_id() { + if result.is_none() || result_precedence <= finished_precedence { + result = Some((id, &s[start_char..end_char])); + result_precedence = finished_precedence; + } } result @@ -443,6 +504,20 @@ mod tests { (" \\\na", Some((0, "a"))), ], }, + // shorter tokens with higher precedence + Row { + rules: vec![ + Rule::prec(2, Rule::pattern("abc")), + Rule::prec(1, Rule::pattern("ab[cd]e")), + Rule::pattern("[a-e]+"), + ], + separators: vec![Rule::string("\\\n"), Rule::pattern("\\s")], + examples: vec![ + ("abceef", Some((0, "abc"))), + ("abdeef", Some((1, "abde"))), + ("aeeeef", Some((2, "aeeee"))), + ], + }, ]; for Row { diff --git a/src/prepare_grammar/mod.rs b/src/prepare_grammar/mod.rs index f325383b..b0c1d2a3 100644 --- a/src/prepare_grammar/mod.rs +++ b/src/prepare_grammar/mod.rs @@ -7,7 +7,7 @@ mod intern_symbols; mod process_inlines; use self::expand_repeats::expand_repeats; -use self::expand_tokens::expand_tokens; +pub(crate) use self::expand_tokens::expand_tokens; use self::extract_simple_aliases::extract_simple_aliases; use self::extract_tokens::extract_tokens; use self::flatten_grammar::flatten_grammar; @@ -19,7 +19,7 @@ use crate::grammars::{ }; use crate::rules::{AliasMap, Rule, Symbol}; -pub(self) struct IntermediateGrammar { +pub(crate) struct IntermediateGrammar { variables: Vec, extra_tokens: Vec, 
expected_conflicts: Vec>, @@ -28,14 +28,14 @@ pub(self) struct IntermediateGrammar { word_token: Option, } -pub(self) type InternedGrammar = IntermediateGrammar; +pub(crate) type InternedGrammar = IntermediateGrammar; -pub(self) type ExtractedSyntaxGrammar = IntermediateGrammar; +pub(crate) type ExtractedSyntaxGrammar = IntermediateGrammar; #[derive(Debug, PartialEq, Eq)] -pub(self) struct ExtractedLexicalGrammar { - variables: Vec, - separators: Vec, +pub(crate) struct ExtractedLexicalGrammar { + pub variables: Vec, + pub separators: Vec, } pub(crate) fn prepare_grammar(