From ab78ab3f9b82cb405ca279031f511903175c6015 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 28 Jan 2021 14:58:29 -0800 Subject: [PATCH] Represent CharacterSet internally as a vector of ranges --- cli/src/generate/build_tables/mod.rs | 6 +- cli/src/generate/nfa.rs | 718 +++++++++--------- .../generate/prepare_grammar/expand_tokens.rs | 14 +- cli/src/generate/render.rs | 30 +- 4 files changed, 403 insertions(+), 365 deletions(-) diff --git a/cli/src/generate/build_tables/mod.rs b/cli/src/generate/build_tables/mod.rs index 2e5d2f57..fba0fef3 100644 --- a/cli/src/generate/build_tables/mod.rs +++ b/cli/src/generate/build_tables/mod.rs @@ -13,7 +13,7 @@ use self::minimize_parse_table::minimize_parse_table; use self::token_conflicts::TokenConflictMap; use crate::error::Result; use crate::generate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar}; -use crate::generate::nfa::{CharacterSet, NfaCursor}; +use crate::generate::nfa::NfaCursor; use crate::generate::node_types::VariableInfo; use crate::generate::rules::{AliasMap, Symbol, SymbolType, TokenSet}; use crate::generate::tables::{LexTable, ParseAction, ParseTable, ParseTableEntry}; @@ -472,10 +472,8 @@ fn all_chars_are_alphabetical(cursor: &NfaCursor) -> bool { cursor.transition_chars().all(|(chars, is_sep)| { if is_sep { true - } else if let CharacterSet::Include(chars) = chars { - chars.iter().all(|c| c.is_alphabetic() || *c == '_') } else { - false + chars.chars().all(|c| c.is_alphabetic() || c == '_') } }) } diff --git a/cli/src/generate/nfa.rs b/cli/src/generate/nfa.rs index 4cbfaaa3..99f595d0 100644 --- a/cli/src/generate/nfa.rs +++ b/cli/src/generate/nfa.rs @@ -6,10 +6,9 @@ use std::fmt; use std::mem::swap; use std::ops::Range; -#[derive(Clone, Debug, PartialEq, Eq, Hash)] -pub enum CharacterSet { - Include(Vec), - Exclude(Vec), +#[derive(Clone, PartialEq, Eq, Hash)] +pub struct CharacterSet { + ranges: Vec>, } #[derive(Debug, PartialEq, Eq)] @@ -52,142 +51,233 @@ impl Default for Nfa { } } +const END: u32 = char::MAX as u32 + 1; + impl CharacterSet { pub fn empty() -> Self { - CharacterSet::Include(Vec::new()) + CharacterSet { ranges: Vec::new() } } - pub fn negate(self) -> CharacterSet { - match self { - CharacterSet::Include(chars) => CharacterSet::Exclude(chars), - CharacterSet::Exclude(chars) => CharacterSet::Include(chars), + pub fn from_range(mut first: char, mut last: char) -> Self { + if first > last { + swap(&mut first, &mut last); + } + CharacterSet { + ranges: vec![(first as u32)..(last as u32 + 1)], } } - pub fn add_char(self, c: char) -> Self { - if let CharacterSet::Include(mut chars) = self { - if let Err(i) = chars.binary_search(&c) { - chars.insert(i, c); + pub fn from_char(c: char) -> Self { + CharacterSet { + ranges: vec![(c as u32)..(c as u32 + 1)], + } + } + + pub fn negate(mut self) -> CharacterSet { + let mut i = 0; + let mut previous_end = 0; + while i < self.ranges.len() { + let range = &mut self.ranges[i]; + let start = previous_end; + previous_end = range.end; + if start < range.start { + self.ranges[i] = start..range.start; + i += 1; + } else { + self.ranges.remove(i); } - CharacterSet::Include(chars) - } else { - panic!("Called add with a negated character set"); } + if previous_end < END { + self.ranges.push(previous_end..END); + } + self } - pub fn add_range(self, start: char, end: char) -> Self { - if let CharacterSet::Include(mut chars) = self { - let mut c = start as u32; - while c <= end as u32 { - chars.push(char::from_u32(c).unwrap()); - c += 1; + pub fn add_char(mut self, c: char) -> Self { + self.add_int_range(0, c as u32, c as u32 + 1); + self + } + + pub fn add_range(mut self, start: char, end: char) -> Self { + self.add_int_range(0, start as u32, end as u32 + 1); + self + } + + pub fn add(mut self, other: &CharacterSet) -> Self { + let mut index = 0; + for range in &other.ranges { + index = self.add_int_range(index, range.start as u32, range.end as u32); + } + self + } + + fn add_int_range(&mut self, mut i: usize, start: u32, end: u32) -> usize { + while i < self.ranges.len() { + let range = &mut self.ranges[i]; + if range.start > end { + self.ranges.insert(i, start..end); + return i; } - chars.sort_unstable(); - chars.dedup(); - CharacterSet::Include(chars) - } else { - panic!("Called add with a negated character set"); - } - } - - pub fn add(self, other: &CharacterSet) -> Self { - match self { - CharacterSet::Include(mut chars) => match other { - CharacterSet::Include(other_chars) => { - chars.extend(other_chars); - chars.sort_unstable(); - chars.dedup(); - CharacterSet::Include(chars) - } - CharacterSet::Exclude(other_chars) => { - let excluded_chars = other_chars - .iter() - .cloned() - .filter(|c| !chars.contains(&c)) - .collect(); - CharacterSet::Exclude(excluded_chars) - } - }, - CharacterSet::Exclude(mut chars) => match other { - CharacterSet::Include(other_chars) => { - chars.retain(|c| !other_chars.contains(&c)); - CharacterSet::Exclude(chars) - } - CharacterSet::Exclude(other_chars) => { - chars.retain(|c| other_chars.contains(&c)); - CharacterSet::Exclude(chars) - } - }, + if range.end >= start { + range.end = range.end.max(end); + range.start = range.start.min(start); + return i; + } + i += 1; } + self.ranges.push(start..end); + i } pub fn does_intersect(&self, other: &CharacterSet) -> bool { - match self { - CharacterSet::Include(chars) => match other { - CharacterSet::Include(other_chars) => compare_chars(chars, other_chars).common, - CharacterSet::Exclude(other_chars) => compare_chars(chars, other_chars).left_only, - }, - CharacterSet::Exclude(chars) => match other { - CharacterSet::Include(other_chars) => compare_chars(chars, other_chars).right_only, - CharacterSet::Exclude(_) => true, - }, + let mut left_ranges = self.ranges.iter(); + let mut right_ranges = other.ranges.iter(); + let mut left_range = left_ranges.next(); + let mut right_range = right_ranges.next(); + while let (Some(left), Some(right)) = (&left_range, &right_range) { + if left.end <= right.start { + left_range = left_ranges.next(); + } else if left.start >= right.end { + right_range = right_ranges.next(); + } else { + return true; + } } + false } pub fn remove_intersection(&mut self, other: &mut CharacterSet) -> CharacterSet { - match self { - CharacterSet::Include(chars) => match other { - CharacterSet::Include(other_chars) => { - CharacterSet::Include(remove_chars(chars, other_chars, true)) + let mut intersection = Vec::new(); + let mut left_i = 0; + let mut right_i = 0; + while left_i < self.ranges.len() && right_i < other.ranges.len() { + let left = &mut self.ranges[left_i]; + let right = &mut other.ranges[right_i]; + + match left.start.cmp(&right.start) { + Ordering::Less => { + // [ L ] + // [ R ] + if left.end <= right.start { + left_i += 1; + continue; + } + + match left.end.cmp(&right.end) { + // [ L ] + // [ R ] + Ordering::Less => { + intersection.push(right.start..left.end); + swap(&mut left.end, &mut right.start); + left_i += 1; + } + + // [ L ] + // [ R ] + Ordering::Equal => { + intersection.push(right.clone()); + left.end = right.start; + other.ranges.remove(right_i); + } + + // [ L ] + // [ R ] + Ordering::Greater => { + intersection.push(right.clone()); + let new_range = left.start..right.start; + left.start = right.end; + self.ranges.insert(left_i, new_range); + other.ranges.remove(right_i); + left_i += 1; + } + } } - CharacterSet::Exclude(other_chars) => { - let mut removed = remove_chars(chars, other_chars, false); - add_chars(other_chars, chars); - swap(&mut removed, chars); - CharacterSet::Include(removed) + Ordering::Equal => { + // [ L ] + // [ R ] + if left.end < right.end { + intersection.push(left.start..left.end); + right.start = left.end; + self.ranges.remove(left_i); + } + // [ L ] + // [ R ] + else if left.end == right.end { + intersection.push(left.clone()); + self.ranges.remove(left_i); + other.ranges.remove(right_i); + } + // [ L ] + // [ R ] + else if left.end > right.end { + intersection.push(right.clone()); + left.start = right.end; + other.ranges.remove(right_i); + } } - }, - CharacterSet::Exclude(chars) => match other { - CharacterSet::Include(other_chars) => { - let mut removed = remove_chars(other_chars, chars, false); - add_chars(chars, other_chars); - swap(&mut removed, other_chars); - CharacterSet::Include(removed) + Ordering::Greater => { + // [ L ] + // [ R ] + if left.start >= right.end { + right_i += 1; + continue; + } + + match left.end.cmp(&right.end) { + // [ L ] + // [ R ] + Ordering::Less => { + intersection.push(left.clone()); + let new_range = right.start..left.start; + right.start = left.end; + other.ranges.insert(right_i, new_range); + self.ranges.remove(left_i); + right_i += 1; + } + + // [ L ] + // [ R ] + Ordering::Equal => { + intersection.push(left.clone()); + right.end = left.start; + self.ranges.remove(left_i); + } + + // [ L ] + // [ R ] + Ordering::Greater => { + intersection.push(left.start..right.end); + swap(&mut left.start, &mut right.end); + right_i += 1; + } + } } - CharacterSet::Exclude(other_chars) => { - let mut result_exclusion = chars.clone(); - result_exclusion.extend(other_chars.iter().cloned()); - result_exclusion.sort_unstable(); - result_exclusion.dedup(); - remove_chars(chars, other_chars, true); - let mut included_characters = Vec::new(); - let mut other_included_characters = Vec::new(); - swap(&mut included_characters, other_chars); - swap(&mut other_included_characters, chars); - *self = CharacterSet::Include(included_characters); - *other = CharacterSet::Include(other_included_characters); - CharacterSet::Exclude(result_exclusion) - } - }, + } } + CharacterSet { + ranges: intersection, + } + } + + pub fn iter<'a>(&'a self) -> impl Iterator + 'a { + self.ranges.iter().flat_map(|r| r.clone()) + } + + pub fn chars<'a>(&'a self) -> impl Iterator + 'a { + self.iter().filter_map(char::from_u32) } pub fn is_empty(&self) -> bool { - if let CharacterSet::Include(c) = self { - c.is_empty() - } else { - false - } + self.ranges.is_empty() } - pub fn ranges<'a>( - chars: &'a Vec, + pub fn simplify_ignoring<'a>( + &'a self, ruled_out_characters: &'a HashSet, - ) -> impl Iterator> + 'a { + ) -> Vec> { let mut prev_range: Option> = None; - chars - .iter() - .map(|c| (*c, false)) + self.chars() + .map(|c| (c, false)) .chain(Some(('\0', true))) .filter_map(move |(c, done)| { if done { @@ -212,35 +302,40 @@ impl CharacterSet { None } }) + .collect() } - #[cfg(test)] pub fn contains(&self, c: char) -> bool { - match self { - CharacterSet::Include(chars) => chars.contains(&c), - CharacterSet::Exclude(chars) => !chars.contains(&c), - } + self.ranges.iter().any(|r| r.contains(&(c as u32))) } } impl Ord for CharacterSet { fn cmp(&self, other: &CharacterSet) -> Ordering { - match self { - CharacterSet::Include(chars) => { - if let CharacterSet::Include(other_chars) = other { - order_chars(chars, other_chars) - } else { - Ordering::Less - } + let count_cmp = self + .ranges + .iter() + .map(|r| r.len()) + .sum::() + .cmp(&other.ranges.iter().map(|r| r.len()).sum()); + if count_cmp != Ordering::Equal { + return count_cmp; + } + + for (left_range, right_range) in self.ranges.iter().zip(other.ranges.iter()) { + let cmp = left_range.len().cmp(&right_range.len()); + if cmp != Ordering::Equal { + return cmp; } - CharacterSet::Exclude(chars) => { - if let CharacterSet::Exclude(other_chars) = other { - order_chars(chars, other_chars) - } else { - Ordering::Greater + + for (left, right) in left_range.clone().zip(right_range.clone()) { + let cmp = left.cmp(&right); + if cmp != Ordering::Equal { + return cmp; } } } + return Ordering::Equal; } } @@ -250,89 +345,22 @@ impl PartialOrd for CharacterSet { } } -fn add_chars(left: &mut Vec, right: &Vec) { - for c in right { - match left.binary_search(c) { - Err(i) => left.insert(i, *c), - _ => {} +impl fmt::Debug for CharacterSet { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "CharacterSet [")?; + let mut set = self.clone(); + if self.contains(char::MAX) { + write!(f, "^ ")?; + set = set.negate(); } - } -} - -fn remove_chars(left: &mut Vec, right: &mut Vec, mutate_right: bool) -> Vec { - let mut result = Vec::new(); - right.retain(|right_char| { - if let Some(index) = left.iter().position(|left_char| *left_char == *right_char) { - left.remove(index); - result.push(*right_char); - false || !mutate_right - } else { - true - } - }); - result -} - -struct SetComparision { - left_only: bool, - common: bool, - right_only: bool, -} - -fn compare_chars(left: &Vec, right: &Vec) -> SetComparision { - let mut result = SetComparision { - left_only: false, - common: false, - right_only: false, - }; - let mut left = left.iter().cloned(); - let mut right = right.iter().cloned(); - let mut i = left.next(); - let mut j = right.next(); - while let (Some(left_char), Some(right_char)) = (i, j) { - if left_char < right_char { - i = left.next(); - result.left_only = true; - } else if left_char > right_char { - j = right.next(); - result.right_only = true; - } else { - i = left.next(); - j = right.next(); - result.common = true; - } - } - - match (i, j) { - (Some(_), _) => result.left_only = true, - (_, Some(_)) => result.right_only = true, - _ => {} - } - - result -} - -fn order_chars(chars: &Vec, other_chars: &Vec) -> Ordering { - if chars.is_empty() { - if other_chars.is_empty() { - Ordering::Equal - } else { - Ordering::Less - } - } else if other_chars.is_empty() { - Ordering::Greater - } else { - let cmp = chars.len().cmp(&other_chars.len()); - if cmp != Ordering::Equal { - return cmp; - } - for (c, other_c) in chars.iter().zip(other_chars.iter()) { - let cmp = c.cmp(other_c); - if cmp != Ordering::Equal { - return cmp; + for (i, c) in set.chars().enumerate() { + if i > 0 { + write!(f, ", ")?; } + write!(f, "{:?}", c)?; } - Ordering::Equal + write!(f, "]")?; + Ok(()) } } @@ -624,48 +652,46 @@ mod tests { // multiple negated character classes ( vec![ - (CharacterSet::Include(vec!['a']), false, 0, 1), - (CharacterSet::Exclude(vec!['a', 'b', 'c']), false, 0, 2), - (CharacterSet::Include(vec!['g']), false, 0, 6), - (CharacterSet::Exclude(vec!['d', 'e', 'f']), false, 0, 3), - (CharacterSet::Exclude(vec!['g', 'h', 'i']), false, 0, 4), - (CharacterSet::Include(vec!['g']), false, 0, 5), + (CharacterSet::from_char('a'), false, 0, 1), + (CharacterSet::from_range('a', 'c').negate(), false, 0, 2), + (CharacterSet::from_char('g'), false, 0, 6), + (CharacterSet::from_range('d', 'f').negate(), false, 0, 3), + (CharacterSet::from_range('g', 'i').negate(), false, 0, 4), + (CharacterSet::from_char('g'), false, 0, 5), ], vec![ NfaTransition { - characters: CharacterSet::Include(vec!['a']), + characters: CharacterSet::from_char('a'), precedence: 0, states: vec![1, 3, 4], is_separator: false, }, NfaTransition { - characters: CharacterSet::Include(vec!['g']), + characters: CharacterSet::from_char('g'), precedence: 0, states: vec![2, 3, 5, 6], is_separator: false, }, NfaTransition { - characters: CharacterSet::Include(vec!['b', 'c']), + characters: CharacterSet::from_range('b', 'c'), precedence: 0, states: vec![3, 4], is_separator: false, }, NfaTransition { - characters: CharacterSet::Include(vec!['h', 'i']), + characters: CharacterSet::from_range('h', 'i'), precedence: 0, states: vec![2, 3], is_separator: false, }, NfaTransition { - characters: CharacterSet::Include(vec!['d', 'e', 'f']), + characters: CharacterSet::from_range('d', 'f'), precedence: 0, states: vec![2, 4], is_separator: false, }, NfaTransition { - characters: CharacterSet::Exclude(vec![ - 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', - ]), + characters: CharacterSet::from_range('a', 'i').negate(), precedence: 0, states: vec![2, 3, 4], is_separator: false, @@ -675,21 +701,21 @@ mod tests { // disjoint characters with same state ( vec![ - (CharacterSet::Include(vec!['a']), false, 0, 1), - (CharacterSet::Include(vec!['b']), false, 0, 2), - (CharacterSet::Include(vec!['c']), false, 0, 1), - (CharacterSet::Include(vec!['d']), false, 0, 1), - (CharacterSet::Include(vec!['e']), false, 0, 2), + (CharacterSet::from_char('a'), false, 0, 1), + (CharacterSet::from_char('b'), false, 0, 2), + (CharacterSet::from_char('c'), false, 0, 1), + (CharacterSet::from_char('d'), false, 0, 1), + (CharacterSet::from_char('e'), false, 0, 2), ], vec![ NfaTransition { - characters: CharacterSet::Include(vec!['a', 'c', 'd']), + characters: CharacterSet::empty().add_char('a').add_range('c', 'd'), precedence: 0, states: vec![1], is_separator: false, }, NfaTransition { - characters: CharacterSet::Include(vec!['b', 'e']), + characters: CharacterSet::empty().add_char('b').add_char('e'), precedence: 0, states: vec![2], is_separator: false, @@ -698,119 +724,129 @@ mod tests { ), ]; - for row in table.iter() { + for (i, row) in table.iter().enumerate() { assert_eq!( NfaCursor::group_transitions( row.0 .iter() .map(|(chars, is_sep, prec, state)| (chars, *is_sep, *prec, *state)) ), - row.1 + row.1, + "row {}", + i ); } } #[test] fn test_character_set_remove_intersection() { - // A whitelist and an overlapping whitelist. - // Both sets contain 'c', 'd', and 'f' - let mut a = CharacterSet::empty().add_range('a', 'f'); - let mut b = CharacterSet::empty().add_range('c', 'h'); - assert_eq!( - a.remove_intersection(&mut b), - CharacterSet::empty().add_range('c', 'f') - ); - assert_eq!(a, CharacterSet::empty().add_range('a', 'b')); - assert_eq!(b, CharacterSet::empty().add_range('g', 'h')); + struct Row { + left: CharacterSet, + right: CharacterSet, + left_only: CharacterSet, + right_only: CharacterSet, + intersection: CharacterSet, + } - let mut a = CharacterSet::empty().add_range('a', 'f'); - let mut b = CharacterSet::empty().add_range('c', 'h'); - assert_eq!( - b.remove_intersection(&mut a), - CharacterSet::empty().add_range('c', 'f') - ); - assert_eq!(a, CharacterSet::empty().add_range('a', 'b')); - assert_eq!(b, CharacterSet::empty().add_range('g', 'h')); + let rows = [ + // [ L ] + // [ R ] + Row { + left: CharacterSet::from_range('a', 'f'), + right: CharacterSet::from_range('g', 'm'), + left_only: CharacterSet::from_range('a', 'f'), + right_only: CharacterSet::from_range('g', 'm'), + intersection: CharacterSet::empty(), + }, + // [ L ] + // [ R ] + Row { + left: CharacterSet::from_range('a', 'f'), + right: CharacterSet::from_range('c', 'i'), + left_only: CharacterSet::from_range('a', 'b'), + right_only: CharacterSet::from_range('g', 'i'), + intersection: CharacterSet::from_range('c', 'f'), + }, + // [ L ] + // [ R ] + Row { + left: CharacterSet::from_range('a', 'f'), + right: CharacterSet::from_range('d', 'f'), + left_only: CharacterSet::from_range('a', 'c'), + right_only: CharacterSet::empty(), + intersection: CharacterSet::from_range('d', 'f'), + }, + // [ L ] + // [ R ] + Row { + left: CharacterSet::from_range('a', 'm'), + right: CharacterSet::from_range('d', 'f'), + left_only: CharacterSet::empty() + .add_range('a', 'c') + .add_range('g', 'm'), + right_only: CharacterSet::empty(), + intersection: CharacterSet::from_range('d', 'f'), + }, + // [ L1 ] [ L2 ] + // [ R ] + Row { + left: CharacterSet::empty() + .add_range('a', 'e') + .add_range('h', 'l'), + right: CharacterSet::from_range('c', 'i'), + left_only: CharacterSet::empty() + .add_range('a', 'b') + .add_range('j', 'l'), + right_only: CharacterSet::from_range('f', 'g'), + intersection: CharacterSet::empty() + .add_range('c', 'e') + .add_range('h', 'i'), + }, + ]; - // A whitelist and a larger whitelist. - let mut a = CharacterSet::empty().add_char('c'); - let mut b = CharacterSet::empty().add_range('a', 'e'); - assert_eq!( - a.remove_intersection(&mut b), - CharacterSet::empty().add_char('c') - ); - assert_eq!(a, CharacterSet::empty()); - assert_eq!( - b, - CharacterSet::empty() - .add_range('a', 'b') - .add_range('d', 'e') - ); + for (i, row) in rows.iter().enumerate() { + let mut left = row.left.clone(); + let mut right = row.right.clone(); + assert_eq!( + left.remove_intersection(&mut right), + row.intersection, + "row {}a: {:?} && {:?}", + i, + row.left, + row.right + ); + assert_eq!( + left, row.left_only, + "row {}a: {:?} - {:?}", + i, row.left, row.right + ); + assert_eq!( + right, row.right_only, + "row {}a: {:?} - {:?}", + i, row.right, row.left + ); - let mut a = CharacterSet::empty().add_char('c'); - let mut b = CharacterSet::empty().add_range('a', 'e'); - assert_eq!( - b.remove_intersection(&mut a), - CharacterSet::empty().add_char('c') - ); - assert_eq!(a, CharacterSet::empty()); - assert_eq!( - b, - CharacterSet::empty() - .add_range('a', 'b') - .add_range('d', 'e') - ); - - // An inclusion and an intersecting exclusion. - // Both sets contain 'e', 'f', and 'm' - let mut a = CharacterSet::empty() - .add_range('c', 'h') - .add_range('k', 'm'); - let mut b = CharacterSet::empty() - .add_range('a', 'd') - .add_range('g', 'l') - .negate(); - assert_eq!( - a.remove_intersection(&mut b), - CharacterSet::Include(vec!['e', 'f', 'm']) - ); - assert_eq!(a, CharacterSet::Include(vec!['c', 'd', 'g', 'h', 'k', 'l'])); - assert_eq!(b, CharacterSet::empty().add_range('a', 'm').negate()); - - let mut a = CharacterSet::empty() - .add_range('c', 'h') - .add_range('k', 'm'); - let mut b = CharacterSet::empty() - .add_range('a', 'd') - .add_range('g', 'l') - .negate(); - assert_eq!( - b.remove_intersection(&mut a), - CharacterSet::Include(vec!['e', 'f', 'm']) - ); - assert_eq!(a, CharacterSet::Include(vec!['c', 'd', 'g', 'h', 'k', 'l'])); - assert_eq!(b, CharacterSet::empty().add_range('a', 'm').negate()); - - // An exclusion and an overlapping inclusion. - // Both sets exclude 'c', 'd', and 'e' - let mut a = CharacterSet::empty().add_range('a', 'e').negate(); - let mut b = CharacterSet::empty().add_range('c', 'h').negate(); - assert_eq!( - a.remove_intersection(&mut b), - CharacterSet::empty().add_range('a', 'h').negate(), - ); - assert_eq!(a, CharacterSet::Include(vec!['f', 'g', 'h'])); - assert_eq!(b, CharacterSet::Include(vec!['a', 'b'])); - - // An exclusion and a larger exclusion. - let mut a = CharacterSet::empty().add_range('b', 'c').negate(); - let mut b = CharacterSet::empty().add_range('a', 'd').negate(); - assert_eq!( - a.remove_intersection(&mut b), - CharacterSet::empty().add_range('a', 'd').negate(), - ); - assert_eq!(a, CharacterSet::empty().add_char('a').add_char('d')); - assert_eq!(b, CharacterSet::empty()); + let mut left = row.left.clone(); + let mut right = row.right.clone(); + assert_eq!( + right.remove_intersection(&mut left), + row.intersection, + "row {}b: {:?} && {:?}", + i, + row.left, + row.right + ); + assert_eq!( + left, row.left_only, + "row {}b: {:?} - {:?}", + i, row.left, row.right + ); + assert_eq!( + right, row.right_only, + "row {}b: {:?} - {:?}", + i, row.right, row.left + ); + } } #[test] @@ -834,29 +870,29 @@ mod tests { assert!(!b.does_intersect(&a)); let (a, b) = ( - CharacterSet::Include(vec!['b']), - CharacterSet::Exclude(vec!['a', 'b', 'c']), + CharacterSet::from_char('b'), + CharacterSet::from_range('a', 'c'), + ); + assert!(a.does_intersect(&b)); + assert!(b.does_intersect(&a)); + + let (a, b) = ( + CharacterSet::from_char('b'), + CharacterSet::from_range('a', 'c').negate(), ); assert!(!a.does_intersect(&b)); assert!(!b.does_intersect(&a)); let (a, b) = ( - CharacterSet::Include(vec!['b']), - CharacterSet::Exclude(vec!['a', 'c']), + CharacterSet::from_char('a').negate(), + CharacterSet::from_char('a').negate(), ); assert!(a.does_intersect(&b)); assert!(b.does_intersect(&a)); let (a, b) = ( - CharacterSet::Exclude(vec!['a']), - CharacterSet::Exclude(vec!['a']), - ); - assert!(a.does_intersect(&b)); - assert!(b.does_intersect(&a)); - - let (a, b) = ( - CharacterSet::Include(vec!['c']), - CharacterSet::Exclude(vec!['a']), + CharacterSet::from_char('c'), + CharacterSet::from_char('a').negate(), ); assert!(a.does_intersect(&b)); assert!(b.does_intersect(&a)); @@ -898,7 +934,11 @@ mod tests { .into_iter() .map(|c: &char| *c as u32) .collect(); - let ranges = CharacterSet::ranges(chars, &ruled_out_chars).collect::>(); + let mut set = CharacterSet::empty(); + for c in chars { + set = set.add_char(*c); + } + let ranges = set.simplify_ignoring(&ruled_out_chars); assert_eq!(ranges, *expected_ranges); } } diff --git a/cli/src/generate/prepare_grammar/expand_tokens.rs b/cli/src/generate/prepare_grammar/expand_tokens.rs index 9b594f3c..92c54b71 100644 --- a/cli/src/generate/prepare_grammar/expand_tokens.rs +++ b/cli/src/generate/prepare_grammar/expand_tokens.rs @@ -198,11 +198,11 @@ impl NfaBuilder { Ast::Empty(_) => Ok(false), Ast::Flags(_) => Err(Error::regex("Flags are not supported")), Ast::Literal(literal) => { - self.push_advance(CharacterSet::Include(vec![literal.c]), next_state_id); + self.push_advance(CharacterSet::from_char(literal.c), next_state_id); Ok(true) } Ast::Dot(_) => { - self.push_advance(CharacterSet::Exclude(vec!['\n']), next_state_id); + self.push_advance(CharacterSet::from_char('\n').negate(), next_state_id); Ok(true) } Ast::Assertion(_) => Err(Error::regex("Assertions are not supported")), @@ -344,11 +344,9 @@ impl NfaBuilder { fn expand_character_class(&self, item: &ClassSetItem) -> Result { match item { - ClassSetItem::Empty(_) => Ok(CharacterSet::Include(Vec::new())), - ClassSetItem::Literal(literal) => Ok(CharacterSet::Include(vec![literal.c])), - ClassSetItem::Range(range) => { - Ok(CharacterSet::empty().add_range(range.start.c, range.end.c)) - } + ClassSetItem::Empty(_) => Ok(CharacterSet::empty()), + ClassSetItem::Literal(literal) => Ok(CharacterSet::from_char(literal.c)), + ClassSetItem::Range(range) => Ok(CharacterSet::from_range(range.start.c, range.end.c)), ClassSetItem::Union(union) => { let mut result = CharacterSet::empty(); for item in &union.items { @@ -366,7 +364,7 @@ impl NfaBuilder { fn expand_perl_character_class(&self, item: &ClassPerlKind) -> CharacterSet { match item { - ClassPerlKind::Digit => CharacterSet::empty().add_range('0', '9'), + ClassPerlKind::Digit => CharacterSet::from_range('0', '9'), ClassPerlKind::Space => CharacterSet::empty() .add_char(' ') .add_char('\t') diff --git a/cli/src/generate/render.rs b/cli/src/generate/render.rs index 58d99cc4..362f357c 100644 --- a/cli/src/generate/render.rs +++ b/cli/src/generate/render.rs @@ -1,5 +1,4 @@ use super::grammars::{ExternalToken, LexicalGrammar, SyntaxGrammar, VariableType}; -use super::nfa::CharacterSet; use super::rules::{Alias, AliasMap, Symbol, SymbolType}; use super::tables::{ AdvanceAction, FieldLocation, GotoAction, LexState, LexTable, ParseAction, ParseTable, @@ -659,21 +658,19 @@ impl Generator { .advance_actions .iter() .map(|(chars, action)| { - let (chars, is_included) = match chars { - CharacterSet::Include(c) => (c, true), - CharacterSet::Exclude(c) => (c, false), - }; - let mut call_id = None; - let mut ranges = - CharacterSet::ranges(chars, &ruled_out_chars).collect::>(); + let is_included = !chars.contains(std::char::MAX); + let mut ranges; if is_included { - ruled_out_chars.extend(chars.iter().map(|c| *c as u32)); + ranges = chars.simplify_ignoring(&ruled_out_chars); + ruled_out_chars.extend(chars.iter()); } else { + ranges = chars.clone().negate().simplify_ignoring(&ruled_out_chars); ranges.insert(0, '\0'..'\0') } // Record any large character sets so that they can be extracted // into helper functions, reducing code duplication. + let mut call_id = None; if extract_helper_functions && ranges.len() > LARGE_CHARACTER_RANGE_COUNT { let char_set_symbol = self .symbol_for_advance_action(action, &lex_table) @@ -887,11 +884,16 @@ impl Generator { add!(self, " &&{}lookahead != ", line_break); self.add_character(range.end); } else { - add!(self, "(lookahead < "); - self.add_character(range.start); - add!(self, " || "); - self.add_character(range.end); - add!(self, " < lookahead)"); + if range.start != '\0' { + add!(self, "(lookahead < "); + self.add_character(range.start); + add!(self, " || "); + self.add_character(range.end); + add!(self, " < lookahead)"); + } else { + add!(self, "lookahead > "); + self.add_character(range.end); + } } } did_add = true;