2018-12-08 13:44:11 -08:00
|
|
|
use std::char;
|
2018-12-29 13:56:00 -08:00
|
|
|
use std::cmp::max;
|
|
|
|
|
use std::cmp::Ordering;
|
2020-05-26 13:39:11 -07:00
|
|
|
use std::collections::HashSet;
|
2018-12-29 13:56:00 -08:00
|
|
|
use std::fmt;
|
|
|
|
|
use std::mem::swap;
|
2020-05-26 13:39:11 -07:00
|
|
|
use std::ops::Range;
|
2018-12-08 13:44:11 -08:00
|
|
|
|
2021-02-16 21:37:52 -08:00
|
|
|
/// A set of characters, stored as a vector of code-point ranges.
///
/// Each entry is a half-open `start..end` range of `u32` code points. The
/// mutating methods on this type insert ranges in ascending order and merge
/// ranges that overlap or touch.
#[derive(Clone, PartialEq, Eq, Hash)]
pub struct CharacterSet {
    /// Half-open code-point ranges making up the set.
    ranges: Vec<Range<u32>>,
}
|
|
|
|
|
|
2021-02-16 21:37:52 -08:00
|
|
|
/// A state in an NFA representing a regular grammar.
|
2018-12-08 23:35:48 -08:00
|
|
|
#[derive(Debug, PartialEq, Eq)]
|
2018-12-08 13:44:11 -08:00
|
|
|
pub enum NfaState {
|
2018-12-12 18:04:29 -08:00
|
|
|
Advance {
|
|
|
|
|
chars: CharacterSet,
|
2018-12-12 20:58:26 -08:00
|
|
|
state_id: u32,
|
2018-12-12 18:04:29 -08:00
|
|
|
is_sep: bool,
|
2018-12-29 13:56:00 -08:00
|
|
|
precedence: i32,
|
2018-12-12 18:04:29 -08:00
|
|
|
},
|
2018-12-08 13:44:11 -08:00
|
|
|
Split(u32, u32),
|
2018-12-29 13:56:00 -08:00
|
|
|
Accept {
|
|
|
|
|
variable_index: usize,
|
|
|
|
|
precedence: i32,
|
|
|
|
|
},
|
2018-12-08 13:44:11 -08:00
|
|
|
}
|
|
|
|
|
|
2024-02-04 01:30:33 -05:00
|
|
|
#[derive(PartialEq, Eq, Default)]
|
2018-12-08 13:44:11 -08:00
|
|
|
pub struct Nfa {
|
2018-12-29 13:56:00 -08:00
|
|
|
pub states: Vec<NfaState>,
|
2018-12-08 13:44:11 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[derive(Debug)]
|
|
|
|
|
pub struct NfaCursor<'a> {
|
2018-12-12 20:58:26 -08:00
|
|
|
pub(crate) state_ids: Vec<u32>,
|
2018-12-08 13:44:11 -08:00
|
|
|
nfa: &'a Nfa,
|
|
|
|
|
}
|
|
|
|
|
|
2019-01-04 09:42:06 -08:00
|
|
|
#[derive(Debug, PartialEq, Eq)]
|
|
|
|
|
pub struct NfaTransition {
|
|
|
|
|
pub characters: CharacterSet,
|
|
|
|
|
pub is_separator: bool,
|
|
|
|
|
pub precedence: i32,
|
|
|
|
|
pub states: Vec<u32>,
|
|
|
|
|
}
|
|
|
|
|
|
2021-01-28 14:58:29 -08:00
|
|
|
/// Exclusive upper bound of the code-point space: one past `char::MAX`.
const END: u32 = char::MAX as u32 + 1;
|
|
|
|
|
|
2018-12-08 13:44:11 -08:00
|
|
|
impl CharacterSet {
|
2021-02-16 21:37:52 -08:00
|
|
|
/// Create a character set with a single character.
|
2024-02-04 01:30:33 -05:00
|
|
|
pub const fn empty() -> Self {
|
|
|
|
|
Self { ranges: Vec::new() }
|
2018-12-08 13:44:11 -08:00
|
|
|
}
|
|
|
|
|
|
2021-02-16 21:37:52 -08:00
|
|
|
/// Create a character set with a given *inclusive* range of characters.
|
2024-02-04 01:30:33 -05:00
|
|
|
#[allow(clippy::single_range_in_vec_init)]
|
2021-01-28 14:58:29 -08:00
|
|
|
pub fn from_range(mut first: char, mut last: char) -> Self {
|
|
|
|
|
if first > last {
|
|
|
|
|
swap(&mut first, &mut last);
|
|
|
|
|
}
|
2024-02-04 01:30:33 -05:00
|
|
|
Self {
|
2021-01-28 14:58:29 -08:00
|
|
|
ranges: vec![(first as u32)..(last as u32 + 1)],
|
2018-12-08 13:44:11 -08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-02-16 21:37:52 -08:00
|
|
|
/// Create a character set with a single character.
|
2024-02-04 01:30:33 -05:00
|
|
|
#[allow(clippy::single_range_in_vec_init)]
|
2021-01-28 14:58:29 -08:00
|
|
|
pub fn from_char(c: char) -> Self {
|
2024-02-04 01:30:33 -05:00
|
|
|
Self {
|
2021-01-28 14:58:29 -08:00
|
|
|
ranges: vec![(c as u32)..(c as u32 + 1)],
|
2018-12-08 13:44:11 -08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-02-16 21:37:52 -08:00
|
|
|
/// Create a character set containing all characters *not* present
|
|
|
|
|
/// in this character set.
|
2024-02-04 01:30:33 -05:00
|
|
|
pub fn negate(mut self) -> Self {
|
2021-01-28 14:58:29 -08:00
|
|
|
let mut i = 0;
|
|
|
|
|
let mut previous_end = 0;
|
|
|
|
|
while i < self.ranges.len() {
|
|
|
|
|
let range = &mut self.ranges[i];
|
|
|
|
|
let start = previous_end;
|
|
|
|
|
previous_end = range.end;
|
|
|
|
|
if start < range.start {
|
|
|
|
|
self.ranges[i] = start..range.start;
|
|
|
|
|
i += 1;
|
|
|
|
|
} else {
|
|
|
|
|
self.ranges.remove(i);
|
2018-12-08 13:44:11 -08:00
|
|
|
}
|
|
|
|
|
}
|
2021-01-28 14:58:29 -08:00
|
|
|
if previous_end < END {
|
|
|
|
|
self.ranges.push(previous_end..END);
|
|
|
|
|
}
|
|
|
|
|
self
|
2018-12-08 13:44:11 -08:00
|
|
|
}
|
|
|
|
|
|
2021-01-28 14:58:29 -08:00
|
|
|
/// Return this set with the character `c` added.
pub fn add_char(mut self, c: char) -> Self {
    let code_point = c as u32;
    self.add_int_range(0, code_point, code_point + 1);
    self
}
|
|
|
|
|
|
|
|
|
|
/// Return this set with the *inclusive* range of characters between `start`
/// and `end` added.
///
/// The bounds may be given in either order. Like `from_range`, they are
/// swapped when `start > end`, so an inverted range is never stored.
pub fn add_range(mut self, start: char, end: char) -> Self {
    let (first, last) = if start <= end {
        (start, end)
    } else {
        (end, start)
    };
    // Stored ranges are half-open, hence the `+ 1` on the inclusive upper bound.
    self.add_int_range(0, first as u32, last as u32 + 1);
    self
}
|
|
|
|
|
|
2024-02-04 01:30:33 -05:00
|
|
|
/// Return the union of this set and `other`.
pub fn add(mut self, other: &Self) -> Self {
    // `other`'s ranges are visited in order, so each insertion can resume
    // scanning from the position where the previous one landed.
    let mut resume_at = 0;
    for incoming in &other.ranges {
        resume_at = self.add_int_range(resume_at, incoming.start, incoming.end);
    }
    self
}
|
|
|
|
|
|
|
|
|
|
fn add_int_range(&mut self, mut i: usize, start: u32, end: u32) -> usize {
|
|
|
|
|
while i < self.ranges.len() {
|
|
|
|
|
let range = &mut self.ranges[i];
|
|
|
|
|
if range.start > end {
|
|
|
|
|
self.ranges.insert(i, start..end);
|
|
|
|
|
return i;
|
|
|
|
|
}
|
|
|
|
|
if range.end >= start {
|
|
|
|
|
range.end = range.end.max(end);
|
|
|
|
|
range.start = range.start.min(start);
|
2021-03-04 13:50:27 -08:00
|
|
|
|
|
|
|
|
// Join this range with the next range if needed.
|
|
|
|
|
while i + 1 < self.ranges.len() && self.ranges[i + 1].start <= self.ranges[i].end {
|
|
|
|
|
self.ranges[i].end = self.ranges[i].end.max(self.ranges[i + 1].end);
|
|
|
|
|
self.ranges.remove(i + 1);
|
|
|
|
|
}
|
|
|
|
|
|
2021-01-28 14:58:29 -08:00
|
|
|
return i;
|
|
|
|
|
}
|
|
|
|
|
i += 1;
|
|
|
|
|
}
|
|
|
|
|
self.ranges.push(start..end);
|
|
|
|
|
i
|
2018-12-29 13:56:00 -08:00
|
|
|
}
|
|
|
|
|
|
2024-02-04 01:30:33 -05:00
|
|
|
/// Report whether this set and `other` have at least one character in common.
pub fn does_intersect(&self, other: &Self) -> bool {
    let (mut mine, mut theirs) = (0, 0);
    while mine < self.ranges.len() && theirs < other.ranges.len() {
        let left = &self.ranges[mine];
        let right = &other.ranges[theirs];
        if left.end <= right.start {
            // `left` ends before `right` begins.
            mine += 1;
        } else if right.end <= left.start {
            // `right` ends before `left` begins.
            theirs += 1;
        } else {
            return true;
        }
    }
    false
}
|
|
|
|
|
|
2021-02-16 21:37:52 -08:00
|
|
|
/// Get the set of characters that are present in both this set
|
|
|
|
|
/// and the other set. Remove those common characters from both
|
|
|
|
|
/// of the operands.
|
2024-02-04 01:30:33 -05:00
|
|
|
pub fn remove_intersection(&mut self, other: &mut Self) -> Self {
|
2021-01-28 14:58:29 -08:00
|
|
|
let mut intersection = Vec::new();
|
|
|
|
|
let mut left_i = 0;
|
|
|
|
|
let mut right_i = 0;
|
|
|
|
|
while left_i < self.ranges.len() && right_i < other.ranges.len() {
|
|
|
|
|
let left = &mut self.ranges[left_i];
|
|
|
|
|
let right = &mut other.ranges[right_i];
|
|
|
|
|
|
|
|
|
|
match left.start.cmp(&right.start) {
|
|
|
|
|
Ordering::Less => {
|
|
|
|
|
// [ L ]
|
|
|
|
|
// [ R ]
|
|
|
|
|
if left.end <= right.start {
|
|
|
|
|
left_i += 1;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
match left.end.cmp(&right.end) {
|
|
|
|
|
// [ L ]
|
|
|
|
|
// [ R ]
|
|
|
|
|
Ordering::Less => {
|
|
|
|
|
intersection.push(right.start..left.end);
|
|
|
|
|
swap(&mut left.end, &mut right.start);
|
|
|
|
|
left_i += 1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// [ L ]
|
|
|
|
|
// [ R ]
|
|
|
|
|
Ordering::Equal => {
|
|
|
|
|
intersection.push(right.clone());
|
|
|
|
|
left.end = right.start;
|
|
|
|
|
other.ranges.remove(right_i);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// [ L ]
|
|
|
|
|
// [ R ]
|
|
|
|
|
Ordering::Greater => {
|
|
|
|
|
intersection.push(right.clone());
|
|
|
|
|
let new_range = left.start..right.start;
|
|
|
|
|
left.start = right.end;
|
|
|
|
|
self.ranges.insert(left_i, new_range);
|
|
|
|
|
other.ranges.remove(right_i);
|
|
|
|
|
left_i += 1;
|
|
|
|
|
}
|
|
|
|
|
}
|
2018-12-29 13:56:00 -08:00
|
|
|
}
|
2024-02-04 01:30:33 -05:00
|
|
|
// [ L ]
|
|
|
|
|
// [ R ]
|
|
|
|
|
Ordering::Equal if left.end < right.end => {
|
|
|
|
|
intersection.push(left.start..left.end);
|
|
|
|
|
right.start = left.end;
|
|
|
|
|
self.ranges.remove(left_i);
|
|
|
|
|
}
|
|
|
|
|
// [ L ]
|
|
|
|
|
// [ R ]
|
|
|
|
|
Ordering::Equal if left.end == right.end => {
|
|
|
|
|
intersection.push(left.clone());
|
|
|
|
|
self.ranges.remove(left_i);
|
|
|
|
|
other.ranges.remove(right_i);
|
|
|
|
|
}
|
|
|
|
|
// [ L ]
|
|
|
|
|
// [ R ]
|
|
|
|
|
Ordering::Equal if left.end > right.end => {
|
|
|
|
|
intersection.push(right.clone());
|
|
|
|
|
left.start = right.end;
|
|
|
|
|
other.ranges.remove(right_i);
|
2018-12-29 13:56:00 -08:00
|
|
|
}
|
2024-02-04 01:30:33 -05:00
|
|
|
Ordering::Equal => {}
|
2021-01-28 14:58:29 -08:00
|
|
|
Ordering::Greater => {
|
|
|
|
|
// [ L ]
|
|
|
|
|
// [ R ]
|
|
|
|
|
if left.start >= right.end {
|
|
|
|
|
right_i += 1;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
match left.end.cmp(&right.end) {
|
|
|
|
|
// [ L ]
|
|
|
|
|
// [ R ]
|
|
|
|
|
Ordering::Less => {
|
|
|
|
|
intersection.push(left.clone());
|
|
|
|
|
let new_range = right.start..left.start;
|
|
|
|
|
right.start = left.end;
|
|
|
|
|
other.ranges.insert(right_i, new_range);
|
|
|
|
|
self.ranges.remove(left_i);
|
|
|
|
|
right_i += 1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// [ L ]
|
|
|
|
|
// [ R ]
|
|
|
|
|
Ordering::Equal => {
|
|
|
|
|
intersection.push(left.clone());
|
|
|
|
|
right.end = left.start;
|
|
|
|
|
self.ranges.remove(left_i);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// [ L ]
|
|
|
|
|
// [ R ]
|
|
|
|
|
Ordering::Greater => {
|
|
|
|
|
intersection.push(left.start..right.end);
|
|
|
|
|
swap(&mut left.start, &mut right.end);
|
|
|
|
|
right_i += 1;
|
|
|
|
|
}
|
|
|
|
|
}
|
2018-12-29 13:56:00 -08:00
|
|
|
}
|
2021-01-28 14:58:29 -08:00
|
|
|
}
|
2018-12-29 13:56:00 -08:00
|
|
|
}
|
2024-02-04 01:30:33 -05:00
|
|
|
Self {
|
2021-01-28 14:58:29 -08:00
|
|
|
ranges: intersection,
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
Expand regex support to include emojis and binary ops
The `Emoji` property alias is already present, but the actual property
is not available since it lives in a new file. This adds that file to
the `generate-unicode-categories-json`.
The `emoji-data` file follows the same format as the ones we already
consume in `generate-unicode-categories-json`, so adding emoji support
is fairly easy. Without this, grammars would need to hard-code a set of
unicode ranges in their own regex. The Javascript library `emoji-regex`
cannot be used because of #451.
For unclear reasons, the characters #, *, and 0-9 are marked as
`Emoji=Yes` by `emoji-data.txt`. Because of this, a grammar that wishes
to use emojis is likely to want to exclude those characters. For that
reason, this change also adds support for binary operations in regexes,
e.g. `[\p{Emoji}&&[^#*0-9]]`.
Lastly (and perhaps controversially), this change introduces new
variables available at grammar compile time, for the major, minor, and
patch versions of the tree-sitter CLI used to compile the grammar. This
will allow grammars to conditionally adopt these new regex features
while remaining backward compatible with older versions of the CLI.
Without this part of the change, grammar authors who do not precompile
and check-in their `grammar.json` would need to wait for downstream
systems to adopt a newer tree-sitter CLI version before they could begin
to use these features.
2022-02-14 21:46:12 -08:00
|
|
|
/// Produces a `CharacterSet` containing every character in `self` that is not present in
|
|
|
|
|
/// `other`.
|
2024-02-04 01:30:33 -05:00
|
|
|
pub fn difference(mut self, mut other: Self) -> Self {
|
Expand regex support to include emojis and binary ops
The `Emoji` property alias is already present, but the actual property
is not available since it lives in a new file. This adds that file to
the `generate-unicode-categories-json`.
The `emoji-data` file follows the same format as the ones we already
consume in `generate-unicode-categories-json`, so adding emoji support
is fairly easy. his, grammars would need to hard-code a set of
unicode ranges in their own regex. The Javascript library `emoji-regex`
cannot be used because of #451.
For unclear reasons, the characters #, *, and 0-9 are marked as
`Emoji=Yes` by `emoji-data.txt`. Because of this, a grammar that wishes
to use emojis is likely to want to exclude those characters. For that
reason, this change also adds support for binary operations in regexes,
e.g. `[\p{Emoji}&&[^#*0-9]]`.
Lastly (and perhaps controversially), this change introduces new
variables available at grammar compile time, for the major, minor, and
patch versions of the tree-sitter CLI used to compile the grammar. This
will allow grammars to conditionally adopt these new regex features
while remaining backward compatible with older versions of the CLI.
Without this part of the change, grammar authors who do not precompile
and check-in their `grammar.json` would need to wait for downstream
systems to adopt a newer tree-sitter CLI version before they could begin
to use these features.
2022-02-14 21:46:12 -08:00
|
|
|
self.remove_intersection(&mut other);
|
|
|
|
|
self
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Produces a `CharacterSet` containing every character that is in _exactly one_ of `self` or
|
|
|
|
|
/// `other`, but is not present in both sets.
|
2024-02-04 01:30:33 -05:00
|
|
|
pub fn symmetric_difference(mut self, mut other: Self) -> Self {
|
Expand regex support to include emojis and binary ops
The `Emoji` property alias is already present, but the actual property
is not available since it lives in a new file. This adds that file to
the `generate-unicode-categories-json`.
The `emoji-data` file follows the same format as the ones we already
consume in `generate-unicode-categories-json`, so adding emoji support
is fairly easy. his, grammars would need to hard-code a set of
unicode ranges in their own regex. The Javascript library `emoji-regex`
cannot be used because of #451.
For unclear reasons, the characters #, *, and 0-9 are marked as
`Emoji=Yes` by `emoji-data.txt`. Because of this, a grammar that wishes
to use emojis is likely to want to exclude those characters. For that
reason, this change also adds support for binary operations in regexes,
e.g. `[\p{Emoji}&&[^#*0-9]]`.
Lastly (and perhaps controversially), this change introduces new
variables available at grammar compile time, for the major, minor, and
patch versions of the tree-sitter CLI used to compile the grammar. This
will allow grammars to conditionally adopt these new regex features
while remaining backward compatible with older versions of the CLI.
Without this part of the change, grammar authors who do not precompile
and check-in their `grammar.json` would need to wait for downstream
systems to adopt a newer tree-sitter CLI version before they could begin
to use these features.
2022-02-14 21:46:12 -08:00
|
|
|
self.remove_intersection(&mut other);
|
|
|
|
|
self.add(&other)
|
|
|
|
|
}
|
|
|
|
|
|
2024-02-04 01:30:33 -05:00
|
|
|
pub fn iter(&self) -> impl Iterator<Item = u32> + '_ {
|
|
|
|
|
self.ranges.iter().flat_map(std::clone::Clone::clone)
|
2021-01-28 14:58:29 -08:00
|
|
|
}
|
|
|
|
|
|
2024-02-04 01:30:33 -05:00
|
|
|
pub fn chars(&self) -> impl Iterator<Item = char> + '_ {
|
2021-01-28 14:58:29 -08:00
|
|
|
self.iter().filter_map(char::from_u32)
|
2018-12-29 13:56:00 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub fn is_empty(&self) -> bool {
|
2021-01-28 14:58:29 -08:00
|
|
|
self.ranges.is_empty()
|
2018-12-08 13:44:11 -08:00
|
|
|
}
|
|
|
|
|
|
2021-02-16 21:37:52 -08:00
|
|
|
/// Get a reduced list of character ranges, assuming that a given
|
|
|
|
|
/// set of characters can be safely ignored.
|
2021-01-28 14:58:29 -08:00
|
|
|
pub fn simplify_ignoring<'a>(
|
|
|
|
|
&'a self,
|
2020-05-26 13:39:11 -07:00
|
|
|
ruled_out_characters: &'a HashSet<u32>,
|
2021-01-28 14:58:29 -08:00
|
|
|
) -> Vec<Range<char>> {
|
2020-05-26 13:39:11 -07:00
|
|
|
let mut prev_range: Option<Range<char>> = None;
|
2021-01-28 14:58:29 -08:00
|
|
|
self.chars()
|
|
|
|
|
.map(|c| (c, false))
|
2020-05-26 13:39:11 -07:00
|
|
|
.chain(Some(('\0', true)))
|
|
|
|
|
.filter_map(move |(c, done)| {
|
|
|
|
|
if done {
|
|
|
|
|
return prev_range.clone();
|
|
|
|
|
}
|
|
|
|
|
if ruled_out_characters.contains(&(c as u32)) {
|
|
|
|
|
return None;
|
|
|
|
|
}
|
|
|
|
|
if let Some(range) = prev_range.clone() {
|
|
|
|
|
let mut prev_range_successor = range.end as u32 + 1;
|
|
|
|
|
while prev_range_successor < c as u32 {
|
|
|
|
|
if !ruled_out_characters.contains(&prev_range_successor) {
|
|
|
|
|
prev_range = Some(c..c);
|
|
|
|
|
return Some(range);
|
|
|
|
|
}
|
|
|
|
|
prev_range_successor += 1;
|
|
|
|
|
}
|
|
|
|
|
prev_range = Some(range.start..c);
|
|
|
|
|
} else {
|
|
|
|
|
prev_range = Some(c..c);
|
|
|
|
|
}
|
2024-02-04 01:30:33 -05:00
|
|
|
None
|
2020-05-26 13:39:11 -07:00
|
|
|
})
|
2021-01-28 14:58:29 -08:00
|
|
|
.collect()
|
2020-05-26 13:39:11 -07:00
|
|
|
}
|
|
|
|
|
|
2018-12-08 13:44:11 -08:00
|
|
|
pub fn contains(&self, c: char) -> bool {
|
2021-01-28 14:58:29 -08:00
|
|
|
self.ranges.iter().any(|r| r.contains(&(c as u32)))
|
2018-12-08 13:44:11 -08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2018-12-29 13:56:00 -08:00
|
|
|
impl Ord for CharacterSet {
|
2024-02-04 01:30:33 -05:00
|
|
|
fn cmp(&self, other: &Self) -> Ordering {
|
2021-01-28 14:58:29 -08:00
|
|
|
let count_cmp = self
|
|
|
|
|
.ranges
|
|
|
|
|
.iter()
|
2024-02-04 01:30:33 -05:00
|
|
|
.map(std::iter::ExactSizeIterator::len)
|
2021-01-28 14:58:29 -08:00
|
|
|
.sum::<usize>()
|
2024-02-04 01:30:33 -05:00
|
|
|
.cmp(
|
|
|
|
|
&other
|
|
|
|
|
.ranges
|
|
|
|
|
.iter()
|
|
|
|
|
.map(std::iter::ExactSizeIterator::len)
|
|
|
|
|
.sum(),
|
|
|
|
|
);
|
2021-01-28 14:58:29 -08:00
|
|
|
if count_cmp != Ordering::Equal {
|
|
|
|
|
return count_cmp;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (left_range, right_range) in self.ranges.iter().zip(other.ranges.iter()) {
|
|
|
|
|
let cmp = left_range.len().cmp(&right_range.len());
|
|
|
|
|
if cmp != Ordering::Equal {
|
|
|
|
|
return cmp;
|
2018-12-29 13:56:00 -08:00
|
|
|
}
|
2021-01-28 14:58:29 -08:00
|
|
|
|
|
|
|
|
for (left, right) in left_range.clone().zip(right_range.clone()) {
|
|
|
|
|
let cmp = left.cmp(&right);
|
|
|
|
|
if cmp != Ordering::Equal {
|
|
|
|
|
return cmp;
|
2018-12-29 13:56:00 -08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
2024-02-04 01:30:33 -05:00
|
|
|
Ordering::Equal
|
2018-12-29 13:56:00 -08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
impl PartialOrd for CharacterSet {
|
2024-02-04 01:30:33 -05:00
|
|
|
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
2018-12-29 13:56:00 -08:00
|
|
|
Some(self.cmp(other))
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-01-28 14:58:29 -08:00
|
|
|
impl fmt::Debug for CharacterSet {
|
|
|
|
|
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
|
|
|
|
write!(f, "CharacterSet [")?;
|
|
|
|
|
let mut set = self.clone();
|
|
|
|
|
if self.contains(char::MAX) {
|
|
|
|
|
write!(f, "^ ")?;
|
|
|
|
|
set = set.negate();
|
2018-12-29 13:56:00 -08:00
|
|
|
}
|
2021-01-28 14:58:29 -08:00
|
|
|
for (i, c) in set.chars().enumerate() {
|
|
|
|
|
if i > 0 {
|
|
|
|
|
write!(f, ", ")?;
|
2018-12-30 19:31:17 -08:00
|
|
|
}
|
2024-02-04 01:30:33 -05:00
|
|
|
write!(f, "{c:?}")?;
|
2018-12-29 13:56:00 -08:00
|
|
|
}
|
2021-01-28 14:58:29 -08:00
|
|
|
write!(f, "]")?;
|
|
|
|
|
Ok(())
|
2018-12-29 13:56:00 -08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2018-12-08 13:44:11 -08:00
|
|
|
impl Nfa {
|
2024-02-04 01:30:33 -05:00
|
|
|
pub const fn new() -> Self {
|
|
|
|
|
Self { states: Vec::new() }
|
2018-12-08 13:44:11 -08:00
|
|
|
}
|
|
|
|
|
|
2018-12-12 20:58:26 -08:00
|
|
|
pub fn last_state_id(&self) -> u32 {
|
2018-12-08 13:44:11 -08:00
|
|
|
self.states.len() as u32 - 1
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
impl fmt::Debug for Nfa {
|
|
|
|
|
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
2024-02-04 01:30:33 -05:00
|
|
|
writeln!(f, "Nfa {{ states: {{")?;
|
2018-12-08 13:44:11 -08:00
|
|
|
for (i, state) in self.states.iter().enumerate() {
|
2024-02-04 01:30:33 -05:00
|
|
|
writeln!(f, " {i}: {state:?},")?;
|
2018-12-08 13:44:11 -08:00
|
|
|
}
|
|
|
|
|
write!(f, "}} }}")?;
|
|
|
|
|
Ok(())
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
impl<'a> NfaCursor<'a> {
|
2018-12-12 20:58:26 -08:00
|
|
|
pub fn new(nfa: &'a Nfa, mut states: Vec<u32>) -> Self {
|
2018-12-29 13:56:00 -08:00
|
|
|
let mut result = Self {
|
|
|
|
|
nfa,
|
|
|
|
|
state_ids: Vec::new(),
|
|
|
|
|
};
|
2018-12-12 20:58:26 -08:00
|
|
|
result.add_states(&mut states);
|
2018-12-08 13:44:11 -08:00
|
|
|
result
|
|
|
|
|
}
|
|
|
|
|
|
2018-12-29 13:56:00 -08:00
|
|
|
pub fn reset(&mut self, mut states: Vec<u32>) {
|
|
|
|
|
self.state_ids.clear();
|
|
|
|
|
self.add_states(&mut states);
|
|
|
|
|
}
|
|
|
|
|
|
2019-01-02 16:48:44 -08:00
|
|
|
pub fn force_reset(&mut self, states: Vec<u32>) {
|
2024-02-04 01:30:33 -05:00
|
|
|
self.state_ids = states;
|
2019-01-02 16:48:44 -08:00
|
|
|
}
|
|
|
|
|
|
2019-01-04 09:42:06 -08:00
|
|
|
/// Iterate over the character set and separator flag of every `Advance`
/// state the cursor is currently on.
pub fn transition_chars(&self) -> impl Iterator<Item = (&CharacterSet, bool)> {
    self.raw_transitions()
        .map(|(chars, is_sep, _precedence, _state_id)| (chars, is_sep))
}
|
|
|
|
|
|
|
|
|
|
pub fn transitions(&self) -> Vec<NfaTransition> {
|
|
|
|
|
Self::group_transitions(self.raw_transitions())
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn raw_transitions(&self) -> impl Iterator<Item = (&CharacterSet, bool, i32, u32)> {
|
2018-12-29 13:56:00 -08:00
|
|
|
self.state_ids.iter().filter_map(move |id| {
|
|
|
|
|
if let NfaState::Advance {
|
|
|
|
|
chars,
|
|
|
|
|
state_id,
|
|
|
|
|
precedence,
|
2019-01-02 12:34:40 -08:00
|
|
|
is_sep,
|
2018-12-29 13:56:00 -08:00
|
|
|
} = &self.nfa.states[*id as usize]
|
|
|
|
|
{
|
2019-01-04 09:42:06 -08:00
|
|
|
Some((chars, *is_sep, *precedence, *state_id))
|
2018-12-29 13:56:00 -08:00
|
|
|
} else {
|
|
|
|
|
None
|
|
|
|
|
}
|
|
|
|
|
})
|
|
|
|
|
}
|
|
|
|
|
|
2019-01-04 09:42:06 -08:00
|
|
|
fn group_transitions<'b>(
|
|
|
|
|
iter: impl Iterator<Item = (&'b CharacterSet, bool, i32, u32)>,
|
|
|
|
|
) -> Vec<NfaTransition> {
|
2024-02-19 14:50:29 -05:00
|
|
|
let mut result = Vec::<NfaTransition>::new();
|
2019-01-04 09:42:06 -08:00
|
|
|
for (chars, is_sep, prec, state) in iter {
|
2018-12-29 13:56:00 -08:00
|
|
|
let mut chars = chars.clone();
|
|
|
|
|
let mut i = 0;
|
2019-01-03 10:30:59 -08:00
|
|
|
while i < result.len() && !chars.is_empty() {
|
2019-01-04 09:42:06 -08:00
|
|
|
let intersection = result[i].characters.remove_intersection(&mut chars);
|
2019-01-02 16:48:44 -08:00
|
|
|
if !intersection.is_empty() {
|
2019-01-04 09:42:06 -08:00
|
|
|
let mut intersection_states = result[i].states.clone();
|
2024-02-04 01:30:33 -05:00
|
|
|
if let Err(j) = intersection_states.binary_search(&state) {
|
|
|
|
|
intersection_states.insert(j, state);
|
2019-01-03 10:30:59 -08:00
|
|
|
}
|
2019-01-04 09:42:06 -08:00
|
|
|
let intersection_transition = NfaTransition {
|
|
|
|
|
characters: intersection,
|
2019-01-15 12:13:42 -08:00
|
|
|
is_separator: result[i].is_separator && is_sep,
|
2019-01-04 09:42:06 -08:00
|
|
|
precedence: max(result[i].precedence, prec),
|
|
|
|
|
states: intersection_states,
|
|
|
|
|
};
|
|
|
|
|
if result[i].characters.is_empty() {
|
|
|
|
|
result[i] = intersection_transition;
|
2019-01-02 16:48:44 -08:00
|
|
|
} else {
|
2019-01-04 09:42:06 -08:00
|
|
|
result.insert(i, intersection_transition);
|
2018-12-30 19:31:17 -08:00
|
|
|
i += 1;
|
|
|
|
|
}
|
2018-12-29 13:56:00 -08:00
|
|
|
}
|
|
|
|
|
i += 1;
|
|
|
|
|
}
|
|
|
|
|
if !chars.is_empty() {
|
2019-01-04 09:42:06 -08:00
|
|
|
result.push(NfaTransition {
|
|
|
|
|
characters: chars,
|
|
|
|
|
precedence: prec,
|
|
|
|
|
states: vec![state],
|
|
|
|
|
is_separator: is_sep,
|
|
|
|
|
});
|
2018-12-29 13:56:00 -08:00
|
|
|
}
|
|
|
|
|
}
|
2019-01-04 09:42:06 -08:00
|
|
|
result.sort_unstable_by(|a, b| a.characters.cmp(&b.characters));
|
2019-04-04 16:02:50 -07:00
|
|
|
|
|
|
|
|
let mut i = 0;
|
|
|
|
|
'i_loop: while i < result.len() {
|
|
|
|
|
for j in 0..i {
|
|
|
|
|
if result[j].states == result[i].states
|
|
|
|
|
&& result[j].is_separator == result[i].is_separator
|
|
|
|
|
&& result[j].precedence == result[i].precedence
|
|
|
|
|
{
|
|
|
|
|
let mut characters = CharacterSet::empty();
|
|
|
|
|
swap(&mut characters, &mut result[j].characters);
|
|
|
|
|
result[j].characters = characters.add(&result[i].characters);
|
|
|
|
|
result.remove(i);
|
|
|
|
|
continue 'i_loop;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
i += 1;
|
|
|
|
|
}
|
|
|
|
|
|
2018-12-29 13:56:00 -08:00
|
|
|
result
|
|
|
|
|
}
|
|
|
|
|
|
2018-12-30 19:31:17 -08:00
|
|
|
pub fn completions(&self) -> impl Iterator<Item = (usize, i32)> + '_ {
|
|
|
|
|
self.state_ids.iter().filter_map(move |state_id| {
|
2018-12-29 13:56:00 -08:00
|
|
|
if let NfaState::Accept {
|
|
|
|
|
variable_index,
|
|
|
|
|
precedence,
|
|
|
|
|
} = self.nfa.states[*state_id as usize]
|
|
|
|
|
{
|
2018-12-30 19:31:17 -08:00
|
|
|
Some((variable_index, precedence))
|
|
|
|
|
} else {
|
|
|
|
|
None
|
2018-12-08 13:44:11 -08:00
|
|
|
}
|
2018-12-30 19:31:17 -08:00
|
|
|
})
|
2018-12-08 13:44:11 -08:00
|
|
|
}
|
|
|
|
|
|
2018-12-12 20:58:26 -08:00
|
|
|
pub fn add_states(&mut self, new_state_ids: &mut Vec<u32>) {
|
|
|
|
|
let mut i = 0;
|
|
|
|
|
while i < new_state_ids.len() {
|
|
|
|
|
let state_id = new_state_ids[i];
|
|
|
|
|
let state = &self.nfa.states[state_id as usize];
|
2018-12-08 13:44:11 -08:00
|
|
|
if let NfaState::Split(left, right) = state {
|
2018-12-12 20:58:26 -08:00
|
|
|
let mut has_left = false;
|
|
|
|
|
let mut has_right = false;
|
|
|
|
|
for new_state_id in new_state_ids.iter() {
|
|
|
|
|
if *new_state_id == *left {
|
|
|
|
|
has_left = true;
|
|
|
|
|
}
|
|
|
|
|
if *new_state_id == *right {
|
|
|
|
|
has_right = true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if !has_left {
|
|
|
|
|
new_state_ids.push(*left);
|
|
|
|
|
}
|
|
|
|
|
if !has_right {
|
|
|
|
|
new_state_ids.push(*right);
|
|
|
|
|
}
|
|
|
|
|
} else if let Err(i) = self.state_ids.binary_search(&state_id) {
|
|
|
|
|
self.state_ids.insert(i, state_id);
|
2018-12-08 13:44:11 -08:00
|
|
|
}
|
2018-12-12 20:58:26 -08:00
|
|
|
i += 1;
|
2018-12-08 13:44:11 -08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
2018-12-29 13:56:00 -08:00
|
|
|
|
|
|
|
|
#[cfg(test)]
|
|
|
|
|
mod tests {
|
|
|
|
|
use super::*;
|
|
|
|
|
|
2021-03-04 13:50:27 -08:00
|
|
|
#[test]
fn test_adding_ranges() {
    // Builds the expected set from a list of inclusive ranges.
    let from_ranges = |ranges: &[(char, char)]| {
        ranges
            .iter()
            .fold(CharacterSet::empty(), |set, &(first, last)| {
                set.add_range(first, last)
            })
    };

    let mut set = from_ranges(&[('c', 'm'), ('q', 's')]);

    // within existing range
    set = set.add_char('d');
    assert_eq!(set, from_ranges(&[('c', 'm'), ('q', 's')]));

    // at end of existing range
    set = set.add_char('m');
    assert_eq!(set, from_ranges(&[('c', 'm'), ('q', 's')]));

    // adjacent to end of existing range
    set = set.add_char('n');
    assert_eq!(set, from_ranges(&[('c', 'n'), ('q', 's')]));

    // filling gap between existing ranges
    set = set.add_range('o', 'p');
    assert_eq!(set, from_ranges(&[('c', 's')]));

    // spanning several existing ranges at once
    set = from_ranges(&[('c', 'f'), ('i', 'l'), ('n', 'r')]);
    set = set.add_range('d', 'o');
    assert_eq!(set, from_ranges(&[('c', 'r')]));
}
|
|
|
|
|
|
|
|
|
|
#[test]
fn test_adding_sets() {
    let left = CharacterSet::empty()
        .add_range('c', 'f')
        .add_range('i', 'l');
    let right = CharacterSet::empty().add_range('b', 'g').add_char('h');

    let expected = CharacterSet::empty()
        .add_range('b', 'g')
        .add_range('h', 'l');
    let union = left.add(&right);

    assert_eq!(union, expected);
}
|
|
|
|
|
|
2018-12-29 13:56:00 -08:00
|
|
|
#[test]
|
2019-01-04 09:42:06 -08:00
|
|
|
fn test_group_transitions() {
|
2018-12-29 13:56:00 -08:00
|
|
|
let table = [
|
2019-01-03 10:30:59 -08:00
|
|
|
// overlapping character classes
|
2018-12-29 13:56:00 -08:00
|
|
|
(
|
|
|
|
|
vec![
|
2019-01-04 09:42:06 -08:00
|
|
|
(CharacterSet::empty().add_range('a', 'f'), false, 0, 1),
|
|
|
|
|
(CharacterSet::empty().add_range('d', 'i'), false, 1, 2),
|
2018-12-29 13:56:00 -08:00
|
|
|
],
|
|
|
|
|
vec![
|
2019-01-04 09:42:06 -08:00
|
|
|
NfaTransition {
|
|
|
|
|
characters: CharacterSet::empty().add_range('a', 'c'),
|
|
|
|
|
is_separator: false,
|
|
|
|
|
precedence: 0,
|
|
|
|
|
states: vec![1],
|
|
|
|
|
},
|
|
|
|
|
NfaTransition {
|
|
|
|
|
characters: CharacterSet::empty().add_range('d', 'f'),
|
|
|
|
|
is_separator: false,
|
|
|
|
|
precedence: 1,
|
|
|
|
|
states: vec![1, 2],
|
|
|
|
|
},
|
|
|
|
|
NfaTransition {
|
|
|
|
|
characters: CharacterSet::empty().add_range('g', 'i'),
|
|
|
|
|
is_separator: false,
|
|
|
|
|
precedence: 1,
|
|
|
|
|
states: vec![2],
|
|
|
|
|
},
|
2018-12-29 13:56:00 -08:00
|
|
|
],
|
|
|
|
|
),
|
2019-01-03 10:30:59 -08:00
|
|
|
// large character class followed by many individual characters
|
2018-12-29 13:56:00 -08:00
|
|
|
(
|
|
|
|
|
vec![
|
2019-01-04 09:42:06 -08:00
|
|
|
(CharacterSet::empty().add_range('a', 'z'), false, 0, 1),
|
|
|
|
|
(CharacterSet::empty().add_char('d'), false, 0, 2),
|
|
|
|
|
(CharacterSet::empty().add_char('i'), false, 0, 3),
|
|
|
|
|
(CharacterSet::empty().add_char('f'), false, 0, 4),
|
2018-12-29 13:56:00 -08:00
|
|
|
],
|
|
|
|
|
vec![
|
2019-01-04 09:42:06 -08:00
|
|
|
NfaTransition {
|
|
|
|
|
characters: CharacterSet::empty().add_char('d'),
|
|
|
|
|
is_separator: false,
|
|
|
|
|
precedence: 0,
|
|
|
|
|
states: vec![1, 2],
|
|
|
|
|
},
|
|
|
|
|
NfaTransition {
|
|
|
|
|
characters: CharacterSet::empty().add_char('f'),
|
|
|
|
|
is_separator: false,
|
|
|
|
|
precedence: 0,
|
|
|
|
|
states: vec![1, 4],
|
|
|
|
|
},
|
|
|
|
|
NfaTransition {
|
|
|
|
|
characters: CharacterSet::empty().add_char('i'),
|
|
|
|
|
is_separator: false,
|
|
|
|
|
precedence: 0,
|
|
|
|
|
states: vec![1, 3],
|
|
|
|
|
},
|
|
|
|
|
NfaTransition {
|
|
|
|
|
characters: CharacterSet::empty()
|
2018-12-29 13:56:00 -08:00
|
|
|
.add_range('a', 'c')
|
|
|
|
|
.add_char('e')
|
|
|
|
|
.add_range('g', 'h')
|
|
|
|
|
.add_range('j', 'z'),
|
2019-01-04 09:42:06 -08:00
|
|
|
is_separator: false,
|
|
|
|
|
precedence: 0,
|
|
|
|
|
states: vec![1],
|
|
|
|
|
},
|
2018-12-29 13:56:00 -08:00
|
|
|
],
|
|
|
|
|
),
|
2019-01-03 10:30:59 -08:00
|
|
|
// negated character class followed by an individual character
|
|
|
|
|
(
|
|
|
|
|
vec![
|
2019-01-04 09:42:06 -08:00
|
|
|
(CharacterSet::empty().add_char('0'), false, 0, 1),
|
|
|
|
|
(CharacterSet::empty().add_char('b'), false, 0, 2),
|
2019-01-03 10:30:59 -08:00
|
|
|
(
|
|
|
|
|
CharacterSet::empty().add_range('a', 'f').negate(),
|
2019-01-04 09:42:06 -08:00
|
|
|
false,
|
2019-01-03 10:30:59 -08:00
|
|
|
0,
|
|
|
|
|
3,
|
|
|
|
|
),
|
2019-01-04 09:42:06 -08:00
|
|
|
(CharacterSet::empty().add_char('c'), false, 0, 4),
|
2019-01-03 10:30:59 -08:00
|
|
|
],
|
|
|
|
|
vec![
|
2019-01-04 09:42:06 -08:00
|
|
|
NfaTransition {
|
|
|
|
|
characters: CharacterSet::empty().add_char('0'),
|
|
|
|
|
precedence: 0,
|
|
|
|
|
states: vec![1, 3],
|
|
|
|
|
is_separator: false,
|
|
|
|
|
},
|
|
|
|
|
NfaTransition {
|
|
|
|
|
characters: CharacterSet::empty().add_char('b'),
|
|
|
|
|
precedence: 0,
|
|
|
|
|
states: vec![2],
|
|
|
|
|
is_separator: false,
|
|
|
|
|
},
|
|
|
|
|
NfaTransition {
|
|
|
|
|
characters: CharacterSet::empty().add_char('c'),
|
|
|
|
|
precedence: 0,
|
|
|
|
|
states: vec![4],
|
|
|
|
|
is_separator: false,
|
|
|
|
|
},
|
|
|
|
|
NfaTransition {
|
|
|
|
|
characters: CharacterSet::empty()
|
2019-01-03 10:30:59 -08:00
|
|
|
.add_range('a', 'f')
|
|
|
|
|
.add_char('0')
|
|
|
|
|
.negate(),
|
2019-01-04 09:42:06 -08:00
|
|
|
precedence: 0,
|
|
|
|
|
states: vec![3],
|
|
|
|
|
is_separator: false,
|
|
|
|
|
},
|
2019-01-03 10:30:59 -08:00
|
|
|
],
|
|
|
|
|
),
|
|
|
|
|
// multiple negated character classes
|
|
|
|
|
(
|
|
|
|
|
vec![
|
2021-01-28 14:58:29 -08:00
|
|
|
(CharacterSet::from_char('a'), false, 0, 1),
|
|
|
|
|
(CharacterSet::from_range('a', 'c').negate(), false, 0, 2),
|
|
|
|
|
(CharacterSet::from_char('g'), false, 0, 6),
|
|
|
|
|
(CharacterSet::from_range('d', 'f').negate(), false, 0, 3),
|
|
|
|
|
(CharacterSet::from_range('g', 'i').negate(), false, 0, 4),
|
|
|
|
|
(CharacterSet::from_char('g'), false, 0, 5),
|
2019-01-03 10:30:59 -08:00
|
|
|
],
|
|
|
|
|
vec![
|
2019-01-04 09:42:06 -08:00
|
|
|
NfaTransition {
|
2021-01-28 14:58:29 -08:00
|
|
|
characters: CharacterSet::from_char('a'),
|
2019-01-04 09:42:06 -08:00
|
|
|
precedence: 0,
|
|
|
|
|
states: vec![1, 3, 4],
|
|
|
|
|
is_separator: false,
|
|
|
|
|
},
|
|
|
|
|
NfaTransition {
|
2021-01-28 14:58:29 -08:00
|
|
|
characters: CharacterSet::from_char('g'),
|
2019-01-04 09:42:06 -08:00
|
|
|
precedence: 0,
|
|
|
|
|
states: vec![2, 3, 5, 6],
|
|
|
|
|
is_separator: false,
|
|
|
|
|
},
|
|
|
|
|
NfaTransition {
|
2021-01-28 14:58:29 -08:00
|
|
|
characters: CharacterSet::from_range('b', 'c'),
|
2019-01-04 09:42:06 -08:00
|
|
|
precedence: 0,
|
|
|
|
|
states: vec![3, 4],
|
|
|
|
|
is_separator: false,
|
|
|
|
|
},
|
|
|
|
|
NfaTransition {
|
2021-01-28 14:58:29 -08:00
|
|
|
characters: CharacterSet::from_range('h', 'i'),
|
2019-01-04 09:42:06 -08:00
|
|
|
precedence: 0,
|
|
|
|
|
states: vec![2, 3],
|
|
|
|
|
is_separator: false,
|
|
|
|
|
},
|
|
|
|
|
NfaTransition {
|
2021-01-28 14:58:29 -08:00
|
|
|
characters: CharacterSet::from_range('d', 'f'),
|
2019-01-04 09:42:06 -08:00
|
|
|
precedence: 0,
|
|
|
|
|
states: vec![2, 4],
|
|
|
|
|
is_separator: false,
|
|
|
|
|
},
|
|
|
|
|
NfaTransition {
|
2021-01-28 14:58:29 -08:00
|
|
|
characters: CharacterSet::from_range('a', 'i').negate(),
|
2019-01-04 09:42:06 -08:00
|
|
|
precedence: 0,
|
|
|
|
|
states: vec![2, 3, 4],
|
|
|
|
|
is_separator: false,
|
|
|
|
|
},
|
2019-01-03 10:30:59 -08:00
|
|
|
],
|
|
|
|
|
),
|
2019-04-04 16:02:50 -07:00
|
|
|
// disjoint characters with same state
|
|
|
|
|
(
|
|
|
|
|
vec![
|
2021-01-28 14:58:29 -08:00
|
|
|
(CharacterSet::from_char('a'), false, 0, 1),
|
|
|
|
|
(CharacterSet::from_char('b'), false, 0, 2),
|
|
|
|
|
(CharacterSet::from_char('c'), false, 0, 1),
|
|
|
|
|
(CharacterSet::from_char('d'), false, 0, 1),
|
|
|
|
|
(CharacterSet::from_char('e'), false, 0, 2),
|
2019-04-04 16:02:50 -07:00
|
|
|
],
|
|
|
|
|
vec![
|
|
|
|
|
NfaTransition {
|
2021-01-28 14:58:29 -08:00
|
|
|
characters: CharacterSet::empty().add_char('a').add_range('c', 'd'),
|
2019-04-04 16:02:50 -07:00
|
|
|
precedence: 0,
|
|
|
|
|
states: vec![1],
|
|
|
|
|
is_separator: false,
|
|
|
|
|
},
|
|
|
|
|
NfaTransition {
|
2021-01-28 14:58:29 -08:00
|
|
|
characters: CharacterSet::empty().add_char('b').add_char('e'),
|
2019-04-04 16:02:50 -07:00
|
|
|
precedence: 0,
|
|
|
|
|
states: vec![2],
|
|
|
|
|
is_separator: false,
|
|
|
|
|
},
|
|
|
|
|
],
|
|
|
|
|
),
|
2018-12-29 13:56:00 -08:00
|
|
|
];
|
|
|
|
|
|
2021-01-28 14:58:29 -08:00
|
|
|
for (i, row) in table.iter().enumerate() {
|
2018-12-29 13:56:00 -08:00
|
|
|
assert_eq!(
|
2019-04-04 16:02:50 -07:00
|
|
|
NfaCursor::group_transitions(
|
|
|
|
|
row.0
|
|
|
|
|
.iter()
|
|
|
|
|
.map(|(chars, is_sep, prec, state)| (chars, *is_sep, *prec, *state))
|
|
|
|
|
),
|
2021-01-28 14:58:29 -08:00
|
|
|
row.1,
|
2024-02-04 01:30:33 -05:00
|
|
|
"row {i}",
|
2018-12-29 13:56:00 -08:00
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[test]
fn test_character_set_intersection_difference_ops() {
    /// One scenario: two input sets together with the sets expected once
    /// their overlap has been split out.
    struct Row {
        /// First operand.
        left: CharacterSet,
        /// Second operand.
        right: CharacterSet,
        /// Characters expected to remain in `left` only.
        left_only: CharacterSet,
        /// Characters expected to remain in `right` only.
        right_only: CharacterSet,
        /// Characters expected to be shared by both operands.
        intersection: CharacterSet,
    }

    let rows = [
        // Disjoint: L ends right before R begins.
        //   [  L  ]
        //          [  R  ]
        Row {
            left: CharacterSet::from_range('a', 'f'),
            right: CharacterSet::from_range('g', 'm'),
            left_only: CharacterSet::from_range('a', 'f'),
            right_only: CharacterSet::from_range('g', 'm'),
            intersection: CharacterSet::empty(),
        },
        // Partial overlap: the end of L overlaps the start of R.
        //   [  L  ]
        //      [  R  ]
        Row {
            left: CharacterSet::from_range('a', 'f'),
            right: CharacterSet::from_range('c', 'i'),
            left_only: CharacterSet::from_range('a', 'b'),
            right_only: CharacterSet::from_range('g', 'i'),
            intersection: CharacterSet::from_range('c', 'f'),
        },
        // R is a suffix of L.
        //   [   L   ]
        //       [ R ]
        Row {
            left: CharacterSet::from_range('a', 'f'),
            right: CharacterSet::from_range('d', 'f'),
            left_only: CharacterSet::from_range('a', 'c'),
            right_only: CharacterSet::empty(),
            intersection: CharacterSet::from_range('d', 'f'),
        },
        // R lies strictly inside L, splitting L's remainder in two.
        //   [     L     ]
        //       [ R ]
        Row {
            left: CharacterSet::from_range('a', 'm'),
            right: CharacterSet::from_range('d', 'f'),
            left_only: CharacterSet::empty()
                .add_range('a', 'c')
                .add_range('g', 'm'),
            right_only: CharacterSet::empty(),
            intersection: CharacterSet::from_range('d', 'f'),
        },
        // R is the single last character of L.
        //   [  L  ]
        //       [R]
        // (This row used to be listed twice, byte-for-byte; the redundant
        // copy has been dropped.)
        Row {
            left: CharacterSet::from_range(',', '/'),
            right: CharacterSet::from_char('/'),
            left_only: CharacterSet::from_range(',', '.'),
            right_only: CharacterSet::empty(),
            intersection: CharacterSet::from_char('/'),
        },
        // L consists of two ranges and R bridges the gap between them.
        //   [ L1 ]    [ L2 ]
        //      [    R    ]
        Row {
            left: CharacterSet::empty()
                .add_range('a', 'e')
                .add_range('h', 'l'),
            right: CharacterSet::from_range('c', 'i'),
            left_only: CharacterSet::empty()
                .add_range('a', 'b')
                .add_range('j', 'l'),
            right_only: CharacterSet::from_range('f', 'g'),
            intersection: CharacterSet::empty()
                .add_range('c', 'e')
                .add_range('h', 'i'),
        },
    ];

    for (i, row) in rows.iter().enumerate() {
        // (a) `remove_intersection` with `left` as the receiver: the return
        // value must be the overlap, and both operands must be left holding
        // only their exclusive characters.
        let mut left = row.left.clone();
        let mut right = row.right.clone();
        assert_eq!(
            left.remove_intersection(&mut right),
            row.intersection,
            "row {i}a: {:?} && {:?}",
            row.left,
            row.right
        );
        assert_eq!(
            left, row.left_only,
            "row {i}a: {:?} - {:?}",
            row.left, row.right
        );
        assert_eq!(
            right, row.right_only,
            "row {i}a: {:?} - {:?}",
            row.right, row.left
        );

        // (b) Same checks with the operands swapped, so `right` is the
        // receiver; the outcome must not depend on the call direction.
        let mut left = row.left.clone();
        let mut right = row.right.clone();
        assert_eq!(
            right.remove_intersection(&mut left),
            row.intersection,
            "row {i}b: {:?} && {:?}",
            row.left,
            row.right
        );
        assert_eq!(
            left, row.left_only,
            "row {i}b: {:?} - {:?}",
            row.left, row.right
        );
        assert_eq!(
            right, row.right_only,
            "row {i}b: {:?} - {:?}",
            row.right, row.left
        );

        // (c) `difference` must yield exactly the left-only characters.
        // Labelled "c" (previously "b") so a failure here cannot be confused
        // with the swapped-receiver checks above.
        assert_eq!(
            row.left.clone().difference(row.right.clone()),
            row.left_only,
            "row {i}c: {:?} -- {:?}",
            row.left,
            row.right
        );

        // (d) `symmetric_difference` must equal the union of both exclusive
        // parts. Labelled "d" (previously "b") for the same reason.
        let symm_difference = row.left_only.clone().add(&row.right_only);
        assert_eq!(
            row.left.clone().symmetric_difference(row.right.clone()),
            symm_difference,
            "row {i}d: {:?} ~~ {:?}",
            row.left,
            row.right
        );
    }
}
|
2018-12-30 19:31:17 -08:00
|
|
|
|
|
|
|
|
#[test]
fn test_character_set_does_intersect() {
    /// Asserts that `does_intersect` reports `expected` for the pair, first
    /// as `first.does_intersect(second)` and then the other way round.
    ///
    /// `#[track_caller]` makes a failure point at the calling line below
    /// rather than at this helper.
    #[track_caller]
    fn check_both_ways(first: &CharacterSet, second: &CharacterSet, expected: bool) {
        assert_eq!(first.does_intersect(second), expected);
        assert_eq!(second.does_intersect(first), expected);
    }

    // Two empty sets share nothing.
    check_both_ways(&CharacterSet::empty(), &CharacterSet::empty(), false);

    // Identical single-character sets.
    check_both_ways(
        &CharacterSet::empty().add_char('a'),
        &CharacterSet::empty().add_char('a'),
        true,
    );

    // 'b' falls in the gap between 'a' and 'c'.
    check_both_ways(
        &CharacterSet::empty().add_char('b'),
        &CharacterSet::empty().add_char('a').add_char('c'),
        false,
    );

    // A single character inside a range.
    check_both_ways(
        &CharacterSet::from_char('b'),
        &CharacterSet::from_range('a', 'c'),
        true,
    );

    // A single character against the negation of a range containing it.
    check_both_ways(
        &CharacterSet::from_char('b'),
        &CharacterSet::from_range('a', 'c').negate(),
        false,
    );

    // Two identical negated sets.
    check_both_ways(
        &CharacterSet::from_char('a').negate(),
        &CharacterSet::from_char('a').negate(),
        true,
    );

    // A character against the negation of a different character.
    check_both_ways(
        &CharacterSet::from_char('c'),
        &CharacterSet::from_char('a').negate(),
        true,
    );

    // A single character equal to the last character of a range.
    check_both_ways(
        &CharacterSet::from_range('c', 'f'),
        &CharacterSet::from_char('f'),
        true,
    );
}
|
2020-05-26 13:39:11 -07:00
|
|
|
|
|
|
|
|
#[test]
// The expectations below deliberately build `Vec<Range<char>>` values holding
// a single range (e.g. `vec!['a'..'a']`), which is what this lint flags.
#[allow(clippy::single_range_in_vec_init)]
fn test_character_set_get_ranges() {
    /// One scenario for `CharacterSet::simplify_ignoring`.
    struct Row {
        /// Characters added to the set under test.
        chars: Vec<char>,
        /// Characters passed to `simplify_ignoring` as ruled out.
        ruled_out_chars: Vec<char>,
        /// Ranges the call is expected to return. Judging by the data here,
        /// each range's `end` is the last character covered.
        expected_ranges: Vec<Range<char>>,
    }

    let table = [
        // A lone character.
        Row {
            chars: vec!['a'],
            ruled_out_chars: vec![],
            expected_ranges: vec!['a'..'a'],
        },
        // Nothing ruled out: only consecutive characters are merged.
        Row {
            chars: vec!['a', 'b', 'c', 'e', 'z'],
            ruled_out_chars: vec![],
            expected_ranges: vec!['a'..'c', 'e'..'e', 'z'..'z'],
        },
        // The gaps at 'd' and 'f'..'g' are all ruled out, so 'a' through 'h'
        // is expected to collapse into a single range.
        Row {
            chars: vec!['a', 'b', 'c', 'e', 'h', 'z'],
            ruled_out_chars: vec!['d', 'f', 'g'],
            expected_ranges: vec!['a'..'h', 'z'..'z'],
        },
    ];

    for row in &table {
        // The collection type is inferred from `simplify_ignoring`'s parameter.
        let ignored = row.ruled_out_chars.iter().copied().map(u32::from).collect();
        let set = row
            .chars
            .iter()
            .fold(CharacterSet::empty(), |acc, &ch| acc.add_char(ch));
        let simplified = set.simplify_ignoring(&ignored);
        assert_eq!(simplified, row.expected_ranges);
    }
}
|
2018-12-29 13:56:00 -08:00
|
|
|
}
|