tree-sitter/src/nfa.rs
2018-12-29 13:56:00 -08:00

532 lines
17 KiB
Rust

use std::char;
use std::cmp::max;
use std::cmp::Ordering;
use std::fmt;
use std::mem::swap;
/// A set of characters, stored either as the characters it contains or as the
/// characters it leaves out.
///
/// The builder methods in this file (`add_char`, `add_range`, `add`) keep the
/// list sorted and free of duplicates.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum CharacterSet {
/// Exactly the listed characters belong to the set.
Include(Vec<char>),
/// Every character except the listed ones belongs to the set.
Exclude(Vec<char>),
}
/// One state of an `Nfa`. States refer to each other by their index in `Nfa::states`.
#[derive(Debug, PartialEq, Eq)]
pub enum NfaState {
/// Consumes one character: `NfaCursor::advance` moves from this state to
/// `state_id` when the input character is contained in `chars`.
Advance {
chars: CharacterSet,
state_id: u32,
/// When set, taking this transition keeps the cursor's "in separator"
/// flag alive (see `NfaCursor::advance`).
is_sep: bool,
/// Reported together with the transition by `NfaCursor::successors`.
precedence: i32,
},
/// Consumes no input; `NfaCursor::add_states` expands it into both listed states.
Split(u32, u32),
/// A final state; `NfaCursor::finished_id` reports its `variable_index`
/// and `precedence`.
Accept {
variable_index: usize,
precedence: i32,
},
}
/// A nondeterministic finite automaton stored as a flat list of states.
///
/// States refer to each other by their index in `states`. `Debug` is not
/// derived here because the file provides a hand-written implementation.
#[derive(PartialEq, Eq)]
pub struct Nfa {
/// All states of the automaton; a state's id is its index in this vector.
pub states: Vec<NfaState>,
}
impl Default for Nfa {
fn default() -> Self {
Self { states: Vec::new() }
}
}
/// A position inside an `Nfa`: the set of states the automaton may currently be in.
#[derive(Debug)]
pub struct NfaCursor<'a> {
/// Ids of the current non-`Split` states; `add_states` inserts them in
/// sorted order and skips ids that are already present.
pub(crate) state_ids: Vec<u32>,
/// The automaton being traversed.
nfa: &'a Nfa,
/// Starts out `true`; `advance` clears it on the first step that takes no
/// `is_sep` transition.
in_sep: bool,
}
impl CharacterSet {
    /// Returns the set that contains no characters.
    pub fn empty() -> Self {
        CharacterSet::Include(Vec::new())
    }

    /// Returns the set that contains every character.
    pub fn all() -> Self {
        CharacterSet::Exclude(Vec::new())
    }

    /// Returns the complement of this set: the same character list, with the
    /// `Include`/`Exclude` meaning flipped.
    pub fn negate(self) -> CharacterSet {
        match self {
            CharacterSet::Include(chars) => CharacterSet::Exclude(chars),
            CharacterSet::Exclude(chars) => CharacterSet::Include(chars),
        }
    }

    /// Returns this set with `c` added, keeping the list sorted and unique.
    ///
    /// # Panics
    ///
    /// Panics if `self` is an `Exclude` set.
    pub fn add_char(self, c: char) -> Self {
        if let CharacterSet::Include(mut chars) = self {
            if let Err(i) = chars.binary_search(&c) {
                chars.insert(i, c);
            }
            CharacterSet::Include(chars)
        } else {
            panic!("Called add with a negated character set");
        }
    }

    /// Returns this set with every character from `start` to `end` (both
    /// inclusive) added. An empty range (`start > end`) adds nothing.
    ///
    /// Code points in the range that are not valid `char`s (the surrogates
    /// U+D800..=U+DFFF) are skipped, so a range spanning that gap no longer
    /// panics.
    ///
    /// # Panics
    ///
    /// Panics if `self` is an `Exclude` set.
    pub fn add_range(self, start: char, end: char) -> Self {
        if let CharacterSet::Include(mut chars) = self {
            // `char::from_u32` yields `None` for surrogate code points; drop
            // those instead of unwrapping.
            chars.extend(((start as u32)..=(end as u32)).filter_map(char::from_u32));
            chars.sort_unstable();
            chars.dedup();
            CharacterSet::Include(chars)
        } else {
            panic!("Called add with a negated character set");
        }
    }

    /// Returns the union of this set and `other`.
    ///
    /// # Panics
    ///
    /// Panics if either `self` or `other` is an `Exclude` set.
    pub fn add(self, other: &CharacterSet) -> Self {
        if let CharacterSet::Include(other_chars) = other {
            if let CharacterSet::Include(mut chars) = self {
                chars.extend(other_chars);
                chars.sort_unstable();
                chars.dedup();
                return CharacterSet::Include(chars);
            }
        }
        panic!("Called add with a negated character set");
    }

    /// Splits off the characters that `self` and `other` have in common.
    ///
    /// The shared characters are removed from both `self` and `other` (both
    /// are mutated in place) and returned as a new set. How this is done
    /// depends on the representation of the two operands:
    ///
    /// * `Include` / `Include`: the common characters are deleted from both lists.
    /// * `Include` / `Exclude` (either order): the included side keeps only
    ///   the characters the other side excludes, and the excluded side
    ///   additionally excludes the shared characters.
    /// * `Exclude` / `Exclude`: both operands become `Include` sets, each
    ///   holding the characters that only the *other* operand excluded.
    ///
    /// NOTE(review): in the `Exclude` / `Exclude` case the returned set
    /// excludes only the characters excluded by BOTH inputs, so it still
    /// contains the characters handed back to `self` and `other`. The
    /// characters common to two negated sets would exclude the union of the
    /// two lists. `test_character_set_intersection` asserts the current
    /// value, so the behavior is left unchanged here — confirm the intent
    /// and change the code and that test together.
    pub fn remove_intersection(&mut self, other: &mut CharacterSet) -> CharacterSet {
        match self {
            CharacterSet::Include(chars) => match other {
                CharacterSet::Include(other_chars) => {
                    CharacterSet::Include(remove_chars(chars, other_chars, true))
                }
                CharacterSet::Exclude(other_chars) => {
                    // After this call `chars` holds the intersection and
                    // `removed` holds what `self` should keep; the swap below
                    // puts each where it belongs.
                    let mut removed = remove_chars(chars, other_chars, false);
                    add_chars(other_chars, chars);
                    swap(&mut removed, chars);
                    CharacterSet::Include(removed)
                }
            },
            CharacterSet::Exclude(chars) => match other {
                CharacterSet::Include(other_chars) => {
                    // Mirror image of the Include/Exclude case above.
                    let mut removed = remove_chars(other_chars, chars, false);
                    add_chars(chars, other_chars);
                    swap(&mut removed, other_chars);
                    CharacterSet::Include(removed)
                }
                CharacterSet::Exclude(other_chars) => {
                    // `removed` = characters excluded by both operands; what
                    // is left in each list is excluded by that operand only.
                    let removed = remove_chars(chars, other_chars, true);
                    let mut included_characters = Vec::new();
                    let mut other_included_characters = Vec::new();
                    swap(&mut included_characters, other_chars);
                    swap(&mut other_included_characters, chars);
                    *self = CharacterSet::Include(included_characters);
                    *other = CharacterSet::Include(other_included_characters);
                    CharacterSet::Exclude(removed)
                }
            },
        }
    }

    /// Returns `true` only for an `Include` set with no characters. An
    /// `Exclude` set is never reported as empty.
    pub fn is_empty(&self) -> bool {
        match self {
            CharacterSet::Include(chars) => chars.is_empty(),
            CharacterSet::Exclude(_) => false,
        }
    }

    /// Returns whether `c` belongs to the set.
    pub fn contains(&self, c: char) -> bool {
        match self {
            CharacterSet::Include(chars) => chars.contains(&c),
            CharacterSet::Exclude(chars) => !chars.contains(&c),
        }
    }
}
impl Ord for CharacterSet {
    /// Orders every `Include` set before every `Exclude` set; two sets of the
    /// same kind are ordered by comparing their character lists.
    fn cmp(&self, other: &CharacterSet) -> Ordering {
        match (self, other) {
            (CharacterSet::Include(mine), CharacterSet::Include(theirs))
            | (CharacterSet::Exclude(mine), CharacterSet::Exclude(theirs)) => {
                compare_chars(mine, theirs)
            }
            (CharacterSet::Include(_), CharacterSet::Exclude(_)) => Ordering::Less,
            (CharacterSet::Exclude(_), CharacterSet::Include(_)) => Ordering::Greater,
        }
    }
}
impl PartialOrd for CharacterSet {
    /// Always `Some`: delegates to the total order defined by `Ord`.
    fn partial_cmp(&self, other: &CharacterSet) -> Option<Ordering> {
        let ordering = Ord::cmp(self, other);
        Some(ordering)
    }
}
/// Inserts every character of `right` into `left` at its binary-search
/// position, skipping characters that `left` already holds. `left` is
/// expected to be sorted on entry.
fn add_chars(left: &mut Vec<char>, right: &Vec<char>) {
    for &ch in right.iter() {
        if let Err(slot) = left.binary_search(&ch) {
            left.insert(slot, ch);
        }
    }
}
/// Deletes from `left` every character that also occurs in `right` and
/// returns the deleted characters in the order they appear in `right`.
///
/// When `mutate_right` is `true` the shared characters are removed from
/// `right` as well; otherwise `right` is left as it was.
fn remove_chars(left: &mut Vec<char>, right: &mut Vec<char>, mutate_right: bool) -> Vec<char> {
    let mut shared = Vec::new();
    right.retain(|candidate| {
        let found = left.iter().position(|existing| *existing == *candidate);
        match found {
            Some(at) => {
                left.remove(at);
                shared.push(*candidate);
                // Keep the character in `right` only when asked not to touch it.
                !mutate_right
            }
            None => true,
        }
    });
    shared
}
/// Compares two character lists lexicographically: element by element, with
/// a list that is a strict prefix of the other ordered first.
fn compare_chars(chars: &Vec<char>, other_chars: &Vec<char>) -> Ordering {
    chars.iter().cmp(other_chars.iter())
}
impl Nfa {
pub fn new() -> Self {
Nfa { states: Vec::new() }
}
pub fn last_state_id(&self) -> u32 {
self.states.len() as u32 - 1
}
pub fn prepend(&mut self, f: impl Fn(u32) -> NfaState) {
self.states.push(f(self.last_state_id()));
}
}
impl fmt::Debug for Nfa {
    /// Prints the automaton with one `index: state` entry per line.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "Nfa {{ states: {{\n")?;
        self.states
            .iter()
            .enumerate()
            .try_for_each(|(index, state)| write!(f, " {}: {:?},\n", index, state))?;
        write!(f, "}} }}")
    }
}
impl<'a> NfaCursor<'a> {
    /// Creates a cursor over `nfa` positioned at `states`, with `Split` states
    /// already expanded. The cursor starts out "in a separator".
    pub fn new(nfa: &'a Nfa, mut states: Vec<u32>) -> Self {
        let mut result = Self {
            nfa,
            state_ids: Vec::new(),
            in_sep: true,
        };
        result.add_states(&mut states);
        result
    }

    /// Repositions the cursor at `states`, discarding the current position.
    ///
    /// NOTE(review): the separator flag is not touched here, so a cursor that
    /// has left the separator stays that way after a reset — confirm that
    /// callers rely on this.
    pub fn reset(&mut self, mut states: Vec<u32>) {
        self.state_ids.clear();
        self.add_states(&mut states);
    }

    /// Consumes the character `c`, moving to every state reachable from the
    /// current states through an `Advance` transition whose set contains `c`.
    ///
    /// Returns `true` if at least one transition was taken. If none of the
    /// taken transitions is marked `is_sep` (including when no transition was
    /// taken at all), the cursor leaves the separator.
    pub fn advance(&mut self, c: char) -> bool {
        let mut result = false;
        let mut new_state_ids = Vec::new();
        let mut any_sep_transitions = false;
        for current_state_id in &self.state_ids {
            if let NfaState::Advance {
                chars,
                state_id,
                is_sep,
                ..
            } = &self.nfa.states[*current_state_id as usize]
            {
                if chars.contains(c) {
                    if *is_sep {
                        any_sep_transitions = true;
                    }
                    new_state_ids.push(*state_id);
                    result = true;
                }
            }
        }
        if !any_sep_transitions {
            self.in_sep = false;
        }
        self.state_ids.clear();
        self.add_states(&mut new_state_ids);
        result
    }

    /// Iterates over the `Advance` transitions leaving the current states as
    /// `(character set, precedence, target state id)` triples.
    pub fn successors(&self) -> impl Iterator<Item = (&CharacterSet, i32, u32)> {
        self.state_ids.iter().filter_map(move |id| {
            if let NfaState::Advance {
                chars,
                state_id,
                precedence,
                ..
            } = &self.nfa.states[*id as usize]
            {
                Some((chars, *precedence, *state_id))
            } else {
                None
            }
        })
    }

    /// Returns the outgoing transitions grouped by character set; see
    /// `group_successors`.
    pub fn grouped_successors(&self) -> Vec<(CharacterSet, i32, Vec<u32>)> {
        Self::group_successors(self.successors())
    }

    /// Regroups transitions so that characters shared by several incoming sets
    /// end up in a group of their own.
    ///
    /// Each returned entry is `(characters, precedence, target states)`. Where
    /// two incoming sets overlap, the shared characters form a separate entry
    /// that lists the targets of both and carries the larger precedence. The
    /// result is sorted by character set.
    fn group_successors<'b>(
        iter: impl Iterator<Item = (&'b CharacterSet, i32, u32)>,
    ) -> Vec<(CharacterSet, i32, Vec<u32>)> {
        let mut result: Vec<(CharacterSet, i32, Vec<u32>)> = Vec::new();
        for (chars, prec, state) in iter {
            let mut chars = chars.clone();
            let mut i = 0;
            // Once `chars` is used up no further group can overlap with it.
            while i < result.len() && !chars.is_empty() {
                let intersection = result[i].0.remove_intersection(&mut chars);
                if !intersection.is_empty() {
                    let mut states = result[i].2.clone();
                    states.push(state);
                    let group = (intersection, max(result[i].1, prec), states);
                    if result[i].0.is_empty() {
                        // The existing group was covered completely by the new
                        // set. Replace it so that no entry with an empty
                        // character set (and stale states) is left behind.
                        result[i] = group;
                    } else {
                        result.insert(i, group);
                        // Step over the remainder of the group that was split.
                        i += 1;
                    }
                }
                i += 1;
            }
            if !chars.is_empty() {
                result.push((chars, prec, vec![state]));
            }
        }
        result.sort_unstable_by(|a, b| a.0.cmp(&b.0));
        result
    }

    /// Returns `(variable_index, precedence)` of the best `Accept` state among
    /// the current states, or `None` if there is none.
    ///
    /// The highest precedence wins; ties go to the lowest `variable_index`.
    pub fn finished_id(&self) -> Option<(usize, i32)> {
        let mut result = None;
        for state_id in self.state_ids.iter() {
            if let NfaState::Accept {
                variable_index,
                precedence,
            } = self.nfa.states[*state_id as usize]
            {
                match result {
                    None => result = Some((variable_index, precedence)),
                    Some((existing_id, existing_precedence)) => {
                        if precedence > existing_precedence
                            || (precedence == existing_precedence && variable_index < existing_id)
                        {
                            result = Some((variable_index, precedence))
                        }
                    }
                }
            }
        }
        result
    }

    /// Returns whether every step taken so far used at least one `is_sep`
    /// transition.
    pub fn in_separator(&self) -> bool {
        self.in_sep
    }

    /// Adds `new_state_ids` to the cursor's current states.
    ///
    /// `Split` states are not stored: their two targets are appended to
    /// `new_state_ids` (unless already listed there) and processed in turn, so
    /// the vector passed in is used as a work list and grows. Every other
    /// state is inserted into `state_ids` in sorted order, without duplicates.
    pub fn add_states(&mut self, new_state_ids: &mut Vec<u32>) {
        let mut i = 0;
        while i < new_state_ids.len() {
            let state_id = new_state_ids[i];
            let state = &self.nfa.states[state_id as usize];
            if let NfaState::Split(left, right) = state {
                let mut has_left = false;
                let mut has_right = false;
                for new_state_id in new_state_ids.iter() {
                    if *new_state_id == *left {
                        has_left = true;
                    }
                    if *new_state_id == *right {
                        has_right = true;
                    }
                }
                if !has_left {
                    new_state_ids.push(*left);
                }
                if !has_right {
                    new_state_ids.push(*right);
                }
            } else if let Err(insert_at) = self.state_ids.binary_search(&state_id) {
                self.state_ids.insert(insert_at, state_id);
            }
            i += 1;
        }
    }
}
// Unit tests for transition grouping and for `CharacterSet::remove_intersection`.
#[cfg(test)]
mod tests {
use super::*;
// Table-driven: each row pairs the incoming `(set, precedence, state)` transitions
// with the grouped result that `group_successors` must produce for them.
#[test]
fn test_group_successors() {
let table = [
(
vec![
(CharacterSet::empty().add_range('a', 'f'), 0, 1),
(CharacterSet::empty().add_range('d', 'i'), 1, 2),
],
vec![
(CharacterSet::empty().add_range('a', 'c'), 0, vec![1]),
(CharacterSet::empty().add_range('d', 'f'), 1, vec![1, 2]),
(CharacterSet::empty().add_range('g', 'i'), 1, vec![2]),
],
),
(
vec![
(CharacterSet::empty().add_range('a', 'z'), 0, 1),
(CharacterSet::empty().add_char('d'), 0, 2),
(CharacterSet::empty().add_char('i'), 0, 3),
(CharacterSet::empty().add_char('f'), 0, 4),
],
vec![
(
CharacterSet::empty()
.add_range('a', 'c')
.add_char('e')
.add_range('g', 'h')
.add_range('j', 'z'),
0,
vec![1],
),
(CharacterSet::empty().add_char('d'), 0, vec![1, 2]),
(CharacterSet::empty().add_char('f'), 0, vec![1, 4]),
(CharacterSet::empty().add_char('i'), 0, vec![1, 3]),
],
),
];
for row in table.iter() {
assert_eq!(
NfaCursor::group_successors(row.0.iter().map(|(c, p, s)| (c, *p, *s))),
row.1
);
}
}
// Checks, for each pairing of included/negated sets and in both call orders, the
// returned intersection and what is left in the two operands afterwards.
#[test]
fn test_character_set_intersection() {
// whitelist - whitelist
// both sets contain 'c', 'd', 'e', and 'f'
let mut a = CharacterSet::empty().add_range('a', 'f');
let mut b = CharacterSet::empty().add_range('c', 'h');
assert_eq!(
a.remove_intersection(&mut b),
CharacterSet::empty().add_range('c', 'f')
);
assert_eq!(a, CharacterSet::empty().add_range('a', 'b'));
assert_eq!(b, CharacterSet::empty().add_range('g', 'h'));
let mut a = CharacterSet::empty().add_range('a', 'f');
let mut b = CharacterSet::empty().add_range('c', 'h');
assert_eq!(
b.remove_intersection(&mut a),
CharacterSet::empty().add_range('c', 'f')
);
assert_eq!(a, CharacterSet::empty().add_range('a', 'b'));
assert_eq!(b, CharacterSet::empty().add_range('g', 'h'));
// whitelist - blacklist
// both sets contain 'e', 'f', and 'm'
let mut a = CharacterSet::empty()
.add_range('c', 'h')
.add_range('k', 'm');
let mut b = CharacterSet::empty()
.add_range('a', 'd')
.add_range('g', 'l')
.negate();
assert_eq!(
a.remove_intersection(&mut b),
CharacterSet::Include(vec!['e', 'f', 'm'])
);
assert_eq!(a, CharacterSet::Include(vec!['c', 'd', 'g', 'h', 'k', 'l']));
assert_eq!(b, CharacterSet::empty().add_range('a', 'm').negate());
let mut a = CharacterSet::empty()
.add_range('c', 'h')
.add_range('k', 'm');
let mut b = CharacterSet::empty()
.add_range('a', 'd')
.add_range('g', 'l')
.negate();
assert_eq!(
b.remove_intersection(&mut a),
CharacterSet::Include(vec!['e', 'f', 'm'])
);
assert_eq!(a, CharacterSet::Include(vec!['c', 'd', 'g', 'h', 'k', 'l']));
assert_eq!(b, CharacterSet::empty().add_range('a', 'm').negate());
// blacklist - blacklist
// both sets exclude 'c', 'd', and 'e'
// NOTE(review): the expected return value below excludes only the characters
// excluded by both sets, so it overlaps the remainders left in `a` and `b`;
// confirm whether the union of both exclusion lists ('a' through 'h') was intended.
let mut a = CharacterSet::empty().add_range('a', 'e').negate();
let mut b = CharacterSet::empty().add_range('c', 'h').negate();
assert_eq!(
a.remove_intersection(&mut b),
CharacterSet::Exclude(vec!['c', 'd', 'e'])
);
assert_eq!(a, CharacterSet::Include(vec!['f', 'g', 'h']));
assert_eq!(b, CharacterSet::Include(vec!['a', 'b']));
}
}