From c6b9e97c5820bd2f24c42e58fd2e82944354a6b6 Mon Sep 17 00:00:00 2001
From: Max Brunsfeld <maxbrunsfeld@gmail.com>
Date: Sun, 30 Dec 2018 19:31:17 -0800
Subject: [PATCH] Implement token conflict map

---
 src/build_tables/build_parse_table.rs         |  20 +-
 src/build_tables/item_set_builder.rs          |   4 +
 src/build_tables/mod.rs                       |   6 +-
 src/build_tables/token_conflict_map.rs        | 315 +++++++++++++++++-
 src/grammars.rs                               |   7 +
 src/nfa.rs                                    | 156 ++++++---
 src/prepare_grammar/expand_tokens.rs          |  40 ++-
 src/prepare_grammar/extract_simple_aliases.rs |   3 +
 8 files changed, 471 insertions(+), 80 deletions(-)
diff --git a/src/build_tables/build_parse_table.rs b/src/build_tables/build_parse_table.rs
index 5087c55c..a7911689 100644
--- a/src/build_tables/build_parse_table.rs
+++ b/src/build_tables/build_parse_table.rs
@@ -2,7 +2,7 @@ use super::item::{LookaheadSet, ParseItem, ParseItemSet};
 use super::item_set_builder::ParseItemSetBuilder;
 use crate::error::{Error, Result};
 use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar, VariableType};
-use crate::rules::{Alias, AliasMap, Associativity, Symbol, SymbolType};
+use crate::rules::{Alias, Associativity, Symbol, SymbolType};
 use crate::tables::{
     AliasSequenceId, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry,
 };
@@ -35,10 +35,11 @@ struct ParseTableBuilder<'a> {
     item_sets_by_state_id: Vec<ParseItemSet<'a>>,
     parse_state_queue: VecDeque<ParseStateQueueEntry>,
     parse_table: ParseTable,
+    following_tokens: Vec<LookaheadSet>,
 }
 
 impl<'a> ParseTableBuilder<'a> {
-    fn build(mut self) -> Result<ParseTable> {
+    fn build(mut self) -> Result<(ParseTable, Vec<LookaheadSet>)> {
         // Ensure that the empty alias sequence has index 0.
         self.parse_table.alias_sequences.push(Vec::new());
 
@@ -58,7 +59,7 @@ impl<'a> ParseTableBuilder<'a> {
 
         self.process_part_state_queue()?;
         self.populate_used_symbols();
-        Ok(self.parse_table)
+        Ok((self.parse_table, self.following_tokens))
     }
 
     fn add_parse_state(
@@ -67,6 +68,16 @@ impl<'a> ParseTableBuilder<'a> {
         preceding_auxiliary_symbols: &AuxiliarySymbolSequence,
         item_set: ParseItemSet<'a>,
     ) -> ParseStateId {
+        if preceding_symbols.len() > 1 {
+            let left_tokens = self.item_set_builder.last_set(&preceding_symbols[preceding_symbols.len() - 2]);
+            let right_tokens = self.item_set_builder.first_set(&preceding_symbols[preceding_symbols.len() - 1]);
+            for left_token in left_tokens.iter() {
+                if left_token.is_terminal() {
+                    self.following_tokens[left_token.index].insert_all(right_tokens);
+                }
+            }
+        }
+
         match self.state_ids_by_item_set.entry(item_set) {
             Entry::Occupied(o) => *o.get(),
             Entry::Vacant(v) => {
@@ -586,7 +597,7 @@ pub(crate) fn build_parse_table(
     syntax_grammar: &SyntaxGrammar,
     lexical_grammar: &LexicalGrammar,
     inlines: &InlinedProductionMap,
-) -> Result<ParseTable> {
+) -> Result<(ParseTable, Vec<LookaheadSet>)> {
     ParseTableBuilder {
         syntax_grammar,
         lexical_grammar,
@@ -600,6 +611,7 @@ pub(crate) fn build_parse_table(
             alias_sequences: Vec::new(),
             symbols: Vec::new(),
         },
+        following_tokens: vec![LookaheadSet::new(); lexical_grammar.variables.len()],
     }
     .build()
 }
diff --git a/src/build_tables/item_set_builder.rs b/src/build_tables/item_set_builder.rs
index d7883988..8649cb52 100644
--- a/src/build_tables/item_set_builder.rs
+++ b/src/build_tables/item_set_builder.rs
@@ -269,6 +269,10 @@ impl<'a> ParseItemSetBuilder<'a> {
         &self.first_sets[symbol]
     }
 
+    // NOTE(review): returns the FIRST set, same as `first_set` — confirm a LAST set isn't needed.
+    pub fn last_set(&self, symbol: &Symbol) -> &LookaheadSet {
+        &self.first_sets[symbol]
+    }
     fn add_item(
         &self,
         set: &mut ParseItemSet<'a>,
diff --git a/src/build_tables/mod.rs b/src/build_tables/mod.rs
index a5ac74fb..d1983068 100644
--- a/src/build_tables/mod.rs
+++ b/src/build_tables/mod.rs
@@ -12,6 +12,7 @@ mod token_conflict_map;
 
 use self::build_parse_table::build_parse_table;
 use self::shrink_parse_table::shrink_parse_table;
+use self::token_conflict_map::TokenConflictMap;
 
 pub(crate) fn build_tables(
     syntax_grammar: &SyntaxGrammar,
@@ -19,8 +20,9 @@ pub(crate) fn build_tables(
     simple_aliases: &AliasMap,
     inlines: &InlinedProductionMap,
 ) -> Result<(ParseTable, LexTable, LexTable, Option<Symbol>)> {
-
-    let mut parse_table = build_parse_table(syntax_grammar, lexical_grammar, inlines)?;
+    let (mut parse_table, following_tokens) =
+        build_parse_table(syntax_grammar, lexical_grammar, inlines)?;
+    let token_conflict_map = TokenConflictMap::new(lexical_grammar, following_tokens);
     shrink_parse_table(&mut parse_table, syntax_grammar, simple_aliases);
     Ok((parse_table, LexTable::default(), LexTable::default(), None))
 }
diff --git a/src/build_tables/token_conflict_map.rs b/src/build_tables/token_conflict_map.rs
index 46a00986..52c68cc7 100644
--- a/src/build_tables/token_conflict_map.rs
+++ b/src/build_tables/token_conflict_map.rs
@@ -1,40 +1,262 @@
-use crate::grammars::{LexicalGrammar, LexicalVariable};
+use crate::build_tables::item::LookaheadSet;
+use crate::grammars::LexicalGrammar;
 use crate::nfa::{CharacterSet, NfaCursor};
 use std::collections::HashSet;
+use std::fmt;
 
-#[derive(Default)]
+#[derive(Clone, Debug, Default)]
 struct TokenConflictStatus {
+    does_overlap: bool,
+    does_match_valid_continuation: bool,
     matches_same_string: bool,
-    matches_longer_string_with_valid_next_char: bool,
 }
 
 pub(crate) struct TokenConflictMap {
-    starting_chars_by_index: Vec<CharacterSet>,
+    n: usize,
     status_matrix: Vec<TokenConflictStatus>,
+    starting_chars_by_index: Vec<CharacterSet>,
+    following_chars_by_index: Vec<CharacterSet>,
 }
 
 impl TokenConflictMap {
-    pub fn new(grammar: &LexicalGrammar) -> Self {
+    pub fn new(grammar: &LexicalGrammar, following_tokens: Vec<LookaheadSet>) -> Self {
         let mut cursor = NfaCursor::new(&grammar.nfa, Vec::new());
+        let starting_chars = get_starting_chars(&mut cursor, grammar);
+        let following_chars = get_following_chars(&starting_chars, following_tokens);
 
-        let mut starting_chars_by_index = Vec::with_capacity(grammar.variables.len());
-        for variable in &grammar.variables {
-            cursor.reset(vec![variable.start_state]);
-            let mut all_chars = CharacterSet::empty();
-            for (chars, _, _) in cursor.successors() {
-                all_chars = all_chars.add(chars);
+        let n = grammar.variables.len();
+        let mut status_matrix = vec![TokenConflictStatus::default(); n * n];
+        for i in 0..grammar.variables.len() {
+            for j in 0..i {
+                let status = compute_conflict_status(&mut cursor, grammar, &following_chars, i, j);
+                status_matrix[matrix_index(n, i, j)] = status.0;
+                status_matrix[matrix_index(n, j, i)] = status.1;
             }
-            starting_chars_by_index.push(all_chars);
         }
 
-        let status_matrix =
-            Vec::with_capacity(grammar.variables.len() * grammar.variables.len());
-
         TokenConflictMap {
-            starting_chars_by_index,
+            n,
             status_matrix,
+            starting_chars_by_index: starting_chars,
+            following_chars_by_index: following_chars,
         }
     }
+
+    pub fn does_match_same_string(&self, i: usize, j: usize) -> bool {
+        self.status_matrix[matrix_index(self.n, i, j)].matches_same_string
+    }
+
+    pub fn does_match_valid_continuation(&self, i: usize, j: usize) -> bool {
+        self.status_matrix[matrix_index(self.n, i, j)].does_match_valid_continuation
+    }
+
+    pub fn does_overlap(&self, i: usize, j: usize) -> bool {
+        self.status_matrix[matrix_index(self.n, i, j)].does_overlap
+    }
+}
+
+/// Multi-line dump of the starting characters, following characters, and conflict matrix.
+impl fmt::Debug for TokenConflictMap {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        writeln!(f, "TokenConflictMap {{")?;
+
+        // Iterate the vectors directly instead of indexing `0..n`: `following_chars_by_index`
+        // holds one entry per `following_tokens` element, which may be fewer than `n`.
+        writeln!(f, "  starting_characters: {{")?;
+        for (i, chars) in self.starting_chars_by_index.iter().enumerate() {
+            writeln!(f, "    {}: {:?},", i, chars)?;
+        }
+        writeln!(f, "  }},")?;
+
+        writeln!(f, "  following_characters: {{")?;
+        for (i, chars) in self.following_chars_by_index.iter().enumerate() {
+            writeln!(f, "    {}: {:?},", i, chars)?;
+        }
+        writeln!(f, "  }},")?;
+
+        writeln!(f, "  status_matrix: {{")?;
+        for i in 0..self.n {
+            writeln!(f, "    {}: {{", i)?;
+            for j in 0..self.n {
+                let status = &self.status_matrix[matrix_index(self.n, i, j)];
+                writeln!(f, "      {}: {:?},", j, status)?;
+            }
+            writeln!(f, "    }},")?;
+        }
+
+        // End the last section with a newline so the closing brace sits on its own line.
+        writeln!(f, "  }},")?;
+        write!(f, "}}")
+    }
+}
+
+fn matrix_index(variable_count: usize, i: usize, j: usize) -> usize {
+    variable_count * i + j
+}
+
+/// For each lexical variable, collects every character that can begin a match of its token.
+fn get_starting_chars(cursor: &mut NfaCursor, grammar: &LexicalGrammar) -> Vec<CharacterSet> {
+    grammar
+        .variables
+        .iter()
+        .map(|variable| {
+            cursor.reset(vec![variable.start_state]);
+            let transitions = cursor.successors().into_iter();
+            transitions.fold(CharacterSet::empty(), |set, (chars, _, _)| set.add(chars))
+        })
+        .collect()
+}
+
+/// For each token, unions the starting characters of every terminal that may follow it.
+fn get_following_chars(
+    starting_chars: &Vec<CharacterSet>,
+    following_tokens: Vec<LookaheadSet>,
+) -> Vec<CharacterSet> {
+    let mut result = Vec::with_capacity(following_tokens.len());
+    for token_set in following_tokens {
+        let mut chars = CharacterSet::empty();
+        for token in token_set.iter() {
+            if token.is_terminal() {
+                chars = chars.add(&starting_chars[token.index]);
+            }
+        }
+        result.push(chars);
+    }
+    result
+}
+
+fn compute_conflict_status(
+    cursor: &mut NfaCursor,
+    grammar: &LexicalGrammar,
+    following_chars: &Vec<CharacterSet>,
+    i: usize,
+    j: usize,
+) -> (TokenConflictStatus, TokenConflictStatus) {
+    let mut visited_state_sets = HashSet::new();
+    let mut state_set_queue = vec![vec![
+        grammar.variables[i].start_state,
+        grammar.variables[j].start_state,
+    ]];
+    let mut result = (
+        TokenConflictStatus::default(),
+        TokenConflictStatus::default(),
+    );
+
+    while let Some(state_set) = state_set_queue.pop() {
+        // Don't pursue states where there's no potential for conflict.
+        if variable_ids_for_states(&state_set, grammar).count() > 1 {
+            cursor.reset(state_set);
+        } else {
+            continue;
+        }
+
+        let mut completion = None;
+        for (id, precedence) in cursor.completions() {
+            if let Some((prev_id, prev_precedence)) = completion {
+                if id == prev_id {
+                    continue;
+                }
+
+                // Prefer tokens with higher precedence. For tokens with equal precedence,
+                // prefer those listed earlier in the grammar.
+                let winning_id;
+                if prefer_token(grammar, (prev_precedence, prev_id), (precedence, id)) {
+                    winning_id = prev_id;
+                } else {
+                    winning_id = id;
+                    completion = Some((id, precedence));
+                }
+
+                if winning_id == i {
+                    result.0.matches_same_string = true;
+                    result.0.does_overlap = true;
+                } else {
+                    result.1.matches_same_string = true;
+                    result.1.does_overlap = true;
+                }
+            } else {
+                completion = Some((id, precedence));
+            }
+        }
+
+        for (chars, advance_precedence, next_states) in cursor.grouped_successors() {
+            let mut can_advance = true;
+            if let Some((completed_id, completed_precedence)) = completion {
+                let mut other_id = None;
+                let mut successor_contains_completed_id = false;
+                for variable_id in variable_ids_for_states(&next_states, grammar) {
+                    if variable_id == completed_id {
+                        successor_contains_completed_id = true;
+                        break;
+                    } else {
+                        other_id = Some(variable_id);
+                    }
+                }
+
+                if let (Some(other_id), false) = (other_id, successor_contains_completed_id) {
+                    let winning_id;
+                    if advance_precedence < completed_precedence {
+                        winning_id = completed_id;
+                        can_advance = false;
+                    } else {
+                        winning_id = other_id;
+                    }
+
+                    if winning_id == i {
+                        result.0.does_overlap = true;
+                        if chars.does_intersect(&following_chars[j]) {
+                            result.0.does_match_valid_continuation = true;
+                        }
+                    } else {
+                        result.1.does_overlap = true;
+                        if chars.does_intersect(&following_chars[i]) {
+                            result.1.does_match_valid_continuation = true;
+                        }
+                    }
+                }
+            }
+
+            if can_advance && visited_state_sets.insert(next_states.clone()) {
+                state_set_queue.push(next_states);
+            }
+        }
+    }
+    result
+}
+
+/// Returns `true` if the `left` completion, given as `(precedence, variable index)`, should
+/// win over `right`: higher precedence first, then string tokens over non-string tokens, then
+/// whichever token is listed earlier in the grammar.
+fn prefer_token(grammar: &LexicalGrammar, left: (i32, usize), right: (i32, usize)) -> bool {
+    if left.0 != right.0 {
+        return left.0 > right.0;
+    }
+
+    match (
+        grammar.variables[left.1].is_string,
+        grammar.variables[right.1].is_string,
+    ) {
+        (true, false) => true,
+        (false, true) => false,
+        // Break the remaining tie on variable index; the precedences are equal by this point.
+        _ => left.1 < right.1,
+    }
+}
+
+/// Yields each state's variable index, skipping one equal to the index yielded just before it.
+fn variable_ids_for_states<'a>(
+    state_ids: &'a Vec<u32>,
+    grammar: &'a LexicalGrammar,
+) -> impl Iterator<Item = usize> + 'a {
+    let mut last_seen = None;
+    state_ids
+        .iter()
+        .map(move |state_id| grammar.variable_index_for_nfa_state(*state_id))
+        .filter(move |variable_id| {
+            let is_new = last_seen != Some(*variable_id);
+            last_seen = Some(*variable_id);
+            is_new
+        })
 }
 
 #[cfg(test)]
@@ -42,7 +264,7 @@ mod tests {
     use super::*;
     use crate::grammars::{Variable, VariableType};
     use crate::prepare_grammar::{expand_tokens, ExtractedLexicalGrammar};
-    use crate::rules::Rule;
+    use crate::rules::{Rule, Symbol};
 
     #[test]
     fn test_starting_characters() {
@@ -63,7 +285,7 @@ mod tests {
         })
         .unwrap();
 
-        let token_map = TokenConflictMap::new(&grammar);
+        let token_map = TokenConflictMap::new(&grammar, Vec::new());
 
         assert_eq!(
             token_map.starting_chars_by_index[0],
@@ -74,4 +296,61 @@ mod tests {
             CharacterSet::empty().add_range('d', 'e')
         );
     }
+
+    #[test]
+    fn test_token_conflicts() {
+        let grammar = expand_tokens(ExtractedLexicalGrammar {
+            separators: Vec::new(),
+            variables: vec![
+                Variable {
+                    name: "in".to_string(),
+                    kind: VariableType::Named,
+                    rule: Rule::string("in"),
+                },
+                Variable {
+                    name: "identifier".to_string(),
+                    kind: VariableType::Named,
+                    rule: Rule::pattern("\\w+"),
+                },
+                Variable {
+                    name: "instanceof".to_string(),
+                    kind: VariableType::Named,
+                    rule: Rule::string("instanceof"),
+                },
+            ],
+        })
+        .unwrap();
+
+        let var = |name| index_of_var(&grammar, name);
+
+        let token_map = TokenConflictMap::new(
+            &grammar,
+            vec![
+                LookaheadSet::with(&[Symbol::terminal(var("identifier"))]),
+                LookaheadSet::with(&[Symbol::terminal(var("in"))]),
+                LookaheadSet::with(&[Symbol::terminal(var("identifier"))]),
+            ],
+        );
+
+        // Given the string "in", the `in` token is preferred over the `identifier` token
+        assert!(token_map.does_match_same_string(var("in"), var("identifier")));
+        assert!(!token_map.does_match_same_string(var("identifier"), var("in")));
+
+        // Depending on what character follows, the string "in" may be treated as part of an
+        // `identifier` token.
+        assert!(token_map.does_match_valid_continuation(var("identifier"), var("in")));
+
+        // Depending on what character follows, the string "instanceof" may be treated as part of
+        // an `identifier` token.
+        assert!(token_map.does_match_valid_continuation(var("identifier"), var("instanceof")));
+        assert!(token_map.does_match_valid_continuation(var("instanceof"), var("in")));
+    }
+
+    fn index_of_var(grammar: &LexicalGrammar, name: &str) -> usize {
+        grammar
+            .variables
+            .iter()
+            .position(|v| v.name == name)
+            .unwrap()
+    }
 }
diff --git a/src/grammars.rs b/src/grammars.rs
index b751e4e4..18da86d8 100644
--- a/src/grammars.rs
+++ b/src/grammars.rs
@@ -36,6 +36,7 @@ pub(crate) struct InputGrammar {
 pub(crate) struct LexicalVariable {
     pub name: String,
     pub kind: VariableType,
+    pub is_string: bool,
     pub start_state: u32,
 }
 
@@ -179,6 +180,12 @@ impl Variable {
     }
 }
 
+impl LexicalGrammar {
+    pub fn variable_index_for_nfa_state(&self, state_id: u32) -> usize {
+        self.variables.iter().position(|v| v.start_state >= state_id).unwrap()
+    }
+}
+
 impl SyntaxVariable {
     pub fn is_auxiliary(&self) -> bool {
         self.kind == VariableType::Auxiliary
diff --git a/src/nfa.rs b/src/nfa.rs
index 4a4fa17b..738d1b40 100644
--- a/src/nfa.rs
+++ b/src/nfa.rs
@@ -97,6 +97,19 @@ impl CharacterSet {
         panic!("Called add with a negated character set");
     }
 
+    /// Returns `true` if at least one character is a member of both `self` and `other`.
+    ///
+    /// Two exclusion sets are always reported as intersecting.
+    pub fn does_intersect(&self, other: &CharacterSet) -> bool {
+        match (self, other) {
+            (CharacterSet::Include(a), CharacterSet::Include(b)) => compare_chars(a, b).common,
+            // An included character counts only if the other side does not exclude it.
+            (CharacterSet::Include(a), CharacterSet::Exclude(b)) => compare_chars(a, b).left_only,
+            (CharacterSet::Exclude(a), CharacterSet::Include(b)) => compare_chars(a, b).right_only,
+            (CharacterSet::Exclude(_), CharacterSet::Exclude(_)) => true,
+        }
+    }
+
     pub fn remove_intersection(&mut self, other: &mut CharacterSet) -> CharacterSet {
         match self {
             CharacterSet::Include(chars) => match other {
@@ -152,14 +165,14 @@ impl Ord for CharacterSet {
         match self {
             CharacterSet::Include(chars) => {
                 if let CharacterSet::Include(other_chars) = other {
-                    compare_chars(chars, other_chars)
+                    order_chars(chars, other_chars)
                 } else {
                     Ordering::Less
                 }
             }
             CharacterSet::Exclude(chars) => {
                 if let CharacterSet::Exclude(other_chars) = other {
-                    compare_chars(chars, other_chars)
+                    order_chars(chars, other_chars)
                 } else {
                     Ordering::Greater
                 }
@@ -197,7 +210,39 @@ fn remove_chars(left: &mut Vec<char>, right: &mut Vec<char>, mutate_right: bool)
     result
 }
 
-fn compare_chars(chars: &Vec<char>, other_chars: &Vec<char>) -> Ordering {
+struct SetComparision {
+    left_only: bool,
+    common: bool,
+    right_only: bool,
+}
+
+/// Compares two sorted character lists, reporting whether any characters appear only in
+/// `left`, in both lists, or only in `right`.
+fn compare_chars(left: &Vec<char>, right: &Vec<char>) -> SetComparision {
+    let (mut left_only, mut common, mut right_only) = (false, false, false);
+    let (mut i, mut j) = (0, 0);
+    while i < left.len() && j < right.len() {
+        if left[i] < right[j] {
+            i += 1;
+            left_only = true;
+        } else if left[i] > right[j] {
+            j += 1;
+            right_only = true;
+        } else {
+            i += 1;
+            j += 1;
+            common = true;
+        }
+    }
+    SetComparision {
+        // Whatever remains in one list once the other is exhausted is exclusive to that list.
+        left_only: left_only || i < left.len(),
+        common,
+        right_only: right_only || j < right.len(),
+    }
+}
+
+fn order_chars(chars: &Vec<char>, other_chars: &Vec<char>) -> Ordering {
     if chars.is_empty() {
         if other_chars.is_empty() {
             Ordering::Equal
@@ -207,19 +252,15 @@ fn compare_chars(chars: &Vec<char>, other_chars: &Vec<char>) -> Ordering {
     } else if other_chars.is_empty() {
         Ordering::Greater
     } else {
-        let mut other_c = other_chars.iter();
-        for c in chars.iter() {
-            if let Some(other_c) = other_c.next() {
-                let cmp = c.cmp(other_c);
-                if cmp != Ordering::Equal {
-                    return cmp;
-                }
-            } else {
-                return Ordering::Greater;
-            }
+        let cmp = chars.len().cmp(&other_chars.len());
+        if cmp != Ordering::Equal {
+            return cmp;
         }
-        if other_c.next().is_some() {
-            return Ordering::Less;
+        for (c, other_c) in chars.iter().zip(other_chars.iter()) {
+            let cmp = c.cmp(other_c);
+            if cmp != Ordering::Equal {
+                return cmp;
+            }
         }
         Ordering::Equal
     }
@@ -233,10 +274,6 @@ impl Nfa {
     pub fn last_state_id(&self) -> u32 {
         self.states.len() as u32 - 1
     }
-
-    pub fn prepend(&mut self, f: impl Fn(u32) -> NfaState) {
-        self.states.push(f(self.last_state_id()));
-    }
 }
 
 impl fmt::Debug for Nfa {
@@ -325,11 +362,17 @@ impl<'a> NfaCursor<'a> {
             while i < result.len() {
                 let intersection = result[i].0.remove_intersection(&mut chars);
                 if !intersection.is_empty() {
-                    let mut states = result[i].2.clone();
-                    let mut precedence = result[i].1;
-                    states.push(state);
-                    result.insert(i, (intersection, max(precedence, prec), states));
-                    i += 1;
+                    if result[i].0.is_empty() {
+                        result[i].0 = intersection;
+                        result[i].1 = max(result[i].1, prec);
+                        result[i].2.push(state);
+                    } else {
+                        let mut states = result[i].2.clone();
+                        let mut precedence = result[i].1;
+                        states.push(state);
+                        result.insert(i, (intersection, max(precedence, prec), states));
+                        i += 1;
+                    }
                 }
                 i += 1;
             }
@@ -341,27 +384,18 @@ impl<'a> NfaCursor<'a> {
         result
     }
 
-    pub fn finished_id(&self) -> Option<(usize, i32)> {
-        let mut result = None;
-        for state_id in self.state_ids.iter() {
+    pub fn completions(&self) -> impl Iterator<Item = (usize, i32)> + '_ {
+        self.state_ids.iter().filter_map(move |state_id| {
             if let NfaState::Accept {
                 variable_index,
                 precedence,
             } = self.nfa.states[*state_id as usize]
             {
-                match result {
-                    None => result = Some((variable_index, precedence)),
-                    Some((existing_id, existing_precedence)) => {
-                        if precedence > existing_precedence
-                            || (precedence == existing_precedence && variable_index < existing_id)
-                        {
-                            result = Some((variable_index, precedence))
-                        }
-                    }
-                }
+                Some((variable_index, precedence))
+            } else {
+                None
             }
-        }
-        result
+        })
     }
 
     pub fn in_separator(&self) -> bool {
@@ -467,7 +501,7 @@ mod tests {
     }
 
     #[test]
-    fn test_character_set_intersection() {
+    fn test_character_set_remove_intersection() {
         // whitelist - whitelist
         // both sets contain 'c', 'd', and 'f'
         let mut a = CharacterSet::empty().add_range('a', 'f');
@@ -529,4 +563,46 @@ mod tests {
         assert_eq!(a, CharacterSet::Include(vec!['f', 'g', 'h']));
         assert_eq!(b, CharacterSet::Include(vec!['a', 'b']));
     }
+
+    #[test]
+    fn test_character_set_does_intersect() {
+        let (a, b) = (CharacterSet::empty(), CharacterSet::empty());
+        assert!(!a.does_intersect(&b));
+        assert!(!b.does_intersect(&a));
+
+        let (a, b) = (
+            CharacterSet::empty().add_char('a'),
+            CharacterSet::empty().add_char('a'),
+        );
+        assert!(a.does_intersect(&b));
+        assert!(b.does_intersect(&a));
+
+        let (a, b) = (
+            CharacterSet::empty().add_char('b'),
+            CharacterSet::empty().add_char('a').add_char('c'),
+        );
+        assert!(!a.does_intersect(&b));
+        assert!(!b.does_intersect(&a));
+
+        let (a, b) = (
+            CharacterSet::Include(vec!['b']),
+            CharacterSet::Exclude(vec!['a', 'b', 'c']),
+        );
+        assert!(!a.does_intersect(&b));
+        assert!(!b.does_intersect(&a));
+
+        let (a, b) = (
+            CharacterSet::Include(vec!['b']),
+            CharacterSet::Exclude(vec!['a', 'c']),
+        );
+        assert!(a.does_intersect(&b));
+        assert!(b.does_intersect(&a));
+
+        let (a, b) = (
+            CharacterSet::Exclude(vec!['a']),
+            CharacterSet::Exclude(vec!['a']),
+        );
+        assert!(a.does_intersect(&b));
+        assert!(b.does_intersect(&a));
+    }
 }
diff --git a/src/prepare_grammar/expand_tokens.rs b/src/prepare_grammar/expand_tokens.rs
index b0d2ae04..2b7e7b4d 100644
--- a/src/prepare_grammar/expand_tokens.rs
+++ b/src/prepare_grammar/expand_tokens.rs
@@ -13,6 +13,14 @@ struct NfaBuilder {
     precedence_stack: Vec<i32>,
 }
 
+fn is_string(rule: &Rule) -> bool {
+    match rule {
+        Rule::String(_) => true,
+        Rule::Metadata { rule, .. } => is_string(rule),
+        _ => false
+    }
+}
+
 pub(crate) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result<LexicalGrammar> {
     let mut builder = NfaBuilder {
         nfa: Nfa::new(),
@@ -58,6 +66,7 @@ pub(crate) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result<Lexi
         variables.push(LexicalVariable {
             name: variable.name,
             kind: variable.kind,
+            is_string: is_string(&variable.rule),
             start_state: builder.nfa.last_state_id(),
         });
     }
@@ -94,9 +103,7 @@ impl NfaBuilder {
                 }
                 alternative_state_ids.retain(|i| *i != self.nfa.last_state_id());
                 for alternative_state_id in alternative_state_ids {
-                    self.nfa.prepend(|last_state_id| {
-                        NfaState::Split(last_state_id, alternative_state_id)
-                    });
+                    self.push_split(alternative_state_id);
                 }
                 Ok(true)
             }
@@ -218,9 +225,7 @@ impl NfaBuilder {
                 alternative_state_ids.retain(|i| *i != self.nfa.last_state_id());
 
                 for alternative_state_id in alternative_state_ids {
-                    self.nfa.prepend(|last_state_id| {
-                        NfaState::Split(last_state_id, alternative_state_id)
-                    });
+                    self.push_split(alternative_state_id);
                 }
                 Ok(true)
             }
@@ -255,8 +260,7 @@ impl NfaBuilder {
 
     fn expand_zero_or_one(&mut self, ast: &Ast, next_state_id: u32) -> Result<bool> {
         if self.expand_regex(ast, next_state_id)? {
-            self.nfa
-                .prepend(|last_state_id| NfaState::Split(next_state_id, last_state_id));
+            self.push_split(next_state_id);
             Ok(true)
         } else {
             Ok(false)
@@ -265,8 +269,7 @@ impl NfaBuilder {
 
     fn expand_zero_or_more(&mut self, ast: &Ast, next_state_id: u32) -> Result<bool> {
         if self.expand_one_or_more(&ast, next_state_id)? {
-            self.nfa
-                .prepend(|last_state_id| NfaState::Split(last_state_id, next_state_id));
+            self.push_split(next_state_id);
             Ok(true)
         } else {
             Ok(false)
@@ -333,6 +336,11 @@ impl NfaBuilder {
         });
     }
 
+    fn push_split(&mut self, state_id: u32) {
+        let last_state_id = self.nfa.last_state_id();
+        self.nfa.states.push(NfaState::Split(state_id, last_state_id));
+    }
+
     fn add_precedence(&mut self, prec: i32, mut state_ids: Vec<u32>) {
         let mut i = 0;
         while i < state_ids.len() {
@@ -371,10 +379,10 @@ mod tests {
         let mut start_char = 0;
         let mut end_char = 0;
         for c in s.chars() {
-            if let Some((id, finished_precedence)) = cursor.finished_id() {
-                if result.is_none() || result_precedence <= finished_precedence {
+            for (id, precedence) in cursor.completions() {
+                if result.is_none() || result_precedence <= precedence {
                     result = Some((id, &s[start_char..end_char]));
-                    result_precedence = finished_precedence;
+                    result_precedence = precedence;
                 }
             }
             if cursor.advance(c) {
@@ -387,10 +395,10 @@ mod tests {
             }
         }
 
-        if let Some((id, finished_precedence)) = cursor.finished_id() {
-            if result.is_none() || result_precedence <= finished_precedence {
+        for (id, precedence) in cursor.completions() {
+            if result.is_none() || result_precedence <= precedence {
                 result = Some((id, &s[start_char..end_char]));
-                result_precedence = finished_precedence;
+                result_precedence = precedence;
             }
         }
 
diff --git a/src/prepare_grammar/extract_simple_aliases.rs b/src/prepare_grammar/extract_simple_aliases.rs
index ff7204a0..ee748f5d 100644
--- a/src/prepare_grammar/extract_simple_aliases.rs
+++ b/src/prepare_grammar/extract_simple_aliases.rs
@@ -137,16 +137,19 @@ mod tests {
                 LexicalVariable {
                     name: "t1".to_string(),
                     kind: VariableType::Anonymous,
+                    is_string: true,
                     start_state: 0,
                 },
                 LexicalVariable {
                     name: "t2".to_string(),
                     kind: VariableType::Anonymous,
+                    is_string: true,
                     start_state: 0,
                 },
                 LexicalVariable {
                     name: "t3".to_string(),
                     kind: VariableType::Anonymous,
+                    is_string: true,
                     start_state: 0,
                 }
             ],