use std::{cmp::Ordering, collections::HashSet, fmt}; use crate::{ build_tables::item::TokenSetDisplay, grammars::{LexicalGrammar, SyntaxGrammar}, nfa::{CharacterSet, NfaCursor, NfaTransition}, rules::TokenSet, }; #[derive(Clone, Debug, Default, PartialEq, Eq)] struct TokenConflictStatus { matches_prefix: bool, does_match_continuation: bool, does_match_valid_continuation: bool, does_match_separators: bool, matches_same_string: bool, matches_different_string: bool, } pub struct TokenConflictMap<'a> { n: usize, status_matrix: Vec, following_tokens: Vec, starting_chars_by_index: Vec, following_chars_by_index: Vec, grammar: &'a LexicalGrammar, } impl<'a> TokenConflictMap<'a> { /// Create a token conflict map based on a lexical grammar, which describes the structure /// each token, and a `following_token` map, which indicates which tokens may be appear /// immediately after each other token. /// /// This analyzes the possible kinds of overlap between each pair of tokens and stores /// them in a matrix. pub fn new(grammar: &'a LexicalGrammar, following_tokens: Vec) -> Self { let mut cursor = NfaCursor::new(&grammar.nfa, Vec::new()); let starting_chars = get_starting_chars(&mut cursor, grammar); let following_chars = get_following_chars(&starting_chars, &following_tokens); let n = grammar.variables.len(); let mut status_matrix = vec![TokenConflictStatus::default(); n * n]; for i in 0..grammar.variables.len() { for j in 0..i { let status = compute_conflict_status(&mut cursor, grammar, &following_chars, i, j); status_matrix[matrix_index(n, i, j)] = status.0; status_matrix[matrix_index(n, j, i)] = status.1; } } TokenConflictMap { n, status_matrix, following_tokens, starting_chars_by_index: starting_chars, following_chars_by_index: following_chars, grammar, } } /// Does token `i` match any strings that token `j` also matches, such that token `i` /// is preferred over token `j`? pub fn has_same_conflict_status(&self, a: usize, b: usize, other: usize) -> bool { let left = &self.status_matrix[matrix_index(self.n, a, other)]; let right = &self.status_matrix[matrix_index(self.n, b, other)]; left == right } /// Does token `i` match any strings that token `j` does *not* match? pub fn does_match_different_string(&self, i: usize, j: usize) -> bool { self.status_matrix[matrix_index(self.n, i, j)].matches_different_string } /// Does token `i` match any strings that token `j` also matches, where /// token `i` is preferred over token `j`? pub fn does_match_same_string(&self, i: usize, j: usize) -> bool { self.status_matrix[matrix_index(self.n, i, j)].matches_same_string } pub fn does_conflict(&self, i: usize, j: usize) -> bool { let entry = &self.status_matrix[matrix_index(self.n, i, j)]; entry.does_match_valid_continuation || entry.does_match_separators || entry.matches_same_string } /// Does token `i` match any strings that are *prefixes* of strings matched by `j`? pub fn does_match_prefix(&self, i: usize, j: usize) -> bool { self.status_matrix[matrix_index(self.n, i, j)].matches_prefix } pub fn does_match_shorter_or_longer(&self, i: usize, j: usize) -> bool { let entry = &self.status_matrix[matrix_index(self.n, i, j)]; let reverse_entry = &self.status_matrix[matrix_index(self.n, j, i)]; (entry.does_match_valid_continuation || entry.does_match_separators) && !reverse_entry.does_match_separators } pub fn does_overlap(&self, i: usize, j: usize) -> bool { let status = &self.status_matrix[matrix_index(self.n, i, j)]; status.does_match_separators || status.matches_prefix || status.matches_same_string || status.does_match_continuation } pub fn prefer_token(grammar: &LexicalGrammar, left: (i32, usize), right: (i32, usize)) -> bool { match left.0.cmp(&right.0) { Ordering::Less => false, Ordering::Greater => true, Ordering::Equal => match grammar.variables[left.1] .implicit_precedence .cmp(&grammar.variables[right.1].implicit_precedence) { Ordering::Less => false, Ordering::Greater => true, Ordering::Equal => left.1 < right.1, }, } } pub fn prefer_transition( grammar: &LexicalGrammar, t: &NfaTransition, completed_id: usize, completed_precedence: i32, has_separator_transitions: bool, ) -> bool { if t.precedence < completed_precedence { return false; } if t.precedence == completed_precedence { if t.is_separator { return false; } if has_separator_transitions && !grammar .variable_indices_for_nfa_states(&t.states) .any(|i| i == completed_id) { return false; } } true } } impl fmt::Debug for TokenConflictMap<'_> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { writeln!(f, "TokenConflictMap {{")?; let syntax_grammar = SyntaxGrammar::default(); writeln!(f, " following_tokens: {{")?; for (i, following_tokens) in self.following_tokens.iter().enumerate() { writeln!( f, " follow({:?}): {},", self.grammar.variables[i].name, TokenSetDisplay(following_tokens, &syntax_grammar, self.grammar) )?; } writeln!(f, " }},")?; writeln!(f, " starting_characters: {{")?; for i in 0..self.n { writeln!( f, " {:?}: {:?},", self.grammar.variables[i].name, self.starting_chars_by_index[i] )?; } writeln!(f, " }},")?; writeln!(f, " following_characters: {{")?; for i in 0..self.n { writeln!( f, " {:?}: {:?},", self.grammar.variables[i].name, self.following_chars_by_index[i] )?; } writeln!(f, " }},")?; writeln!(f, " status_matrix: {{")?; for i in 0..self.n { writeln!(f, " {:?}: {{", self.grammar.variables[i].name)?; for j in 0..self.n { writeln!( f, " {:?}: {:?},", self.grammar.variables[j].name, self.status_matrix[matrix_index(self.n, i, j)] )?; } writeln!(f, " }},")?; } write!(f, " }},")?; write!(f, "}}")?; Ok(()) } } const fn matrix_index(variable_count: usize, i: usize, j: usize) -> usize { variable_count * i + j } fn get_starting_chars(cursor: &mut NfaCursor, grammar: &LexicalGrammar) -> Vec { let mut result = Vec::with_capacity(grammar.variables.len()); for variable in &grammar.variables { cursor.reset(vec![variable.start_state]); let mut all_chars = CharacterSet::empty(); for (chars, _) in cursor.transition_chars() { all_chars = all_chars.add(chars); } result.push(all_chars); } result } fn get_following_chars( starting_chars: &[CharacterSet], following_tokens: &[TokenSet], ) -> Vec { following_tokens .iter() .map(|following_tokens| { let mut chars = CharacterSet::empty(); for token in following_tokens.iter() { if token.is_terminal() { chars = chars.add(&starting_chars[token.index]); } } chars }) .collect() } fn compute_conflict_status( cursor: &mut NfaCursor, grammar: &LexicalGrammar, following_chars: &[CharacterSet], i: usize, j: usize, ) -> (TokenConflictStatus, TokenConflictStatus) { let mut visited_state_sets = HashSet::new(); let mut state_set_queue = vec![vec![ grammar.variables[i].start_state, grammar.variables[j].start_state, ]]; let mut result = ( TokenConflictStatus::default(), TokenConflictStatus::default(), ); while let Some(state_set) = state_set_queue.pop() { let mut live_variable_indices = grammar.variable_indices_for_nfa_states(&state_set); // If only one of the two tokens could possibly match from this state, then // there is no reason to analyze any of its successors. Just record the fact // that the token matches a string that the other token does not match. let first_live_variable_index = live_variable_indices.next().unwrap(); if live_variable_indices.count() == 0 { if first_live_variable_index == i { result.0.matches_different_string = true; } else { result.1.matches_different_string = true; } continue; } // Don't pursue states where there's no potential for conflict. cursor.reset(state_set); let within_separator = cursor.transition_chars().any(|(_, sep)| sep); // Examine each possible completed token in this state. let mut completion = None; for (id, precedence) in cursor.completions() { if within_separator { if id == i { result.0.does_match_separators = true; } else { result.1.does_match_separators = true; } } // If the other token has already completed, then this is // a same-string conflict. if let Some((prev_id, prev_precedence)) = completion { if id == prev_id { continue; } // Determine which of the two tokens is preferred. let preferred_id; if TokenConflictMap::prefer_token( grammar, (prev_precedence, prev_id), (precedence, id), ) { preferred_id = prev_id; } else { preferred_id = id; completion = Some((id, precedence)); } if preferred_id == i { result.0.matches_same_string = true; } else { result.1.matches_same_string = true; } } else { completion = Some((id, precedence)); } } // Examine each possible transition from this state to detect substring conflicts. for transition in cursor.transitions() { let mut can_advance = true; // If there is already a completed token in this state, then determine // if the next state can also match the completed token. If so, then // this is *not* a conflict. if let Some((completed_id, completed_precedence)) = completion { let mut advanced_id = None; let mut successor_contains_completed_id = false; for variable_id in grammar.variable_indices_for_nfa_states(&transition.states) { if variable_id == completed_id { successor_contains_completed_id = true; break; } advanced_id = Some(variable_id); } // Determine which action is preferred: matching the already complete // token, or continuing on to try and match the other longer token. if let (Some(advanced_id), false) = (advanced_id, successor_contains_completed_id) { if TokenConflictMap::prefer_transition( grammar, &transition, completed_id, completed_precedence, within_separator, ) { can_advance = true; if advanced_id == i { result.0.does_match_continuation = true; if transition.characters.does_intersect(&following_chars[j]) { result.0.does_match_valid_continuation = true; } } else { result.1.does_match_continuation = true; if transition.characters.does_intersect(&following_chars[i]) { result.1.does_match_valid_continuation = true; } } } else if completed_id == i { result.0.matches_prefix = true; } else { result.1.matches_prefix = true; } } } if can_advance && visited_state_sets.insert(transition.states.clone()) { state_set_queue.push(transition.states); } } } result } #[cfg(test)] mod tests { use super::*; use crate::{ grammars::{Variable, VariableType}, prepare_grammar::{expand_tokens, ExtractedLexicalGrammar}, rules::{Precedence, Rule, Symbol}, }; #[test] fn test_starting_characters() { let grammar = expand_tokens(ExtractedLexicalGrammar { separators: Vec::new(), variables: vec![ Variable { name: "token_0".to_string(), kind: VariableType::Named, rule: Rule::pattern("[a-f]1|0x\\d", ""), }, Variable { name: "token_1".to_string(), kind: VariableType::Named, rule: Rule::pattern("d*ef", ""), }, ], }) .unwrap(); let token_map = TokenConflictMap::new(&grammar, Vec::new()); assert_eq!( token_map.starting_chars_by_index[0], CharacterSet::empty().add_range('a', 'f').add_char('0') ); assert_eq!( token_map.starting_chars_by_index[1], CharacterSet::empty().add_range('d', 'e') ); } #[test] fn test_token_conflicts() { let grammar = expand_tokens(ExtractedLexicalGrammar { separators: Vec::new(), variables: vec![ Variable { name: "in".to_string(), kind: VariableType::Named, rule: Rule::string("in"), }, Variable { name: "identifier".to_string(), kind: VariableType::Named, rule: Rule::pattern("\\w+", ""), }, Variable { name: "instanceof".to_string(), kind: VariableType::Named, rule: Rule::string("instanceof"), }, ], }) .unwrap(); let var = |name| index_of_var(&grammar, name); let token_map = TokenConflictMap::new( &grammar, vec![ std::iter::once(&Symbol::terminal(var("identifier"))) .copied() .collect(), std::iter::once(&Symbol::terminal(var("in"))) .copied() .collect(), std::iter::once(&Symbol::terminal(var("identifier"))) .copied() .collect(), ], ); // Given the string "in", the `in` token is preferred over the `identifier` token assert!(token_map.does_match_same_string(var("in"), var("identifier"))); assert!(!token_map.does_match_same_string(var("identifier"), var("in"))); // Depending on what character follows, the string "in" may be treated as part of an // `identifier` token. assert!(token_map.does_conflict(var("identifier"), var("in"))); // Depending on what character follows, the string "instanceof" may be treated as part of // an `identifier` token. assert!(token_map.does_conflict(var("identifier"), var("instanceof"))); assert!(token_map.does_conflict(var("instanceof"), var("in"))); } #[test] fn test_token_conflicts_with_separators() { let grammar = expand_tokens(ExtractedLexicalGrammar { separators: vec![Rule::pattern("\\s", "")], variables: vec![ Variable { name: "x".to_string(), kind: VariableType::Named, rule: Rule::string("x"), }, Variable { name: "newline".to_string(), kind: VariableType::Named, rule: Rule::string("\n"), }, ], }) .unwrap(); let var = |name| index_of_var(&grammar, name); let token_map = TokenConflictMap::new(&grammar, vec![TokenSet::new(); 4]); assert!(token_map.does_conflict(var("newline"), var("x"))); assert!(!token_map.does_conflict(var("x"), var("newline"))); } #[test] fn test_token_conflicts_with_open_ended_tokens() { let grammar = expand_tokens(ExtractedLexicalGrammar { separators: vec![Rule::pattern("\\s", "")], variables: vec![ Variable { name: "x".to_string(), kind: VariableType::Named, rule: Rule::string("x"), }, Variable { name: "anything".to_string(), kind: VariableType::Named, rule: Rule::prec(Precedence::Integer(-1), Rule::pattern(".*", "")), }, ], }) .unwrap(); let var = |name| index_of_var(&grammar, name); let token_map = TokenConflictMap::new(&grammar, vec![TokenSet::new(); 4]); assert!(token_map.does_match_shorter_or_longer(var("anything"), var("x"))); assert!(!token_map.does_match_shorter_or_longer(var("x"), var("anything"))); } fn index_of_var(grammar: &LexicalGrammar, name: &str) -> usize { grammar .variables .iter() .position(|v| v.name == name) .unwrap() } }