use crate::build_tables::item::LookaheadSet; use crate::grammars::LexicalGrammar; use crate::nfa::{CharacterSet, NfaCursor}; use std::collections::HashSet; use std::fmt; #[derive(Clone, Debug, Default)] struct TokenConflictStatus { does_overlap: bool, does_match_valid_continuation: bool, does_match_separators: bool, matches_same_string: bool, } pub(crate) struct TokenConflictMap { n: usize, status_matrix: Vec, starting_chars_by_index: Vec, following_chars_by_index: Vec, } impl TokenConflictMap { pub fn new(grammar: &LexicalGrammar, following_tokens: Vec) -> Self { let mut cursor = NfaCursor::new(&grammar.nfa, Vec::new()); let starting_chars = get_starting_chars(&mut cursor, grammar); let following_chars = get_following_chars(&starting_chars, following_tokens); let n = grammar.variables.len(); let mut status_matrix = vec![TokenConflictStatus::default(); n * n]; for i in 0..grammar.variables.len() { for j in 0..i { let status = compute_conflict_status(&mut cursor, grammar, &following_chars, i, j); status_matrix[matrix_index(n, i, j)] = status.0; status_matrix[matrix_index(n, j, i)] = status.1; } } TokenConflictMap { n, status_matrix, starting_chars_by_index: starting_chars, following_chars_by_index: following_chars, } } pub fn does_match_same_string(&self, i: usize, j: usize) -> bool { self.status_matrix[matrix_index(self.n, i, j)].matches_same_string } pub fn does_conflict(&self, i: usize, j: usize) -> bool { let entry = &self.status_matrix[matrix_index(self.n, i, j)]; entry.does_match_valid_continuation || entry.does_match_separators } pub fn does_overlap(&self, i: usize, j: usize) -> bool { self.status_matrix[matrix_index(self.n, i, j)].does_overlap } } impl fmt::Debug for TokenConflictMap { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "TokenConflictMap {{\n")?; write!(f, " starting_characters: {{\n")?; for i in 0..self.n { write!(f, " {}: {:?},\n", i, self.starting_chars_by_index[i])?; } write!(f, " }},\n")?; write!(f, " following_characters: {{\n")?; for i in 0..self.n { write!(f, " {}: {:?},\n", i, self.following_chars_by_index[i])?; } write!(f, " }},\n")?; write!(f, " status_matrix: {{\n")?; for i in 0..self.n { write!(f, " {}: {{\n", i)?; for j in 0..self.n { write!( f, " {}: {:?},\n", j, self.status_matrix[matrix_index(self.n, i, j)] )?; } write!(f, " }},\n")?; } write!(f, " }},")?; write!(f, "}}")?; Ok(()) } } fn matrix_index(variable_count: usize, i: usize, j: usize) -> usize { variable_count * i + j } fn get_starting_chars(cursor: &mut NfaCursor, grammar: &LexicalGrammar) -> Vec { let mut result = Vec::with_capacity(grammar.variables.len()); for variable in &grammar.variables { cursor.reset(vec![variable.start_state]); let mut all_chars = CharacterSet::empty(); for (chars, _, _) in cursor.successors() { all_chars = all_chars.add(chars); } result.push(all_chars); } result } fn get_following_chars( starting_chars: &Vec, following_tokens: Vec, ) -> Vec { following_tokens .into_iter() .map(|following_tokens| { let mut chars = CharacterSet::empty(); for token in following_tokens.iter() { if token.is_terminal() { chars = chars.add(&starting_chars[token.index]); } } chars }) .collect() } fn compute_conflict_status( cursor: &mut NfaCursor, grammar: &LexicalGrammar, following_chars: &Vec, i: usize, j: usize, ) -> (TokenConflictStatus, TokenConflictStatus) { let mut visited_state_sets = HashSet::new(); let mut state_set_queue = vec![vec![ grammar.variables[i].start_state, grammar.variables[j].start_state, ]]; let mut result = ( TokenConflictStatus::default(), TokenConflictStatus::default(), ); while let Some(state_set) = state_set_queue.pop() { // Don't pursue states where there's no potential for conflict. if variable_ids_for_states(&state_set, grammar).count() > 1 { cursor.reset(state_set); } else { continue; } let mut completion = None; for (id, precedence) in cursor.completions() { if let Some((prev_id, prev_precedence)) = completion { if id == prev_id { continue; } // Prefer tokens with higher precedence. For tokens with equal precedence, // prefer those listed earlier in the grammar. let winning_id; if prefer_token(grammar, (prev_precedence, prev_id), (precedence, id)) { winning_id = prev_id; } else { winning_id = id; completion = Some((id, precedence)); } if winning_id == i { result.0.matches_same_string = true; result.0.does_overlap = true; } else { result.1.matches_same_string = true; result.1.does_overlap = true; } } else { completion = Some((id, precedence)); } } for (chars, advance_precedence, next_states) in cursor.grouped_successors() { let mut can_advance = true; if let Some((completed_id, completed_precedence)) = completion { let mut other_id = None; let mut successor_contains_completed_id = false; for variable_id in variable_ids_for_states(&next_states, grammar) { if variable_id == completed_id { successor_contains_completed_id = true; break; } else { other_id = Some(variable_id); } } if let (Some(other_id), false) = (other_id, successor_contains_completed_id) { let winning_id; if advance_precedence < completed_precedence { winning_id = completed_id; can_advance = false; } else { winning_id = other_id; } if winning_id == i { result.0.does_overlap = true; if chars.does_intersect(&following_chars[j]) { result.0.does_match_valid_continuation = true; } if cursor.in_separator() { result.0.does_match_separators = true; } } else { result.1.does_overlap = true; if chars.does_intersect(&following_chars[i]) { result.1.does_match_valid_continuation = true; } else { result.1.does_match_separators = true; } } } } if can_advance && visited_state_sets.insert(next_states.clone()) { state_set_queue.push(next_states); } } } result } fn prefer_token(grammar: &LexicalGrammar, left: (i32, usize), right: (i32, usize)) -> bool { if left.0 > right.0 { return true; } else if left.0 < right.0 { return false; } match ( grammar.variables[left.1].is_string, grammar.variables[right.1].is_string, ) { (true, false) => return true, (false, true) => return false, _ => {} } left.0 < right.0 } fn variable_ids_for_states<'a>( state_ids: &'a Vec, grammar: &'a LexicalGrammar, ) -> impl Iterator + 'a { let mut prev = None; state_ids.iter().filter_map(move |state_id| { let variable_id = grammar.variable_index_for_nfa_state(*state_id); if prev != Some(variable_id) { prev = Some(variable_id); prev } else { None } }) } #[cfg(test)] mod tests { use super::*; use crate::grammars::{Variable, VariableType}; use crate::prepare_grammar::{expand_tokens, ExtractedLexicalGrammar}; use crate::rules::{Rule, Symbol}; #[test] fn test_starting_characters() { let grammar = expand_tokens(ExtractedLexicalGrammar { separators: Vec::new(), variables: vec![ Variable { name: "token_0".to_string(), kind: VariableType::Named, rule: Rule::pattern("[a-f]1|0x\\d"), }, Variable { name: "token_1".to_string(), kind: VariableType::Named, rule: Rule::pattern("d*ef"), }, ], }) .unwrap(); let token_map = TokenConflictMap::new(&grammar, Vec::new()); assert_eq!( token_map.starting_chars_by_index[0], CharacterSet::empty().add_range('a', 'f').add_char('0') ); assert_eq!( token_map.starting_chars_by_index[1], CharacterSet::empty().add_range('d', 'e') ); } #[test] fn test_token_conflicts() { let grammar = expand_tokens(ExtractedLexicalGrammar { separators: Vec::new(), variables: vec![ Variable { name: "in".to_string(), kind: VariableType::Named, rule: Rule::string("in"), }, Variable { name: "identifier".to_string(), kind: VariableType::Named, rule: Rule::pattern("\\w+"), }, Variable { name: "instanceof".to_string(), kind: VariableType::Named, rule: Rule::string("instanceof"), }, ], }) .unwrap(); let var = |name| index_of_var(&grammar, name); let token_map = TokenConflictMap::new( &grammar, vec![ LookaheadSet::with([Symbol::terminal(var("identifier"))].iter().cloned()), LookaheadSet::with([Symbol::terminal(var("in"))].iter().cloned()), LookaheadSet::with([Symbol::terminal(var("identifier"))].iter().cloned()), ], ); // Given the string "in", the `in` token is preferred over the `identifier` token assert!(token_map.does_match_same_string(var("in"), var("identifier"))); assert!(!token_map.does_match_same_string(var("identifier"), var("in"))); // Depending on what character follows, the string "in" may be treated as part of an // `identifier` token. assert!(token_map.does_conflict(var("identifier"), var("in"))); // Depending on what character follows, the string "instanceof" may be treated as part of // an `identifier` token. assert!(token_map.does_conflict(var("identifier"), var("instanceof"))); assert!(token_map.does_conflict(var("instanceof"), var("in"))); } fn index_of_var(grammar: &LexicalGrammar, name: &str) -> usize { grammar .variables .iter() .position(|v| v.name == name) .unwrap() } }