diff --git a/src/build_tables/coincident_tokens.rs b/src/build_tables/coincident_tokens.rs
index ac5931e1..62295073 100644
--- a/src/build_tables/coincident_tokens.rs
+++ b/src/build_tables/coincident_tokens.rs
@@ -1,23 +1,26 @@
 use crate::grammars::LexicalGrammar;
 use crate::rules::Symbol;
 use crate::tables::{ParseStateId, ParseTable};
+use std::fmt;
 
-pub(crate) struct CoincidentTokenIndex {
+pub(crate) struct CoincidentTokenIndex<'a> {
     entries: Vec<Vec<ParseStateId>>,
+    grammar: &'a LexicalGrammar,
     n: usize,
 }
 
-impl CoincidentTokenIndex {
-    pub fn new(table: &ParseTable, lexical_grammar: &LexicalGrammar) -> Self {
+impl<'a> CoincidentTokenIndex<'a> {
+    pub fn new(table: &ParseTable, lexical_grammar: &'a LexicalGrammar) -> Self {
         let n = lexical_grammar.variables.len();
         let mut result = Self {
             n,
+            grammar: lexical_grammar,
             entries: vec![Vec::new(); n * n],
         };
         for (i, state) in table.states.iter().enumerate() {
             for symbol in state.terminal_entries.keys() {
                 for other_symbol in state.terminal_entries.keys() {
-                    let index = result.index(*symbol, *other_symbol);
+                    let index = result.index(symbol.index, other_symbol.index);
                     if result.entries[index].last().cloned() != Some(i) {
                         result.entries[index].push(i);
                     }
@@ -28,18 +31,41 @@ impl CoincidentTokenIndex {
     }
 
     pub fn states_with(&self, a: Symbol, b: Symbol) -> &Vec<ParseStateId> {
-        &self.entries[self.index(a, b)]
+        &self.entries[self.index(a.index, b.index)]
     }
 
     pub fn contains(&self, a: Symbol, b: Symbol) -> bool {
-        !self.entries[self.index(a, b)].is_empty()
+        !self.entries[self.index(a.index, b.index)].is_empty()
     }
 
-    fn index(&self, a: Symbol, b: Symbol) -> usize {
-        if a.index < b.index {
-            a.index * self.n + b.index
+    fn index(&self, a: usize, b: usize) -> usize {
+        if a < b {
+            a * self.n + b
         } else {
-            b.index * self.n + a.index
+            b * self.n + a
         }
     }
 }
+
+impl<'a> fmt::Debug for CoincidentTokenIndex<'a> {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "CoincidentTokenIndex {{\n")?;
+
+        write!(f, "  entries: {{\n")?;
+        for i in 0..self.n {
+            write!(f, "    {}: {{\n", self.grammar.variables[i].name)?;
+            for j in 0..self.n {
+                write!(
+                    f,
+                    "      {}: {:?},\n",
+                    self.grammar.variables[j].name,
+                    self.entries[self.index(i, j)].len()
+                )?;
+            }
+            write!(f, "    }},\n")?;
+        }
+        write!(f, "  }},\n")?;
+        write!(f, "}}")?;
+        Ok(())
+    }
+}
diff --git a/src/build_tables/mod.rs b/src/build_tables/mod.rs
index 207431dd..84659600 100644
--- a/src/build_tables/mod.rs
+++ b/src/build_tables/mod.rs
@@ -47,6 +47,7 @@ pub(crate) fn build_tables(
         syntax_grammar,
         simple_aliases,
         &token_conflict_map,
+        &keywords,
     );
     let (main_lex_table, keyword_lex_table) =
         build_lex_table(&mut parse_table, syntax_grammar, lexical_grammar, &keywords);
@@ -67,15 +68,22 @@ fn populate_error_state(
 ) {
     let state = &mut parse_table.states[0];
     let n = lexical_grammar.variables.len();
+
+    // First identify the *conflict-free tokens*: tokens that do not overlap with
+    // any other token in any way.
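+    // For example, two tokens whose patterns can never match the same prefix
+    // (say, a purely numeric token and a purely alphabetic one) are
+    // conflict-free; an identifier token and a keyword it could also match
+    // are not. (Token names here are illustrative, not from any grammar.)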
     let conflict_free_tokens = LookaheadSet::with((0..n).into_iter().filter_map(|i| {
-        let conflicts_with_other_tokens = (0..n).into_iter().all(|j| {
-            j == i
-                || coincident_token_index.contains(Symbol::terminal(i), Symbol::terminal(j))
-                || !token_conflict_map.does_conflict(i, j)
+        let conflicts_with_other_tokens = (0..n).into_iter().any(|j| {
+            j != i
+                && !coincident_token_index.contains(Symbol::terminal(i), Symbol::terminal(j))
+                && token_conflict_map.does_conflict(i, j)
         });
         if conflicts_with_other_tokens {
             None
         } else {
+            info!(
+                "error recovery - token {} has no conflicts",
+                lexical_grammar.variables[i].name
+            );
             Some(Symbol::terminal(i))
         }
     }));
@@ -85,19 +93,32 @@
         actions: vec![ParseAction::Recover],
     };
 
+    // Exclude from the error-recovery state any token that conflicts with one of
+    // the *conflict-free tokens* identified above.
     for i in 0..n {
         let symbol = Symbol::terminal(i);
-        let can_be_used_for_recovery = conflict_free_tokens.contains(&symbol)
-            || conflict_free_tokens.iter().all(|t| {
-                coincident_token_index.contains(symbol, t)
-                    || !token_conflict_map.does_conflict(i, t.index)
-            });
-        if can_be_used_for_recovery {
-            state
-                .terminal_entries
-                .entry(symbol)
-                .or_insert_with(|| recover_entry.clone());
+        if !conflict_free_tokens.contains(&symbol) {
+            if syntax_grammar.word_token != Some(symbol) {
+                if let Some(t) = conflict_free_tokens.iter().find(|t| {
+                    !coincident_token_index.contains(symbol, *t)
+                        && token_conflict_map.does_conflict(symbol.index, t.index)
+                }) {
+                    info!(
+                        "error recovery - exclude token {} because of conflict with {}",
+                        lexical_grammar.variables[i].name, lexical_grammar.variables[t.index].name
+                    );
+                    continue;
+                }
+            }
         }
+        info!(
+            "error recovery - include token {}",
+            lexical_grammar.variables[i].name
+        );
+        state
+            .terminal_entries
+            .entry(symbol)
+            .or_insert_with(|| recover_entry.clone());
     }
 
     for (i, external_token) in syntax_grammar.external_tokens.iter().enumerate() {
@@ -134,7 +155,10 @@ fn identify_keywords(
         if all_chars_are_alphabetical(&cursor)
             && token_conflict_map.does_match_same_string(i, word_token.index)
         {
-            info!("Keywords - add candidate {}", lexical_grammar.variables[i].name);
+            info!(
+                "Keywords - add candidate {}",
+                lexical_grammar.variables[i].name
+            );
             Some(Symbol::terminal(i))
         } else {
             None
diff --git a/src/build_tables/shrink_parse_table.rs b/src/build_tables/shrink_parse_table.rs
index 33b72c32..64a4b259 100644
--- a/src/build_tables/shrink_parse_table.rs
+++ b/src/build_tables/shrink_parse_table.rs
@@ -1,3 +1,4 @@
+use super::item::LookaheadSet;
 use super::token_conflicts::TokenConflictMap;
 use crate::grammars::{SyntaxGrammar, VariableType};
 use crate::rules::{AliasMap, Symbol};
@@ -9,265 +10,272 @@ pub(crate) fn shrink_parse_table(
     syntax_grammar: &SyntaxGrammar,
     simple_aliases: &AliasMap,
     token_conflict_map: &TokenConflictMap,
+    keywords: &LookaheadSet,
 ) {
-    remove_unit_reductions(parse_table, syntax_grammar, simple_aliases);
-    merge_compatible_states(parse_table, syntax_grammar, token_conflict_map);
-    remove_unused_states(parse_table);
+    let mut optimizer = Optimizer {
+        parse_table,
+        syntax_grammar,
+        token_conflict_map,
+        keywords,
+        simple_aliases,
+    };
+    optimizer.remove_unit_reductions();
+    optimizer.merge_compatible_states();
+    optimizer.remove_unused_states();
 }
 
-fn remove_unit_reductions(
-    parse_table: &mut ParseTable,
-    syntax_grammar: &SyntaxGrammar,
-    simple_aliases: &AliasMap,
-) {
-    let mut aliased_symbols = HashSet::new();
-    for variable in &syntax_grammar.variables {
-        for production in &variable.productions {
-            for step in &production.steps {
-                if step.alias.is_some() {
-                    aliased_symbols.insert(step.symbol);
+struct Optimizer<'a> {
+    parse_table: &'a mut ParseTable,
+    syntax_grammar: &'a SyntaxGrammar,
+    token_conflict_map: &'a TokenConflictMap<'a>,
+    keywords: &'a LookaheadSet,
+    simple_aliases: &'a AliasMap,
+}
+
+impl<'a> Optimizer<'a> {
+    fn remove_unit_reductions(&mut self) {
+        let mut aliased_symbols = HashSet::new();
+        for variable in &self.syntax_grammar.variables {
+            for production in &variable.productions {
+                for step in &production.steps {
+                    if step.alias.is_some() {
+                        aliased_symbols.insert(step.symbol);
+                    }
                 }
             }
         }
+
+        let mut unit_reduction_symbols_by_state = HashMap::new();
+        for (i, state) in self.parse_table.states.iter().enumerate() {
+            let mut only_unit_reductions = true;
+            let mut unit_reduction_symbol = None;
+            for (_, entry) in &state.terminal_entries {
+                for action in &entry.actions {
+                    match action {
+                        ParseAction::ShiftExtra => continue,
+                        ParseAction::Reduce {
+                            child_count: 1,
+                            alias_sequence_id: 0,
+                            symbol,
+                            ..
+                        } => {
+                            if !self.simple_aliases.contains_key(&symbol)
+                                && !aliased_symbols.contains(&symbol)
+                                && self.syntax_grammar.variables[symbol.index].kind
+                                    != VariableType::Named
+                                && (unit_reduction_symbol.is_none()
+                                    || unit_reduction_symbol == Some(symbol))
+                            {
+                                unit_reduction_symbol = Some(symbol);
+                                continue;
+                            }
+                        }
+                        _ => {}
+                    }
+                    only_unit_reductions = false;
+                    break;
+                }
+
+                if !only_unit_reductions {
+                    break;
+                }
+            }
+
+            if let Some(symbol) = unit_reduction_symbol {
+                if only_unit_reductions {
+                    unit_reduction_symbols_by_state.insert(i, *symbol);
+                }
+            }
+        }
+
+        for state in self.parse_table.states.iter_mut() {
+            let mut done = false;
+            while !done {
+                done = true;
+                state.update_referenced_states(|other_state_id, state| {
+                    if let Some(symbol) = unit_reduction_symbols_by_state.get(&other_state_id) {
+                        done = false;
+                        state.nonterminal_entries[symbol]
+                    } else {
+                        other_state_id
+                    }
+                })
+            }
+        }
+    }
 
-    let mut unit_reduction_symbols_by_state = HashMap::new();
-    for (i, state) in parse_table.states.iter().enumerate() {
-        let mut only_unit_reductions = true;
-        let mut unit_reduction_symbol = None;
-        for (_, entry) in &state.terminal_entries {
-            for action in &entry.actions {
-                match action {
-                    ParseAction::ShiftExtra => continue,
-                    ParseAction::Reduce {
-                        child_count: 1,
-                        alias_sequence_id: 0,
-                        symbol,
-                        ..
-                    } => {
-                        if !simple_aliases.contains_key(&symbol)
-                            && !aliased_symbols.contains(&symbol)
-                            && syntax_grammar.variables[symbol.index].kind != VariableType::Named
-                            && (unit_reduction_symbol.is_none()
-                                || unit_reduction_symbol == Some(symbol))
-                        {
-                            unit_reduction_symbol = Some(symbol);
+    fn merge_compatible_states(&mut self) {
+        let mut state_ids_by_signature = HashMap::new();
+        for (i, state) in self.parse_table.states.iter().enumerate() {
+            state_ids_by_signature
+                .entry(state.unfinished_item_signature)
+                .or_insert(Vec::new())
+                .push(i);
+        }
+
+        let mut deleted_states = HashSet::new();
+        loop {
+            let mut state_replacements = HashMap::new();
+            for (_, state_ids) in &state_ids_by_signature {
+                for i in state_ids {
+                    for j in state_ids {
+                        if j == i {
+                            break;
+                        }
+                        if deleted_states.contains(j) || deleted_states.contains(i) {
                             continue;
                         }
+                        if self.merge_parse_state(*j, *i) {
+                            deleted_states.insert(*i);
+                            state_replacements.insert(*i, *j);
+                        }
                     }
-                    _ => {}
                 }
-                only_unit_reductions = false;
+            }
+
+            if state_replacements.is_empty() {
                 break;
             }
-            if !only_unit_reductions {
-                break;
-            }
-        }
-
-        if let Some(symbol) = unit_reduction_symbol {
-            if only_unit_reductions {
-                unit_reduction_symbols_by_state.insert(i, *symbol);
+            for state in self.parse_table.states.iter_mut() {
+                state.update_referenced_states(|other_state_id, _| {
+                    *state_replacements
+                        .get(&other_state_id)
+                        .unwrap_or(&other_state_id)
+                });
+            }
         }
     }
-    for state in parse_table.states.iter_mut() {
-        let mut done = false;
-        while !done {
-            done = true;
-            state.update_referenced_states(|other_state_id, state| {
-                if let Some(symbol) = unit_reduction_symbols_by_state.get(&other_state_id) {
-                    done = false;
-                    state.nonterminal_entries[symbol]
-                } else {
-                    other_state_id
-                }
-            })
-        }
-    }
-}
+
+    fn merge_parse_state(&mut self, left: usize, right: usize) -> bool {
+        let left_state = &self.parse_table.states[left];
+        let right_state = &self.parse_table.states[right];
 
-fn merge_compatible_states(
-    parse_table: &mut ParseTable,
-    syntax_grammar: &SyntaxGrammar,
-    token_conflict_map: &TokenConflictMap,
-) {
-    let mut state_ids_by_signature = HashMap::new();
-    for (i, state) in parse_table.states.iter().enumerate() {
-        state_ids_by_signature
-            .entry(state.unfinished_item_signature)
-            .or_insert(Vec::new())
-            .push(i);
-    }
-
-    let mut deleted_states = HashSet::new();
-    loop {
-        let mut state_replacements = HashMap::new();
-        for (_, state_ids) in &state_ids_by_signature {
-            for i in state_ids {
-                for j in state_ids {
-                    if j == i {
-                        break;
-                    }
-                    if deleted_states.contains(j) || deleted_states.contains(i) {
-                        continue;
-                    }
-                    if merge_parse_state(syntax_grammar, token_conflict_map, parse_table, *j, *i) {
-                        deleted_states.insert(*i);
-                        state_replacements.insert(*i, *j);
-                    }
-                }
-            }
-        }
-
-        if state_replacements.is_empty() {
-            break;
-        }
-
-        for state in parse_table.states.iter_mut() {
-            state.update_referenced_states(|other_state_id, _| {
-                *state_replacements
-                    .get(&other_state_id)
-                    .unwrap_or(&other_state_id)
-            });
-        }
-    }
-}
-
-fn merge_parse_state(
-    syntax_grammar: &SyntaxGrammar,
-    token_conflict_map: &TokenConflictMap,
-    parse_table: &mut ParseTable,
-    left: usize,
-    right: usize,
-) -> bool {
-    let left_state = &parse_table.states[left];
-    let right_state = &parse_table.states[right];
-
-    if left_state.nonterminal_entries != right_state.nonterminal_entries {
-        return false;
-    }
-
-    for (symbol, left_entry) in &left_state.terminal_entries {
-        if let Some(right_entry) = right_state.terminal_entries.get(symbol) {
-            if right_entry.actions != left_entry.actions {
-                return false;
-            }
-        } else if !can_add_entry_to_state(
-            syntax_grammar,
-            token_conflict_map,
-            right_state,
-            *symbol,
-            left_entry,
-        ) {
+        if left_state.nonterminal_entries != right_state.nonterminal_entries {
             return false;
         }
-    }
-
-    let mut symbols_to_add = Vec::new();
-    for (symbol, right_entry) in &right_state.terminal_entries {
-        if !left_state.terminal_entries.contains_key(&symbol) {
-            if !can_add_entry_to_state(
-                syntax_grammar,
-                token_conflict_map,
-                left_state,
-                *symbol,
-                right_entry,
-            ) {
-                return false;
-            }
-            symbols_to_add.push(*symbol);
-        }
-    }
-
-    for symbol in symbols_to_add {
-        let entry = parse_table.states[right].terminal_entries[&symbol].clone();
-        parse_table.states[left]
-            .terminal_entries
-            .insert(symbol, entry);
-    }
-
-    true
-}
-
-fn can_add_entry_to_state(
-    syntax_grammar: &SyntaxGrammar,
-    token_conflict_map: &TokenConflictMap,
-    state: &ParseState,
-    token: Symbol,
-    entry: &ParseTableEntry,
-) -> bool {
-    // Do not add external tokens; they could conflict lexically with any of the state's
-    // existing lookahead tokens.
-    if token.is_external() {
-        return false;
-    }
-
-    // Only merge parse states by allowing existing reductions to happen
-    // with additional lookahead tokens. Do not alter parse states in ways
-    // that allow entirely new types of actions to happen.
-    if state.terminal_entries.iter().all(|(_, e)| e != entry) {
-        return false;
-    }
-    match entry.actions.last() {
-        Some(ParseAction::Reduce { .. }) => {}
-        _ => return false,
-    }
-
-    // Do not add tokens which are both internal and external. Their validity could
-    // influence the behavior of the external scanner.
-    if syntax_grammar
-        .external_tokens
-        .iter()
-        .any(|t| t.corresponding_internal_token == Some(token))
-    {
-        return false;
-    }
-
-    // Do not add a token if it conflicts with an existing token.
-    if token.is_terminal() {
-        for existing_token in state.terminal_entries.keys() {
-            if token_conflict_map.does_conflict(token.index, existing_token.index) {
+        for (symbol, left_entry) in &left_state.terminal_entries {
+            if let Some(right_entry) = right_state.terminal_entries.get(symbol) {
+                if right_entry.actions != left_entry.actions {
+                    return false;
+                }
+            } else if !self.can_add_entry_to_state(right_state, *symbol, left_entry) {
                 return false;
             }
         }
+
+        let mut symbols_to_add = Vec::new();
+        for (symbol, right_entry) in &right_state.terminal_entries {
+            if !left_state.terminal_entries.contains_key(&symbol) {
+                if !self.can_add_entry_to_state(left_state, *symbol, right_entry) {
+                    return false;
+                }
+                symbols_to_add.push(*symbol);
+            }
+        }
+
+        for symbol in symbols_to_add {
+            let entry = self.parse_table.states[right].terminal_entries[&symbol].clone();
+            self.parse_table.states[left]
+                .terminal_entries
+                .insert(symbol, entry);
+        }
+
+        true
     }
-    true
-}
-
-fn remove_unused_states(parse_table: &mut ParseTable) {
-    let mut state_usage_map = vec![false; parse_table.states.len()];
-
-    state_usage_map[0] = true;
-    state_usage_map[1] = true;
-
-    for state in &parse_table.states {
-        for referenced_state in state.referenced_states() {
-            state_usage_map[referenced_state] = true;
+
+    fn can_add_entry_to_state(
+        &self,
+        state: &ParseState,
+        token: Symbol,
+        entry: &ParseTableEntry,
+    ) -> bool {
+        // Do not add external tokens; they could conflict lexically with any of the state's
+        // existing lookahead tokens.
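+        // (External tokens are matched by a user-provided scanner, so the
+        // token-conflict analysis used below cannot reason about them.)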
+        if token.is_external() {
+            return false;
         }
+
+        // Only merge parse states by allowing existing reductions to happen
+        // with additional lookahead tokens. Do not alter parse states in ways
+        // that allow entirely new types of actions to happen.
+        if state.terminal_entries.iter().all(|(_, e)| e != entry) {
+            return false;
+        }
+        match entry.actions.last() {
+            Some(ParseAction::Reduce { .. }) => {}
+            _ => return false,
+        }
+
+        // Do not add tokens which are both internal and external. Their validity could
+        // influence the behavior of the external scanner.
+        if self
+            .syntax_grammar
+            .external_tokens
+            .iter()
+            .any(|t| t.corresponding_internal_token == Some(token))
+        {
+            return false;
+        }
+
+        let is_word_token = self.syntax_grammar.word_token == Some(token);
+        let is_keyword = self.keywords.contains(&token);
+
+        // Do not add a token if it conflicts with an existing token.
+        if token.is_terminal() {
+            for existing_token in state.terminal_entries.keys() {
+                if (is_word_token && self.keywords.contains(existing_token))
+                    || is_keyword
+                        && self.syntax_grammar.word_token.as_ref() == Some(existing_token)
+                {
+                    continue;
+                }
+                if self
+                    .token_conflict_map
+                    .does_conflict(token.index, existing_token.index)
+                    || self
+                        .token_conflict_map
+                        .does_match_same_string(token.index, existing_token.index)
+                {
+                    return false;
+                }
+            }
+        }
+
+        true
     }
-    let mut removed_predecessor_count = 0;
-    let mut state_replacement_map = vec![0; parse_table.states.len()];
-    for state_id in 0..parse_table.states.len() {
-        state_replacement_map[state_id] = state_id - removed_predecessor_count;
-        if !state_usage_map[state_id] {
-            removed_predecessor_count += 1;
+
+    fn remove_unused_states(&mut self) {
+        let mut state_usage_map = vec![false; self.parse_table.states.len()];
+
+        state_usage_map[0] = true;
+        state_usage_map[1] = true;
+
+        for state in &self.parse_table.states {
+            for referenced_state in state.referenced_states() {
+                state_usage_map[referenced_state] = true;
+            }
         }
-    }
-    let mut state_id = 0;
-    let mut original_state_id = 0;
-    while state_id < parse_table.states.len() {
-        if state_usage_map[original_state_id] {
-            parse_table.states[state_id].update_referenced_states(|other_state_id, _| {
-                state_replacement_map[other_state_id]
-            });
-            state_id += 1;
-        } else {
-            parse_table.states.remove(state_id);
+        let mut removed_predecessor_count = 0;
+        let mut state_replacement_map = vec![0; self.parse_table.states.len()];
+        for state_id in 0..self.parse_table.states.len() {
+            state_replacement_map[state_id] = state_id - removed_predecessor_count;
+            if !state_usage_map[state_id] {
+                removed_predecessor_count += 1;
+            }
+        }
+        let mut state_id = 0;
+        let mut original_state_id = 0;
+        while state_id < self.parse_table.states.len() {
+            if state_usage_map[original_state_id] {
+                self.parse_table.states[state_id].update_referenced_states(|other_state_id, _| {
+                    state_replacement_map[other_state_id]
+                });
+                state_id += 1;
+            } else {
+                self.parse_table.states.remove(state_id);
+            }
+            original_state_id += 1;
         }
-        original_state_id += 1;
     }
 }
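Note on the pair-indexing scheme in CoincidentTokenIndex::index: each unordered
token pair maps to a single slot of an n-by-n table by always using the smaller
index as the row, so index(a, b) == index(b, a) and each pair is stored once.
A minimal standalone sketch of the same idea follows; the type and function
names are illustrative, not part of the patch.

// Minimal sketch of the symmetric pair index used by CoincidentTokenIndex.
// All names are illustrative; only the indexing scheme mirrors the patch.
struct PairIndex {
    // For each unordered token pair, the parse states where both tokens occur.
    entries: Vec<Vec<usize>>,
    n: usize,
}

impl PairIndex {
    fn new(n: usize) -> Self {
        PairIndex {
            entries: vec![Vec::new(); n * n],
            n,
        }
    }

    // Map the unordered pair {a, b} to one slot by putting the smaller index
    // first, so index(a, b) == index(b, a).
    fn index(&self, a: usize, b: usize) -> usize {
        if a < b {
            a * self.n + b
        } else {
            b * self.n + a
        }
    }

    fn record(&mut self, a: usize, b: usize, state: usize) {
        let i = self.index(a, b);
        // States are visited in order, so checking the last entry suffices to
        // avoid duplicates, as in CoincidentTokenIndex::new.
        if self.entries[i].last() != Some(&state) {
            self.entries[i].push(state);
        }
    }
}

fn main() {
    let mut index = PairIndex::new(3);
    index.record(2, 0, 7);
    index.record(0, 2, 7); // same unordered pair and state: not duplicated
    assert_eq!(index.entries[index.index(0, 2)], vec![7]);
}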