From 605b50e58bf03661774ce7eb18f3b98dbd767ce3 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sat, 29 Dec 2018 13:57:34 -0800 Subject: [PATCH] Start work on shrinking parse table --- src/build_tables/build_parse_table.rs | 605 ++++++++++++++++++++++++ src/build_tables/mod.rs | 630 +------------------------ src/build_tables/shrink_parse_table.rs | 117 +++++ src/build_tables/token_conflict_map.rs | 77 +++ src/tables.rs | 56 ++- 5 files changed, 866 insertions(+), 619 deletions(-) create mode 100644 src/build_tables/build_parse_table.rs create mode 100644 src/build_tables/shrink_parse_table.rs create mode 100644 src/build_tables/token_conflict_map.rs diff --git a/src/build_tables/build_parse_table.rs b/src/build_tables/build_parse_table.rs new file mode 100644 index 00000000..5087c55c --- /dev/null +++ b/src/build_tables/build_parse_table.rs @@ -0,0 +1,605 @@ +use super::item::{LookaheadSet, ParseItem, ParseItemSet}; +use super::item_set_builder::ParseItemSetBuilder; +use crate::error::{Error, Result}; +use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar, VariableType}; +use crate::rules::{Alias, AliasMap, Associativity, Symbol, SymbolType}; +use crate::tables::{ + AliasSequenceId, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry, +}; +use core::ops::Range; +use std::collections::hash_map::Entry; +use std::collections::{HashMap, HashSet, VecDeque}; +use std::fmt::Write; + +#[derive(Clone)] +struct AuxiliarySymbolInfo { + auxiliary_symbol: Symbol, + parent_symbols: Vec, +} + +type SymbolSequence = Vec; +type AuxiliarySymbolSequence = Vec; + +struct ParseStateQueueEntry { + preceding_symbols: SymbolSequence, + preceding_auxiliary_symbols: AuxiliarySymbolSequence, + state_id: ParseStateId, +} + +struct ParseTableBuilder<'a> { + item_set_builder: ParseItemSetBuilder<'a>, + syntax_grammar: &'a SyntaxGrammar, + lexical_grammar: &'a LexicalGrammar, + inlines: &'a InlinedProductionMap, + state_ids_by_item_set: HashMap, ParseStateId>, 
+ item_sets_by_state_id: Vec>, + parse_state_queue: VecDeque, + parse_table: ParseTable, +} + +impl<'a> ParseTableBuilder<'a> { + fn build(mut self) -> Result { + // Ensure that the empty alias sequence has index 0. + self.parse_table.alias_sequences.push(Vec::new()); + + // Ensure that the error state has index 0. + let error_state_id = + self.add_parse_state(&Vec::new(), &Vec::new(), ParseItemSet::default()); + + self.add_parse_state( + &Vec::new(), + &Vec::new(), + ParseItemSet::with( + [(ParseItem::start(), LookaheadSet::with(&[Symbol::end()]))] + .iter() + .cloned(), + ), + ); + + self.process_part_state_queue()?; + self.populate_used_symbols(); + Ok(self.parse_table) + } + + fn add_parse_state( + &mut self, + preceding_symbols: &SymbolSequence, + preceding_auxiliary_symbols: &AuxiliarySymbolSequence, + item_set: ParseItemSet<'a>, + ) -> ParseStateId { + match self.state_ids_by_item_set.entry(item_set) { + Entry::Occupied(o) => *o.get(), + Entry::Vacant(v) => { + let state_id = self.parse_table.states.len(); + self.item_sets_by_state_id.push(v.key().clone()); + self.parse_table.states.push(ParseState { + lex_state_id: 0, + terminal_entries: HashMap::new(), + nonterminal_entries: HashMap::new(), + }); + self.parse_state_queue.push_back(ParseStateQueueEntry { + state_id, + preceding_symbols: preceding_symbols.clone(), + preceding_auxiliary_symbols: preceding_auxiliary_symbols.clone(), + }); + v.insert(state_id); + state_id + } + } + } + + fn process_part_state_queue(&mut self) -> Result<()> { + while let Some(entry) = self.parse_state_queue.pop_front() { + let debug = false; + + if debug { + println!( + "ITEM SET {}:\n{}", + entry.state_id, + self.item_sets_by_state_id[entry.state_id] + .display_with(&self.syntax_grammar, &self.lexical_grammar,) + ); + } + + let item_set = self.item_set_builder.transitive_closure( + &self.item_sets_by_state_id[entry.state_id], + self.syntax_grammar, + self.inlines, + ); + + if debug { + println!( + "TRANSITIVE CLOSURE:\n{}", + 
item_set.display_with(&self.syntax_grammar, &self.lexical_grammar) + ); + } + + self.add_actions( + entry.preceding_symbols, + entry.preceding_auxiliary_symbols, + item_set, + entry.state_id, + )?; + } + Ok(()) + } + + fn add_actions( + &mut self, + mut preceding_symbols: SymbolSequence, + mut preceding_auxiliary_symbols: Vec, + item_set: ParseItemSet<'a>, + state_id: ParseStateId, + ) -> Result<()> { + let mut terminal_successors = HashMap::new(); + let mut non_terminal_successors = HashMap::new(); + let mut lookaheads_with_conflicts = HashSet::new(); + + for (item, lookaheads) in &item_set.entries { + if let Some(next_symbol) = item.symbol() { + let successor = item.successor(); + if next_symbol.is_non_terminal() { + // Keep track of where auxiliary non-terminals (repeat symbols) are + // used within visible symbols. This information may be needed later + // for conflict resolution. + if self.syntax_grammar.variables[next_symbol.index].is_auxiliary() { + preceding_auxiliary_symbols + .push(self.get_auxiliary_node_info(&item_set, next_symbol)); + } + + non_terminal_successors + .entry(next_symbol) + .or_insert_with(|| ParseItemSet::default()) + .entries + .entry(successor) + .or_insert_with(|| LookaheadSet::new()) + .insert_all(lookaheads); + } else { + terminal_successors + .entry(next_symbol) + .or_insert_with(|| ParseItemSet::default()) + .entries + .entry(successor) + .or_insert_with(|| LookaheadSet::new()) + .insert_all(lookaheads); + } + } else { + let action = if item.is_augmented() { + ParseAction::Accept + } else { + ParseAction::Reduce { + symbol: Symbol::non_terminal(item.variable_index as usize), + child_count: item.step_index as usize, + precedence: item.precedence(), + associativity: item.associativity(), + dynamic_precedence: item.production.dynamic_precedence, + alias_sequence_id: self.get_alias_sequence_id(item), + } + }; + + for lookahead in lookaheads.iter() { + let entry = self.parse_table.states[state_id] + .terminal_entries + 
.entry(lookahead); + let entry = entry.or_insert_with(|| ParseTableEntry::new()); + if entry.actions.is_empty() { + entry.actions.push(action); + } else if action.precedence() > entry.actions[0].precedence() { + entry.actions.clear(); + entry.actions.push(action); + lookaheads_with_conflicts.remove(&lookahead); + } else if action.precedence() == entry.actions[0].precedence() { + entry.actions.push(action); + lookaheads_with_conflicts.insert(lookahead); + } + } + } + } + + for (symbol, next_item_set) in terminal_successors { + preceding_symbols.push(symbol); + let next_state_id = self.add_parse_state( + &preceding_symbols, + &preceding_auxiliary_symbols, + next_item_set, + ); + preceding_symbols.pop(); + + let entry = self.parse_table.states[state_id] + .terminal_entries + .entry(symbol); + if let Entry::Occupied(e) = &entry { + if !e.get().actions.is_empty() { + lookaheads_with_conflicts.insert(symbol); + } + } + + entry + .or_insert_with(|| ParseTableEntry::new()) + .actions + .push(ParseAction::Shift { + state: next_state_id, + is_repetition: false, + }); + } + + for (symbol, next_item_set) in non_terminal_successors { + preceding_symbols.push(symbol); + let next_state_id = self.add_parse_state( + &preceding_symbols, + &preceding_auxiliary_symbols, + next_item_set, + ); + preceding_symbols.pop(); + self.parse_table.states[state_id] + .nonterminal_entries + .insert(symbol, next_state_id); + } + + for symbol in lookaheads_with_conflicts { + self.handle_conflict( + &item_set, + state_id, + &preceding_symbols, + &preceding_auxiliary_symbols, + symbol, + )?; + } + + let state = &mut self.parse_table.states[state_id]; + for extra_token in &self.syntax_grammar.extra_tokens { + state + .terminal_entries + .entry(*extra_token) + .or_insert(ParseTableEntry { + reusable: true, + actions: vec![ParseAction::ShiftExtra], + }); + } + + Ok(()) + } + + fn handle_conflict( + &mut self, + item_set: &ParseItemSet, + state_id: ParseStateId, + preceding_symbols: &SymbolSequence, + 
preceding_auxiliary_symbols: &Vec, + conflicting_lookahead: Symbol, + ) -> Result<()> { + let entry = self.parse_table.states[state_id] + .terminal_entries + .get_mut(&conflicting_lookahead) + .unwrap(); + + // Determine which items in the set conflict with each other, and the + // precedences associated with SHIFT vs REDUCE actions. There won't + // be multiple REDUCE actions with different precedences; that is + // sorted out ahead of time in `add_actions`. But there can still be + // REDUCE-REDUCE conflicts where all actions have the *same* + // precedence, and there can still be SHIFT/REDUCE conflicts. + let reduce_precedence = entry.actions[0].precedence(); + let mut considered_associativity = false; + let mut shift_precedence: Option> = None; + let mut conflicting_items = HashSet::new(); + for (item, lookaheads) in &item_set.entries { + if let Some(step) = item.step() { + if item.step_index > 0 { + if self + .item_set_builder + .first_set(&step.symbol) + .contains(&conflicting_lookahead) + { + conflicting_items.insert(item); + let precedence = item.precedence(); + if let Some(range) = &mut shift_precedence { + if precedence < range.start { + range.start = precedence; + } else if precedence > range.end { + range.end = precedence; + } + } else { + shift_precedence = Some(precedence..precedence); + } + } + } + } else if lookaheads.contains(&conflicting_lookahead) { + conflicting_items.insert(item); + } + } + + if let ParseAction::Shift { is_repetition, .. } = entry.actions.last_mut().unwrap() { + let shift_precedence = shift_precedence.unwrap_or(0..0); + + // If all of the items in the conflict have the same parent symbol, + // and that parent symbols is auxiliary, then this is just the intentional + // ambiguity associated with a repeat rule. Resolve that class of ambiguity + // by leaving it in the parse table, but marking the SHIFT action with + // an `is_repetition` flag. 
+ let conflicting_variable_index = + conflicting_items.iter().next().unwrap().variable_index; + if self.syntax_grammar.variables[conflicting_variable_index as usize].is_auxiliary() { + if conflicting_items + .iter() + .all(|item| item.variable_index == conflicting_variable_index) + { + *is_repetition = true; + return Ok(()); + } + } + + // If the SHIFT action has higher precedence, remove all the REDUCE actions. + if shift_precedence.start > reduce_precedence + || (shift_precedence.start == reduce_precedence + && shift_precedence.end > reduce_precedence) + { + entry.actions.drain(0..entry.actions.len() - 1); + } + // If the REDUCE actions have higher precedence, remove the SHIFT action. + else if shift_precedence.end < reduce_precedence + || (shift_precedence.end == reduce_precedence + && shift_precedence.start < reduce_precedence) + { + entry.actions.pop(); + conflicting_items.retain(|item| item.is_done()); + } + // If the SHIFT and REDUCE actions have the same predence, consider + // the REDUCE actions' associativity. + else if shift_precedence == (reduce_precedence..reduce_precedence) { + considered_associativity = true; + let mut has_left = false; + let mut has_right = false; + let mut has_non = false; + for action in &entry.actions { + if let ParseAction::Reduce { associativity, .. } = action { + match associativity { + Some(Associativity::Left) => has_left = true, + Some(Associativity::Right) => has_right = true, + None => has_non = true, + } + } + } + + // If all reduce actions are left associative, remove the SHIFT action. + // If all reduce actions are right associative, remove the REDUCE actions. + match (has_left, has_non, has_right) { + (true, false, false) => { + entry.actions.pop(); + conflicting_items.retain(|item| item.is_done()); + } + (false, false, true) => { + entry.actions.drain(0..entry.actions.len() - 1); + } + _ => {} + } + } + } + + // If all of the actions but one have been eliminated, then there's no problem. 
+ let entry = self.parse_table.states[state_id] + .terminal_entries + .get_mut(&conflicting_lookahead) + .unwrap(); + if entry.actions.len() == 1 { + return Ok(()); + } + + // Determine the set of parent symbols involved in this conflict. + let mut actual_conflict = Vec::new(); + for item in &conflicting_items { + let symbol = Symbol::non_terminal(item.variable_index as usize); + if self.syntax_grammar.variables[symbol.index].is_auxiliary() { + actual_conflict.extend( + preceding_auxiliary_symbols + .iter() + .rev() + .find_map(|info| { + if info.auxiliary_symbol == symbol { + Some(&info.parent_symbols) + } else { + None + } + }) + .unwrap() + .iter(), + ); + } else { + actual_conflict.push(symbol); + } + } + actual_conflict.sort_unstable(); + actual_conflict.dedup(); + + // If this set of symbols has been whitelisted, then there's no error. + if self + .syntax_grammar + .expected_conflicts + .contains(&actual_conflict) + { + return Ok(()); + } + + let mut msg = "Unresolved conflict for symbol sequence:\n\n".to_string(); + for symbol in preceding_symbols { + write!(&mut msg, " {}", self.symbol_name(symbol)).unwrap(); + } + + write!( + &mut msg, + " • {} …\n\n", + self.symbol_name(&conflicting_lookahead) + ) + .unwrap(); + write!(&mut msg, "Possible interpretations:\n").unwrap(); + for (i, item) in conflicting_items.iter().enumerate() { + write!(&mut msg, "\n {}:", i).unwrap(); + + for preceding_symbol in preceding_symbols + .iter() + .take(preceding_symbols.len() - item.step_index as usize) + { + write!(&mut msg, " {}", self.symbol_name(preceding_symbol)).unwrap(); + } + + write!( + &mut msg, + " ({}", + &self.syntax_grammar.variables[item.variable_index as usize].name + ) + .unwrap(); + + for (j, step) in item.production.steps.iter().enumerate() { + if j as u32 == item.step_index { + write!(&mut msg, " •").unwrap(); + } + write!(&mut msg, " {}", self.symbol_name(&step.symbol)).unwrap(); + } + + write!(&mut msg, ")").unwrap(); + + if item.is_done() { + write!( + 
&mut msg, + " • {}", + self.symbol_name(&conflicting_lookahead) + ) + .unwrap(); + } + + let precedence = item.precedence(); + let associativity = item.associativity(); + if precedence != 0 || associativity.is_some() { + write!( + &mut msg, + "(precedence: {}, associativity: {:?})", + precedence, associativity + ) + .unwrap(); + } + } + + // TODO - generate suggested resolutions + + Err(Error::ConflictError(msg)) + } + + fn get_auxiliary_node_info( + &self, + item_set: &ParseItemSet, + symbol: Symbol, + ) -> AuxiliarySymbolInfo { + let parent_symbols = item_set + .entries + .keys() + .filter_map(|item| { + if item.symbol() == Some(symbol) { + None + } else { + None + } + }) + .collect(); + AuxiliarySymbolInfo { + auxiliary_symbol: symbol, + parent_symbols, + } + } + + fn populate_used_symbols(&mut self) { + let mut terminal_usages = vec![false; self.lexical_grammar.variables.len()]; + let mut non_terminal_usages = vec![false; self.syntax_grammar.variables.len()]; + let mut external_usages = vec![false; self.syntax_grammar.external_tokens.len()]; + for state in &self.parse_table.states { + for symbol in state.terminal_entries.keys() { + match symbol.kind { + SymbolType::Terminal => terminal_usages[symbol.index] = true, + SymbolType::External => external_usages[symbol.index] = true, + _ => {} + } + } + for symbol in state.nonterminal_entries.keys() { + non_terminal_usages[symbol.index] = true; + } + } + self.parse_table.symbols.push(Symbol::end()); + for (i, value) in terminal_usages.into_iter().enumerate() { + if value { + self.parse_table.symbols.push(Symbol::terminal(i)); + } + } + for (i, value) in non_terminal_usages.into_iter().enumerate() { + if value { + self.parse_table.symbols.push(Symbol::non_terminal(i)); + } + } + for (i, value) in external_usages.into_iter().enumerate() { + if value { + self.parse_table.symbols.push(Symbol::external(i)); + } + } + } + + fn get_alias_sequence_id(&mut self, item: &ParseItem) -> AliasSequenceId { + let mut alias_sequence: 
Vec> = item + .production + .steps + .iter() + .map(|s| s.alias.clone()) + .collect(); + while alias_sequence.last() == Some(&None) { + alias_sequence.pop(); + } + if let Some(index) = self + .parse_table + .alias_sequences + .iter() + .position(|seq| *seq == alias_sequence) + { + index + } else { + self.parse_table.alias_sequences.push(alias_sequence); + self.parse_table.alias_sequences.len() - 1 + } + } + + fn symbol_name(&self, symbol: &Symbol) -> String { + match symbol.kind { + SymbolType::End => "EOF".to_string(), + SymbolType::External => self.syntax_grammar.external_tokens[symbol.index] + .name + .clone(), + SymbolType::NonTerminal => self.syntax_grammar.variables[symbol.index].name.clone(), + SymbolType::Terminal => { + let variable = &self.lexical_grammar.variables[symbol.index]; + if variable.kind == VariableType::Named { + variable.name.clone() + } else { + format!("\"{}\"", &variable.name) + } + } + } + } +} + +pub(crate) fn build_parse_table( + syntax_grammar: &SyntaxGrammar, + lexical_grammar: &LexicalGrammar, + inlines: &InlinedProductionMap, +) -> Result { + ParseTableBuilder { + syntax_grammar, + lexical_grammar, + inlines, + item_set_builder: ParseItemSetBuilder::new(syntax_grammar, lexical_grammar, inlines), + state_ids_by_item_set: HashMap::new(), + item_sets_by_state_id: Vec::new(), + parse_state_queue: VecDeque::new(), + parse_table: ParseTable { + states: Vec::new(), + alias_sequences: Vec::new(), + symbols: Vec::new(), + }, + } + .build() +} diff --git a/src/build_tables/mod.rs b/src/build_tables/mod.rs index fc17ce7f..a5ac74fb 100644 --- a/src/build_tables/mod.rs +++ b/src/build_tables/mod.rs @@ -1,607 +1,17 @@ +use crate::error::Result; +use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar}; +use crate::rules::{AliasMap, Symbol}; +use crate::tables::{LexTable, ParseTable}; + +mod build_parse_table; mod item; mod item_set_builder; mod lex_table_builder; +mod shrink_parse_table; +mod token_conflict_map; -use 
self::item::{LookaheadSet, ParseItem, ParseItemSet}; -use self::item_set_builder::ParseItemSetBuilder; -use self::lex_table_builder::LexTableBuilder; -use crate::error::{Error, Result}; -use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar, VariableType}; -use crate::rules::Alias; -use crate::rules::{AliasMap, Associativity, Symbol, SymbolType}; -use crate::tables::{ - AliasSequenceId, LexTable, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry, -}; -use core::ops::Range; -use std::collections::hash_map::Entry; -use std::collections::{HashMap, HashSet, VecDeque}; -use std::fmt::Write; - -#[derive(Clone)] -struct AuxiliarySymbolInfo { - auxiliary_symbol: Symbol, - parent_symbols: Vec, -} - -type SymbolSequence = Vec; -type AuxiliarySymbolSequence = Vec; - -struct ParseStateQueueEntry { - preceding_symbols: SymbolSequence, - preceding_auxiliary_symbols: AuxiliarySymbolSequence, - state_id: ParseStateId, -} - -struct ParseTableBuilder<'a> { - item_set_builder: ParseItemSetBuilder<'a>, - syntax_grammar: &'a SyntaxGrammar, - lexical_grammar: &'a LexicalGrammar, - inlines: &'a InlinedProductionMap, - simple_aliases: &'a AliasMap, - state_ids_by_item_set: HashMap, ParseStateId>, - item_sets_by_state_id: Vec>, - parse_state_queue: VecDeque, - parse_table: ParseTable, -} - -impl<'a> ParseTableBuilder<'a> { - fn build(mut self) -> Result<(ParseTable, LexTable, LexTable, Option)> { - // Ensure that the empty alias sequence has index 0. - self.parse_table.alias_sequences.push(Vec::new()); - - // Ensure that the error state has index 0. 
- let error_state_id = - self.add_parse_state(&Vec::new(), &Vec::new(), ParseItemSet::default()); - - self.add_parse_state( - &Vec::new(), - &Vec::new(), - ParseItemSet::with( - [(ParseItem::start(), LookaheadSet::with(&[Symbol::end()]))] - .iter() - .cloned(), - ), - ); - - self.process_part_state_queue()?; - - let lex_table_builder = LexTableBuilder::new(self.syntax_grammar, self.lexical_grammar); - - self.populate_used_symbols(); - - let (main_lex_table, keyword_lex_table, keyword_capture_token) = lex_table_builder.build(); - Ok(( - self.parse_table, - main_lex_table, - keyword_lex_table, - keyword_capture_token, - )) - } - - fn add_parse_state( - &mut self, - preceding_symbols: &SymbolSequence, - preceding_auxiliary_symbols: &AuxiliarySymbolSequence, - item_set: ParseItemSet<'a>, - ) -> ParseStateId { - match self.state_ids_by_item_set.entry(item_set) { - Entry::Occupied(o) => { - // eprintln!("Item set already processed at state {}", *o.get()); - *o.get() - } - Entry::Vacant(v) => { - // eprintln!("Item set not yet processed"); - let state_id = self.parse_table.states.len(); - self.item_sets_by_state_id.push(v.key().clone()); - self.parse_table.states.push(ParseState { - lex_state_id: 0, - terminal_entries: HashMap::new(), - nonterminal_entries: HashMap::new(), - }); - self.parse_state_queue.push_back(ParseStateQueueEntry { - state_id, - preceding_symbols: preceding_symbols.clone(), - preceding_auxiliary_symbols: preceding_auxiliary_symbols.clone(), - }); - v.insert(state_id); - state_id - } - } - } - - fn process_part_state_queue(&mut self) -> Result<()> { - while let Some(entry) = self.parse_state_queue.pop_front() { - let debug = false; - - if debug { - println!( - "ITEM SET {}:\n{}", - entry.state_id, - self.item_sets_by_state_id[entry.state_id] - .display_with(&self.syntax_grammar, &self.lexical_grammar,) - ); - } - - let item_set = self.item_set_builder.transitive_closure( - &self.item_sets_by_state_id[entry.state_id], - self.syntax_grammar, - 
self.inlines, - ); - - if debug { - println!( - "TRANSITIVE CLOSURE:\n{}", - item_set.display_with(&self.syntax_grammar, &self.lexical_grammar) - ); - } - - self.add_actions( - entry.preceding_symbols, - entry.preceding_auxiliary_symbols, - item_set, - entry.state_id, - )?; - } - Ok(()) - } - - fn add_actions( - &mut self, - mut preceding_symbols: SymbolSequence, - mut preceding_auxiliary_symbols: Vec, - item_set: ParseItemSet<'a>, - state_id: ParseStateId, - ) -> Result<()> { - let mut terminal_successors = HashMap::new(); - let mut non_terminal_successors = HashMap::new(); - let mut lookaheads_with_conflicts = HashSet::new(); - - for (item, lookaheads) in &item_set.entries { - if let Some(next_symbol) = item.symbol() { - let successor = item.successor(); - if next_symbol.is_non_terminal() { - // Keep track of where auxiliary non-terminals (repeat symbols) are - // used within visible symbols. This information may be needed later - // for conflict resolution. - if self.syntax_grammar.variables[next_symbol.index].is_auxiliary() { - preceding_auxiliary_symbols - .push(self.get_auxiliary_node_info(&item_set, next_symbol)); - } - - non_terminal_successors - .entry(next_symbol) - .or_insert_with(|| ParseItemSet::default()) - .entries - .entry(successor) - .or_insert_with(|| LookaheadSet::new()) - .insert_all(lookaheads); - } else { - terminal_successors - .entry(next_symbol) - .or_insert_with(|| ParseItemSet::default()) - .entries - .entry(successor) - .or_insert_with(|| LookaheadSet::new()) - .insert_all(lookaheads); - } - } else { - let action = if item.is_augmented() { - ParseAction::Accept - } else { - ParseAction::Reduce { - symbol: Symbol::non_terminal(item.variable_index as usize), - child_count: item.step_index as usize, - precedence: item.precedence(), - associativity: item.associativity(), - dynamic_precedence: item.production.dynamic_precedence, - alias_sequence_id: self.get_alias_sequence_id(item), - } - }; - - for lookahead in lookaheads.iter() { - let 
entry = self.parse_table.states[state_id] - .terminal_entries - .entry(lookahead); - let entry = entry.or_insert_with(|| ParseTableEntry::new()); - if entry.actions.is_empty() { - entry.actions.push(action); - } else if action.precedence() > entry.actions[0].precedence() { - entry.actions.clear(); - entry.actions.push(action); - lookaheads_with_conflicts.remove(&lookahead); - } else if action.precedence() == entry.actions[0].precedence() { - entry.actions.push(action); - lookaheads_with_conflicts.insert(lookahead); - } - } - } - } - - for (symbol, next_item_set) in terminal_successors { - preceding_symbols.push(symbol); - let next_state_id = self.add_parse_state( - &preceding_symbols, - &preceding_auxiliary_symbols, - next_item_set, - ); - preceding_symbols.pop(); - - let entry = self.parse_table.states[state_id] - .terminal_entries - .entry(symbol); - if let Entry::Occupied(e) = &entry { - if !e.get().actions.is_empty() { - lookaheads_with_conflicts.insert(symbol); - } - } - - entry - .or_insert_with(|| ParseTableEntry::new()) - .actions - .push(ParseAction::Shift { - state: next_state_id, - is_repetition: false, - }); - } - - for (symbol, next_item_set) in non_terminal_successors { - preceding_symbols.push(symbol); - let next_state_id = self.add_parse_state( - &preceding_symbols, - &preceding_auxiliary_symbols, - next_item_set, - ); - preceding_symbols.pop(); - self.parse_table.states[state_id] - .nonterminal_entries - .insert(symbol, next_state_id); - } - - for symbol in lookaheads_with_conflicts { - self.handle_conflict( - &item_set, - state_id, - &preceding_symbols, - &preceding_auxiliary_symbols, - symbol, - )?; - } - - let state = &mut self.parse_table.states[state_id]; - for extra_token in &self.syntax_grammar.extra_tokens { - state - .terminal_entries - .entry(*extra_token) - .or_insert(ParseTableEntry { - reusable: true, - actions: vec![ParseAction::ShiftExtra], - }); - } - - Ok(()) - } - - fn handle_conflict( - &mut self, - item_set: &ParseItemSet, - 
state_id: ParseStateId, - preceding_symbols: &SymbolSequence, - preceding_auxiliary_symbols: &Vec, - conflicting_lookahead: Symbol, - ) -> Result<()> { - let entry = self.parse_table.states[state_id] - .terminal_entries - .get_mut(&conflicting_lookahead) - .unwrap(); - - // Determine which items in the set conflict with each other, and the - // precedences associated with SHIFT vs REDUCE actions. There won't - // be multiple REDUCE actions with different precedences; that is - // sorted out ahead of time in `add_actions`. But there can still be - // REDUCE-REDUCE conflicts where all actions have the *same* - // precedence, and there can still be SHIFT/REDUCE conflicts. - let reduce_precedence = entry.actions[0].precedence(); - let mut considered_associativity = false; - let mut shift_precedence: Option> = None; - let mut conflicting_items = HashSet::new(); - for (item, lookaheads) in &item_set.entries { - if let Some(step) = item.step() { - if item.step_index > 0 { - if self - .item_set_builder - .first_set(&step.symbol) - .contains(&conflicting_lookahead) - { - conflicting_items.insert(item); - let precedence = item.precedence(); - if let Some(range) = &mut shift_precedence { - if precedence < range.start { - range.start = precedence; - } else if precedence > range.end { - range.end = precedence; - } - } else { - shift_precedence = Some(precedence..precedence); - } - } - } - } else if lookaheads.contains(&conflicting_lookahead) { - conflicting_items.insert(item); - } - } - - if let ParseAction::Shift { is_repetition, .. } = entry.actions.last_mut().unwrap() { - let shift_precedence = shift_precedence.unwrap_or(0..0); - - // If all of the items in the conflict have the same parent symbol, - // and that parent symbols is auxiliary, then this is just the intentional - // ambiguity associated with a repeat rule. Resolve that class of ambiguity - // by leaving it in the parse table, but marking the SHIFT action with - // an `is_repetition` flag. 
- let conflicting_variable_index = - conflicting_items.iter().next().unwrap().variable_index; - if self.syntax_grammar.variables[conflicting_variable_index as usize].is_auxiliary() { - if conflicting_items - .iter() - .all(|item| item.variable_index == conflicting_variable_index) - { - *is_repetition = true; - return Ok(()); - } - } - - // If the SHIFT action has higher precedence, remove all the REDUCE actions. - if shift_precedence.start > reduce_precedence - || (shift_precedence.start == reduce_precedence - && shift_precedence.end > reduce_precedence) - { - entry.actions.drain(0..entry.actions.len() - 1); - } - // If the REDUCE actions have higher precedence, remove the SHIFT action. - else if shift_precedence.end < reduce_precedence - || (shift_precedence.end == reduce_precedence - && shift_precedence.start < reduce_precedence) - { - entry.actions.pop(); - conflicting_items.retain(|item| item.is_done()); - } - // If the SHIFT and REDUCE actions have the same predence, consider - // the REDUCE actions' associativity. - else if shift_precedence == (reduce_precedence..reduce_precedence) { - considered_associativity = true; - let mut has_left = false; - let mut has_right = false; - let mut has_non = false; - for action in &entry.actions { - if let ParseAction::Reduce { associativity, .. } = action { - match associativity { - Some(Associativity::Left) => has_left = true, - Some(Associativity::Right) => has_right = true, - None => has_non = true, - } - } - } - - // If all reduce actions are left associative, remove the SHIFT action. - // If all reduce actions are right associative, remove the REDUCE actions. - match (has_left, has_non, has_right) { - (true, false, false) => { - entry.actions.pop(); - conflicting_items.retain(|item| item.is_done()); - } - (false, false, true) => { - entry.actions.drain(0..entry.actions.len() - 1); - } - _ => {} - } - } - } - - // If all of the actions but one have been eliminated, then there's no problem. 
- let entry = self.parse_table.states[state_id] - .terminal_entries - .get_mut(&conflicting_lookahead) - .unwrap(); - if entry.actions.len() == 1 { - return Ok(()); - } - - // Determine the set of parent symbols involved in this conflict. - let mut actual_conflict = Vec::new(); - for item in &conflicting_items { - let symbol = Symbol::non_terminal(item.variable_index as usize); - if self.syntax_grammar.variables[symbol.index].is_auxiliary() { - actual_conflict.extend( - preceding_auxiliary_symbols - .iter() - .rev() - .find_map(|info| { - if info.auxiliary_symbol == symbol { - Some(&info.parent_symbols) - } else { - None - } - }) - .unwrap() - .iter(), - ); - } else { - actual_conflict.push(symbol); - } - } - actual_conflict.sort_unstable(); - actual_conflict.dedup(); - - // If this set of symbols has been whitelisted, then there's no error. - if self - .syntax_grammar - .expected_conflicts - .contains(&actual_conflict) - { - return Ok(()); - } - - let mut msg = "Unresolved conflict for symbol sequence:\n\n".to_string(); - for symbol in preceding_symbols { - write!(&mut msg, " {}", self.symbol_name(symbol)).unwrap(); - } - - write!( - &mut msg, - " • {} …\n\n", - self.symbol_name(&conflicting_lookahead) - ) - .unwrap(); - write!(&mut msg, "Possible interpretations:\n").unwrap(); - for (i, item) in conflicting_items.iter().enumerate() { - write!(&mut msg, "\n {}:", i).unwrap(); - - for preceding_symbol in preceding_symbols - .iter() - .take(preceding_symbols.len() - item.step_index as usize) - { - write!(&mut msg, " {}", self.symbol_name(preceding_symbol)).unwrap(); - } - - write!( - &mut msg, - " ({}", - &self.syntax_grammar.variables[item.variable_index as usize].name - ) - .unwrap(); - - for (j, step) in item.production.steps.iter().enumerate() { - if j as u32 == item.step_index { - write!(&mut msg, " •").unwrap(); - } - write!(&mut msg, " {}", self.symbol_name(&step.symbol)).unwrap(); - } - - write!(&mut msg, ")").unwrap(); - - if item.is_done() { - write!( - 
&mut msg, - " • {}", - self.symbol_name(&conflicting_lookahead) - ) - .unwrap(); - } - - let precedence = item.precedence(); - let associativity = item.associativity(); - if precedence != 0 || associativity.is_some() { - write!( - &mut msg, - "(precedence: {}, associativity: {:?})", - precedence, associativity - ) - .unwrap(); - } - } - - // TODO - generate suggested resolutions - - Err(Error::ConflictError(msg)) - } - - fn get_auxiliary_node_info( - &self, - item_set: &ParseItemSet, - symbol: Symbol, - ) -> AuxiliarySymbolInfo { - let parent_symbols = item_set - .entries - .keys() - .filter_map(|item| { - if item.symbol() == Some(symbol) { - None - } else { - None - } - }) - .collect(); - AuxiliarySymbolInfo { - auxiliary_symbol: symbol, - parent_symbols, - } - } - - fn populate_used_symbols(&mut self) { - let mut terminal_usages = vec![false; self.lexical_grammar.variables.len()]; - let mut non_terminal_usages = vec![false; self.syntax_grammar.variables.len()]; - let mut external_usages = vec![false; self.syntax_grammar.external_tokens.len()]; - for state in &self.parse_table.states { - for symbol in state.terminal_entries.keys() { - match symbol.kind { - SymbolType::Terminal => terminal_usages[symbol.index] = true, - SymbolType::External => external_usages[symbol.index] = true, - _ => {} - } - } - for symbol in state.nonterminal_entries.keys() { - non_terminal_usages[symbol.index] = true; - } - } - self.parse_table.symbols.push(Symbol::end()); - for (i, value) in terminal_usages.into_iter().enumerate() { - if value { - self.parse_table.symbols.push(Symbol::terminal(i)); - } - } - for (i, value) in non_terminal_usages.into_iter().enumerate() { - if value { - self.parse_table.symbols.push(Symbol::non_terminal(i)); - } - } - for (i, value) in external_usages.into_iter().enumerate() { - if value { - self.parse_table.symbols.push(Symbol::external(i)); - } - } - } - - fn get_alias_sequence_id(&mut self, item: &ParseItem) -> AliasSequenceId { - let mut alias_sequence: 
Vec> = item - .production - .steps - .iter() - .map(|s| s.alias.clone()) - .collect(); - while alias_sequence.last() == Some(&None) { - alias_sequence.pop(); - } - if let Some(index) = self - .parse_table - .alias_sequences - .iter() - .position(|seq| *seq == alias_sequence) - { - index - } else { - self.parse_table.alias_sequences.push(alias_sequence); - self.parse_table.alias_sequences.len() - 1 - } - } - - fn symbol_name(&self, symbol: &Symbol) -> String { - match symbol.kind { - SymbolType::End => "EOF".to_string(), - SymbolType::External => self.syntax_grammar.external_tokens[symbol.index] - .name - .clone(), - SymbolType::NonTerminal => self.syntax_grammar.variables[symbol.index].name.clone(), - SymbolType::Terminal => { - let variable = &self.lexical_grammar.variables[symbol.index]; - if variable.kind == VariableType::Named { - variable.name.clone() - } else { - format!("\"{}\"", &variable.name) - } - } - } - } -} +use self::build_parse_table::build_parse_table; +use self::shrink_parse_table::shrink_parse_table; pub(crate) fn build_tables( syntax_grammar: &SyntaxGrammar, @@ -609,20 +19,8 @@ pub(crate) fn build_tables( simple_aliases: &AliasMap, inlines: &InlinedProductionMap, ) -> Result<(ParseTable, LexTable, LexTable, Option)> { - ParseTableBuilder { - syntax_grammar, - lexical_grammar, - simple_aliases, - inlines, - item_set_builder: ParseItemSetBuilder::new(syntax_grammar, lexical_grammar, inlines), - state_ids_by_item_set: HashMap::new(), - item_sets_by_state_id: Vec::new(), - parse_state_queue: VecDeque::new(), - parse_table: ParseTable { - states: Vec::new(), - alias_sequences: Vec::new(), - symbols: Vec::new(), - }, - } - .build() + + let mut parse_table = build_parse_table(syntax_grammar, lexical_grammar, inlines)?; + shrink_parse_table(&mut parse_table, syntax_grammar, simple_aliases); + Ok((parse_table, LexTable::default(), LexTable::default(), None)) } diff --git a/src/build_tables/shrink_parse_table.rs b/src/build_tables/shrink_parse_table.rs 
new file mode 100644 index 00000000..8e826f5c --- /dev/null +++ b/src/build_tables/shrink_parse_table.rs @@ -0,0 +1,117 @@ +use crate::grammars::{SyntaxGrammar, VariableType}; +use crate::rules::AliasMap; +use crate::tables::{ParseAction, ParseTable}; +use std::collections::{HashMap, HashSet}; + +pub(crate) fn shrink_parse_table( + parse_table: &mut ParseTable, + syntax_grammar: &SyntaxGrammar, + simple_aliases: &AliasMap, +) { + remove_unit_reductions(parse_table, syntax_grammar, simple_aliases); + remove_unused_states(parse_table); +} + +fn remove_unit_reductions( + parse_table: &mut ParseTable, + syntax_grammar: &SyntaxGrammar, + simple_aliases: &AliasMap, +) { + let mut aliased_symbols = HashSet::new(); + for variable in &syntax_grammar.variables { + for production in &variable.productions { + for step in &production.steps { + if step.alias.is_some() { + aliased_symbols.insert(step.symbol); + } + } + } + } + + let mut unit_reduction_symbols_by_state = HashMap::new(); + for (i, state) in parse_table.states.iter().enumerate() { + let mut only_unit_reductions = true; + let mut unit_reduction_symbol = None; + for (_, entry) in &state.terminal_entries { + for action in &entry.actions { + match action { + ParseAction::ShiftExtra => continue, + ParseAction::Reduce { + child_count: 1, + alias_sequence_id: 0, + symbol, + .. 
+ } => { + if !simple_aliases.contains_key(&symbol) + && !aliased_symbols.contains(&symbol) + && syntax_grammar.variables[symbol.index].kind != VariableType::Named + && (unit_reduction_symbol.is_none() + || unit_reduction_symbol == Some(symbol)) + { + unit_reduction_symbol = Some(symbol); + continue; + } + } + _ => {} + } + only_unit_reductions = false; + break; + } + + if !only_unit_reductions { + break; + } + } + + if let Some(symbol) = unit_reduction_symbol { + if only_unit_reductions { + unit_reduction_symbols_by_state.insert(i, *symbol); + } + } + } + + for state in parse_table.states.iter_mut() { + let mut done = false; + while !done { + done = true; + state.update_referenced_states(|other_state_id, state| { + if let Some(symbol) = unit_reduction_symbols_by_state.get(&other_state_id) { + done = false; + state.nonterminal_entries[symbol] + } else { + other_state_id + } + }) + } + } +} + +fn remove_unused_states(parse_table: &mut ParseTable) { + let mut state_usage_map = vec![false; parse_table.states.len()]; state_usage_map[0] = true; // NOTE(review): the start state is entered externally and is never the target of a shift or nonterminal entry; without this it is treated as unused and removed, corrupting the table + for state in &parse_table.states { + for referenced_state in state.referenced_states() { + state_usage_map[referenced_state] = true; + } + } + let mut removed_predecessor_count = 0; + let mut state_replacement_map = vec![0; parse_table.states.len()]; + for state_id in 0..parse_table.states.len() { + state_replacement_map[state_id] = state_id - removed_predecessor_count; + if !state_usage_map[state_id] { + removed_predecessor_count += 1; + } + } + let mut state_id = 0; + let mut original_state_id = 0; + while state_id < parse_table.states.len() { + if state_usage_map[original_state_id] { + parse_table.states[state_id].update_referenced_states(|other_state_id, _| { + state_replacement_map[other_state_id] + }); + state_id += 1; + } else { + parse_table.states.remove(state_id); + } + original_state_id += 1; + } +} diff --git a/src/build_tables/token_conflict_map.rs b/src/build_tables/token_conflict_map.rs new file mode 100644 index 00000000..46a00986 ---
/dev/null +++ b/src/build_tables/token_conflict_map.rs @@ -0,0 +1,77 @@ +use crate::grammars::{LexicalGrammar, LexicalVariable}; +use crate::nfa::{CharacterSet, NfaCursor}; +use std::collections::HashSet; + +#[derive(Default)] +struct TokenConflictStatus { + matches_same_string: bool, + matches_longer_string_with_valid_next_char: bool, +} + +pub(crate) struct TokenConflictMap { + starting_chars_by_index: Vec<CharacterSet>, + status_matrix: Vec<TokenConflictStatus>, +} + +impl TokenConflictMap { + pub fn new(grammar: &LexicalGrammar) -> Self { + let mut cursor = NfaCursor::new(&grammar.nfa, Vec::new()); + + let mut starting_chars_by_index = Vec::with_capacity(grammar.variables.len()); + for variable in &grammar.variables { + cursor.reset(vec![variable.start_state]); + let mut all_chars = CharacterSet::empty(); + for (chars, _, _) in cursor.successors() { + all_chars = all_chars.add(chars); + } + starting_chars_by_index.push(all_chars); + } + + let status_matrix = + Vec::with_capacity(grammar.variables.len() * grammar.variables.len()); // NOTE(review): capacity reserved but the matrix is never populated yet (WIP commit) + + TokenConflictMap { + starting_chars_by_index, + status_matrix, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::grammars::{Variable, VariableType}; + use crate::prepare_grammar::{expand_tokens, ExtractedLexicalGrammar}; + use crate::rules::Rule; + + #[test] + fn test_starting_characters() { + let grammar = expand_tokens(ExtractedLexicalGrammar { + separators: Vec::new(), + variables: vec![ + Variable { + name: "token_0".to_string(), + kind: VariableType::Named, + rule: Rule::pattern("[a-f]1|0x\\d"), + }, + Variable { + name: "token_1".to_string(), + kind: VariableType::Named, + rule: Rule::pattern("d*ef"), + }, + ], + }) + .unwrap(); + + let token_map = TokenConflictMap::new(&grammar); + + assert_eq!( + token_map.starting_chars_by_index[0], + CharacterSet::empty().add_range('a', 'f').add_char('0') + ); + assert_eq!( + token_map.starting_chars_by_index[1], + CharacterSet::empty().add_range('d', 'e') + ); + } +} diff --git a/src/tables.rs
b/src/tables.rs index 01cecb49..0815aac8 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -1,7 +1,7 @@ +use crate::nfa::CharacterSet; +use crate::rules::{Alias, Associativity, Symbol}; use std::collections::HashMap; use std::ops::Range; -use crate::rules::{Associativity, Symbol, Alias}; -use crate::nfa::CharacterSet; pub(crate) type AliasSequenceId = usize; pub(crate) type ParseStateId = usize; @@ -23,7 +23,7 @@ pub(crate) enum ParseAction { dynamic_precedence: i32, associativity: Option<Associativity>, alias_sequence_id: AliasSequenceId, - } + }, } #[derive(Clone, Debug, PartialEq, Eq)] @@ -86,6 +86,56 @@ impl Default for LexTable { } } +impl ParseState { + pub fn referenced_states<'a>(&'a self) -> impl Iterator<Item = ParseStateId> + 'a { + self.terminal_entries + .iter() + .flat_map(|(_, entry)| { + entry.actions.iter().filter_map(|action| match action { + ParseAction::Shift { state, .. } => Some(*state), + _ => None, + }) + }) + .chain(self.nonterminal_entries.iter().map(|(_, state)| *state)) + } + + pub fn update_referenced_states<F>(&mut self, mut f: F) + where + F: FnMut(usize, &ParseState) -> usize, + { + let mut updates = Vec::new(); + for (symbol, entry) in &self.terminal_entries { + for (i, action) in entry.actions.iter().enumerate() { + if let ParseAction::Shift { state, .. } = action { + let result = f(*state, self); + if result != *state { + updates.push((*symbol, i, result)); + } + } + } + } + for (symbol, other_state) in &self.nonterminal_entries { + let result = f(*other_state, self); + if result != *other_state { + updates.push((*symbol, 0, result)); + } + } + for (symbol, action_index, new_state) in updates { + if symbol.is_non_terminal() { + self.nonterminal_entries.insert(symbol, new_state); + } else { + let entry = self.terminal_entries.get_mut(&symbol).unwrap(); + if let ParseAction::Shift { is_repetition, ..
} = entry.actions[action_index] { + entry.actions[action_index] = ParseAction::Shift { + state: new_state, + is_repetition, + }; + } + } + } + } +} + impl ParseAction { pub fn precedence(&self) -> i32 { if let ParseAction::Reduce { precedence, .. } = self {