Start work on shrinking parse table

This commit is contained in:
Max Brunsfeld 2018-12-29 13:57:34 -08:00
parent 479400e5d3
commit 605b50e58b
5 changed files with 866 additions and 619 deletions

View file

@ -0,0 +1,605 @@
use super::item::{LookaheadSet, ParseItem, ParseItemSet};
use super::item_set_builder::ParseItemSetBuilder;
use crate::error::{Error, Result};
use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar, VariableType};
use crate::rules::{Alias, AliasMap, Associativity, Symbol, SymbolType};
use crate::tables::{
AliasSequenceId, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry,
};
use core::ops::Range;
use std::collections::hash_map::Entry;
use std::collections::{HashMap, HashSet, VecDeque};
use std::fmt::Write;
/// Records one use of an auxiliary non-terminal (a repeat symbol) that was
/// encountered while building a parse state.
#[derive(Clone)]
struct AuxiliarySymbolInfo {
/// The auxiliary non-terminal that was encountered.
auxiliary_symbol: Symbol,
/// Parent symbols reported for it by `get_auxiliary_node_info`. They are
/// substituted for the auxiliary symbol when `handle_conflict` determines
/// which symbols are involved in a conflict.
parent_symbols: Vec<Symbol>,
}
/// A sequence of grammar symbols; used for the symbols recorded as preceding a
/// parse state.
type SymbolSequence = Vec<Symbol>;
/// A sequence of auxiliary-symbol records carried along with a parse state.
type AuxiliarySymbolSequence = Vec<AuxiliarySymbolInfo>;
/// A parse state that has been allocated by `add_parse_state` but whose
/// actions have not been computed yet.
struct ParseStateQueueEntry {
/// Copy of the symbol sequence passed to `add_parse_state` when the state
/// was first created.
preceding_symbols: SymbolSequence,
/// Copy of the auxiliary-symbol records passed at the same time.
preceding_auxiliary_symbols: AuxiliarySymbolSequence,
/// Index of the state in the parse table's `states` vector.
state_id: ParseStateId,
}
/// Working state used while constructing a `ParseTable` from a grammar.
struct ParseTableBuilder<'a> {
/// Computes transitive closures of item sets and first sets of symbols.
item_set_builder: ParseItemSetBuilder<'a>,
/// Grammar whose variables, extra tokens, external tokens and expected
/// conflicts are consulted during construction.
syntax_grammar: &'a SyntaxGrammar,
/// Grammar whose variables describe the terminal symbols.
lexical_grammar: &'a LexicalGrammar,
/// Inlined productions, forwarded to the transitive-closure computation.
inlines: &'a InlinedProductionMap,
/// Maps each item set seen so far to the id of its parse state, so that an
/// item set is only ever given one state.
state_ids_by_item_set: HashMap<ParseItemSet<'a>, ParseStateId>,
/// The item set of every state, indexed by state id.
item_sets_by_state_id: Vec<ParseItemSet<'a>>,
/// States that still need their actions computed, in creation order.
parse_state_queue: VecDeque<ParseStateQueueEntry>,
/// The table being built.
parse_table: ParseTable,
}
impl<'a> ParseTableBuilder<'a> {
/// Builds the parse table, consuming the builder.
///
/// The empty alias sequence and the error state are registered first so that
/// each of them receives index 0. The start state is queued next, and every
/// queued state is then processed until the queue is empty.
///
/// # Errors
///
/// Propagates any error produced while processing the queue, such as an
/// unresolved conflict reported by `handle_conflict`.
fn build(mut self) -> Result<ParseTable> {
    // Ensure that the empty alias sequence has index 0.
    self.parse_table.alias_sequences.push(Vec::new());

    // Ensure that the error state has index 0. Its id is not needed here, so
    // the return value is intentionally discarded.
    self.add_parse_state(&Vec::new(), &Vec::new(), ParseItemSet::default());

    // Queue the start state: the start item with end-of-input as lookahead.
    self.add_parse_state(
        &Vec::new(),
        &Vec::new(),
        ParseItemSet::with(
            [(ParseItem::start(), LookaheadSet::with(&[Symbol::end()]))]
                .iter()
                .cloned(),
        ),
    );

    self.process_part_state_queue()?;
    self.populate_used_symbols();
    Ok(self.parse_table)
}
fn add_parse_state(
&mut self,
preceding_symbols: &SymbolSequence,
preceding_auxiliary_symbols: &AuxiliarySymbolSequence,
item_set: ParseItemSet<'a>,
) -> ParseStateId {
match self.state_ids_by_item_set.entry(item_set) {
Entry::Occupied(o) => *o.get(),
Entry::Vacant(v) => {
let state_id = self.parse_table.states.len();
self.item_sets_by_state_id.push(v.key().clone());
self.parse_table.states.push(ParseState {
lex_state_id: 0,
terminal_entries: HashMap::new(),
nonterminal_entries: HashMap::new(),
});
self.parse_state_queue.push_back(ParseStateQueueEntry {
state_id,
preceding_symbols: preceding_symbols.clone(),
preceding_auxiliary_symbols: preceding_auxiliary_symbols.clone(),
});
v.insert(state_id);
state_id
}
}
}
fn process_part_state_queue(&mut self) -> Result<()> {
while let Some(entry) = self.parse_state_queue.pop_front() {
let debug = false;
if debug {
println!(
"ITEM SET {}:\n{}",
entry.state_id,
self.item_sets_by_state_id[entry.state_id]
.display_with(&self.syntax_grammar, &self.lexical_grammar,)
);
}
let item_set = self.item_set_builder.transitive_closure(
&self.item_sets_by_state_id[entry.state_id],
self.syntax_grammar,
self.inlines,
);
if debug {
println!(
"TRANSITIVE CLOSURE:\n{}",
item_set.display_with(&self.syntax_grammar, &self.lexical_grammar)
);
}
self.add_actions(
entry.preceding_symbols,
entry.preceding_auxiliary_symbols,
item_set,
entry.state_id,
)?;
}
Ok(())
}
/// Fills in the parse-table entries of state `state_id` from its closed item set.
///
/// Unfinished items contribute successor item sets, grouped by the symbol
/// they expect next; finished items contribute ACCEPT or REDUCE actions for
/// each of their lookaheads. Successor sets then become SHIFT actions (for
/// terminals) or non-terminal entries, lookaheads that ended up with competing
/// actions are passed to `handle_conflict`, and finally every extra token
/// that has no entry yet receives a `ShiftExtra` action.
///
/// Returns the first error reported by `handle_conflict`, if any.
fn add_actions(
    &mut self,
    mut preceding_symbols: SymbolSequence,
    mut preceding_auxiliary_symbols: Vec<AuxiliarySymbolInfo>,
    item_set: ParseItemSet<'a>,
    state_id: ParseStateId,
) -> Result<()> {
    let mut terminal_successors = HashMap::new();
    let mut non_terminal_successors = HashMap::new();
    let mut lookaheads_with_conflicts = HashSet::new();

    for (item, lookaheads) in &item_set.entries {
        let next_symbol = match item.symbol() {
            Some(next_symbol) => next_symbol,
            None => {
                // The item is finished: record ACCEPT or REDUCE for each lookahead.
                let action = if item.is_augmented() {
                    ParseAction::Accept
                } else {
                    ParseAction::Reduce {
                        symbol: Symbol::non_terminal(item.variable_index as usize),
                        child_count: item.step_index as usize,
                        precedence: item.precedence(),
                        associativity: item.associativity(),
                        dynamic_precedence: item.production.dynamic_precedence,
                        alias_sequence_id: self.get_alias_sequence_id(item),
                    }
                };

                for lookahead in lookaheads.iter() {
                    let entry = self.parse_table.states[state_id]
                        .terminal_entries
                        .entry(lookahead)
                        .or_insert_with(|| ParseTableEntry::new());
                    let existing_precedence =
                        entry.actions.first().map(|first| first.precedence());
                    match existing_precedence {
                        None => entry.actions.push(action),
                        // A strictly higher precedence replaces what was there.
                        Some(existing) if action.precedence() > existing => {
                            entry.actions.clear();
                            entry.actions.push(action);
                            lookaheads_with_conflicts.remove(&lookahead);
                        }
                        // Equal precedence is a conflict to be examined later.
                        Some(existing) if action.precedence() == existing => {
                            entry.actions.push(action);
                            lookaheads_with_conflicts.insert(lookahead);
                        }
                        // A lower precedence action is dropped.
                        Some(_) => {}
                    }
                }
                continue;
            }
        };

        let successor = item.successor();
        let successors = if next_symbol.is_non_terminal() {
            // Keep track of where auxiliary non-terminals (repeat symbols) are
            // used within visible symbols. This information may be needed later
            // for conflict resolution.
            if self.syntax_grammar.variables[next_symbol.index].is_auxiliary() {
                preceding_auxiliary_symbols
                    .push(self.get_auxiliary_node_info(&item_set, next_symbol));
            }
            &mut non_terminal_successors
        } else {
            &mut terminal_successors
        };
        successors
            .entry(next_symbol)
            .or_insert_with(|| ParseItemSet::default())
            .entries
            .entry(successor)
            .or_insert_with(|| LookaheadSet::new())
            .insert_all(lookaheads);
    }

    for (symbol, next_item_set) in terminal_successors {
        preceding_symbols.push(symbol);
        let next_state_id = self.add_parse_state(
            &preceding_symbols,
            &preceding_auxiliary_symbols,
            next_item_set,
        );
        preceding_symbols.pop();

        let slot = self.parse_table.states[state_id]
            .terminal_entries
            .entry(symbol);
        // A SHIFT on a lookahead that already has actions is a conflict.
        let already_has_actions = match &slot {
            Entry::Occupied(occupied) => !occupied.get().actions.is_empty(),
            Entry::Vacant(_) => false,
        };
        if already_has_actions {
            lookaheads_with_conflicts.insert(symbol);
        }
        slot.or_insert_with(|| ParseTableEntry::new())
            .actions
            .push(ParseAction::Shift {
                state: next_state_id,
                is_repetition: false,
            });
    }

    for (symbol, next_item_set) in non_terminal_successors {
        preceding_symbols.push(symbol);
        let next_state_id = self.add_parse_state(
            &preceding_symbols,
            &preceding_auxiliary_symbols,
            next_item_set,
        );
        preceding_symbols.pop();
        self.parse_table.states[state_id]
            .nonterminal_entries
            .insert(symbol, next_state_id);
    }

    for symbol in lookaheads_with_conflicts {
        self.handle_conflict(
            &item_set,
            state_id,
            &preceding_symbols,
            &preceding_auxiliary_symbols,
            symbol,
        )?;
    }

    // Extra tokens never displace an entry that is already present.
    let state = &mut self.parse_table.states[state_id];
    for &extra_token in &self.syntax_grammar.extra_tokens {
        state
            .terminal_entries
            .entry(extra_token)
            .or_insert(ParseTableEntry {
                actions: vec![ParseAction::ShiftExtra],
                reusable: true,
            });
    }

    Ok(())
}
/// Attempts to resolve the conflict on `conflicting_lookahead` in state `state_id`.
///
/// Resolution is tried in this order:
///
/// 1. If the last action is a SHIFT and every conflicting item belongs to the
///    same auxiliary variable, the conflict is kept in the table and the SHIFT
///    is flagged with `is_repetition`.
/// 2. A SHIFT/REDUCE conflict is decided by precedence, and by the
///    associativity of the REDUCE actions when the precedences tie.
/// 3. Whatever remains is accepted if the set of parent symbols involved is
///    listed in the grammar's `expected_conflicts`.
///
/// # Errors
///
/// Returns `Error::ConflictError` with a description of the symbol sequence
/// and the possible interpretations when none of the above applies.
///
/// # Panics
///
/// Panics if the state has no entry for `conflicting_lookahead`, or if an
/// auxiliary symbol involved in the conflict has no record in
/// `preceding_auxiliary_symbols`.
fn handle_conflict(
    &mut self,
    item_set: &ParseItemSet,
    state_id: ParseStateId,
    preceding_symbols: &SymbolSequence,
    preceding_auxiliary_symbols: &Vec<AuxiliarySymbolInfo>,
    conflicting_lookahead: Symbol,
) -> Result<()> {
    let entry = self.parse_table.states[state_id]
        .terminal_entries
        .get_mut(&conflicting_lookahead)
        .unwrap();

    // Determine which items in the set conflict with each other, and the
    // precedences associated with SHIFT vs REDUCE actions. There won't
    // be multiple REDUCE actions with different precedences; that is
    // sorted out ahead of time in `add_actions`. But there can still be
    // REDUCE-REDUCE conflicts where all actions have the *same*
    // precedence, and there can still be SHIFT/REDUCE conflicts.
    let reduce_precedence = entry.actions[0].precedence();
    let mut shift_precedence: Option<Range<i32>> = None;
    let mut conflicting_items = HashSet::new();
    for (item, lookaheads) in &item_set.entries {
        if let Some(step) = item.step() {
            // An unfinished item takes part only if it has already consumed
            // at least one symbol and the lookahead can begin its next symbol.
            if item.step_index > 0
                && self
                    .item_set_builder
                    .first_set(&step.symbol)
                    .contains(&conflicting_lookahead)
            {
                conflicting_items.insert(item);

                // Track the lowest and highest precedence among these items.
                let precedence = item.precedence();
                if let Some(range) = &mut shift_precedence {
                    if precedence < range.start {
                        range.start = precedence;
                    } else if precedence > range.end {
                        range.end = precedence;
                    }
                } else {
                    shift_precedence = Some(precedence..precedence);
                }
            }
        } else if lookaheads.contains(&conflicting_lookahead) {
            conflicting_items.insert(item);
        }
    }

    if let ParseAction::Shift { is_repetition, .. } = entry.actions.last_mut().unwrap() {
        let shift_precedence = shift_precedence.unwrap_or(0..0);

        // If all of the items in the conflict have the same parent symbol,
        // and that parent symbol is auxiliary, then this is just the intentional
        // ambiguity associated with a repeat rule. Resolve that class of ambiguity
        // by leaving it in the parse table, but marking the SHIFT action with
        // an `is_repetition` flag.
        let conflicting_variable_index =
            conflicting_items.iter().next().unwrap().variable_index;
        if self.syntax_grammar.variables[conflicting_variable_index as usize].is_auxiliary() {
            if conflicting_items
                .iter()
                .all(|item| item.variable_index == conflicting_variable_index)
            {
                *is_repetition = true;
                return Ok(());
            }
        }

        // If the SHIFT action has higher precedence, remove all the REDUCE actions.
        if shift_precedence.start > reduce_precedence
            || (shift_precedence.start == reduce_precedence
                && shift_precedence.end > reduce_precedence)
        {
            entry.actions.drain(0..entry.actions.len() - 1);
        }
        // If the REDUCE actions have higher precedence, remove the SHIFT action.
        else if shift_precedence.end < reduce_precedence
            || (shift_precedence.end == reduce_precedence
                && shift_precedence.start < reduce_precedence)
        {
            entry.actions.pop();
            conflicting_items.retain(|item| item.is_done());
        }
        // If the SHIFT and REDUCE actions have the same precedence, consider
        // the REDUCE actions' associativity.
        else if shift_precedence == (reduce_precedence..reduce_precedence) {
            let mut has_left = false;
            let mut has_right = false;
            let mut has_non = false;
            for action in &entry.actions {
                if let ParseAction::Reduce { associativity, .. } = action {
                    match associativity {
                        Some(Associativity::Left) => has_left = true,
                        Some(Associativity::Right) => has_right = true,
                        None => has_non = true,
                    }
                }
            }

            // If all reduce actions are left associative, remove the SHIFT action.
            // If all reduce actions are right associative, remove the REDUCE actions.
            match (has_left, has_non, has_right) {
                (true, false, false) => {
                    entry.actions.pop();
                    conflicting_items.retain(|item| item.is_done());
                }
                (false, false, true) => {
                    entry.actions.drain(0..entry.actions.len() - 1);
                }
                _ => {}
            }
        }
    }

    // If all of the actions but one have been eliminated, then there's no problem.
    let entry = self.parse_table.states[state_id]
        .terminal_entries
        .get_mut(&conflicting_lookahead)
        .unwrap();
    if entry.actions.len() == 1 {
        return Ok(());
    }

    // Determine the set of parent symbols involved in this conflict. An
    // auxiliary symbol is replaced by the parent symbols of its most recent
    // record in `preceding_auxiliary_symbols`.
    let mut actual_conflict = Vec::new();
    for item in &conflicting_items {
        let symbol = Symbol::non_terminal(item.variable_index as usize);
        if self.syntax_grammar.variables[symbol.index].is_auxiliary() {
            actual_conflict.extend(
                preceding_auxiliary_symbols
                    .iter()
                    .rev()
                    .find_map(|info| {
                        if info.auxiliary_symbol == symbol {
                            Some(&info.parent_symbols)
                        } else {
                            None
                        }
                    })
                    .unwrap()
                    .iter(),
            );
        } else {
            actual_conflict.push(symbol);
        }
    }
    actual_conflict.sort_unstable();
    actual_conflict.dedup();

    // If this set of symbols has been whitelisted, then there's no error.
    if self
        .syntax_grammar
        .expected_conflicts
        .contains(&actual_conflict)
    {
        return Ok(());
    }

    let mut msg = "Unresolved conflict for symbol sequence:\n\n".to_string();
    for symbol in preceding_symbols {
        write!(&mut msg, " {}", self.symbol_name(symbol)).unwrap();
    }
    write!(
        &mut msg,
        " • {} …\n\n",
        self.symbol_name(&conflicting_lookahead)
    )
    .unwrap();

    write!(&mut msg, "Possible interpretations:\n").unwrap();
    for (i, item) in conflicting_items.iter().enumerate() {
        write!(&mut msg, "\n {}:", i).unwrap();

        // Symbols that come before the start of this item's production.
        // `saturating_sub` keeps a short sequence from underflowing.
        let outer_symbol_count = preceding_symbols
            .len()
            .saturating_sub(item.step_index as usize);
        for preceding_symbol in preceding_symbols.iter().take(outer_symbol_count) {
            write!(&mut msg, " {}", self.symbol_name(preceding_symbol)).unwrap();
        }

        write!(
            &mut msg,
            " ({}",
            &self.syntax_grammar.variables[item.variable_index as usize].name
        )
        .unwrap();
        for (j, step) in item.production.steps.iter().enumerate() {
            // Mark the item's current position inside its production.
            if j as u32 == item.step_index {
                write!(&mut msg, " •").unwrap();
            }
            write!(&mut msg, " {}", self.symbol_name(&step.symbol)).unwrap();
        }
        write!(&mut msg, ")").unwrap();

        // A finished item's position is after the production, at the lookahead.
        if item.is_done() {
            write!(
                &mut msg,
                " • {}",
                self.symbol_name(&conflicting_lookahead)
            )
            .unwrap();
        }

        let precedence = item.precedence();
        let associativity = item.associativity();
        if precedence != 0 || associativity.is_some() {
            write!(
                &mut msg,
                " (precedence: {}, associativity: {:?})",
                precedence, associativity
            )
            .unwrap();
        }
    }

    // TODO - generate suggested resolutions

    Err(Error::ConflictError(msg))
}
fn get_auxiliary_node_info(
&self,
item_set: &ParseItemSet,
symbol: Symbol,
) -> AuxiliarySymbolInfo {
let parent_symbols = item_set
.entries
.keys()
.filter_map(|item| {
if item.symbol() == Some(symbol) {
None
} else {
None
}
})
.collect();
AuxiliarySymbolInfo {
auxiliary_symbol: symbol,
parent_symbols,
}
}
/// Fills `parse_table.symbols` with every symbol that the table refers to.
///
/// The end symbol is always listed first. It is followed by the terminals,
/// non-terminals and external tokens that appear as keys in any state, each
/// group in ascending index order.
fn populate_used_symbols(&mut self) {
    let mut terminal_seen = vec![false; self.lexical_grammar.variables.len()];
    let mut non_terminal_seen = vec![false; self.syntax_grammar.variables.len()];
    let mut external_seen = vec![false; self.syntax_grammar.external_tokens.len()];

    for state in &self.parse_table.states {
        for symbol in state.terminal_entries.keys() {
            let seen = match symbol.kind {
                SymbolType::Terminal => &mut terminal_seen,
                SymbolType::External => &mut external_seen,
                _ => continue,
            };
            seen[symbol.index] = true;
        }
        for symbol in state.nonterminal_entries.keys() {
            non_terminal_seen[symbol.index] = true;
        }
    }

    let symbols = &mut self.parse_table.symbols;
    symbols.push(Symbol::end());
    symbols.extend(
        terminal_seen
            .into_iter()
            .enumerate()
            .filter_map(|(i, used)| if used { Some(Symbol::terminal(i)) } else { None }),
    );
    symbols.extend(
        non_terminal_seen
            .into_iter()
            .enumerate()
            .filter_map(|(i, used)| if used { Some(Symbol::non_terminal(i)) } else { None }),
    );
    symbols.extend(
        external_seen
            .into_iter()
            .enumerate()
            .filter_map(|(i, used)| if used { Some(Symbol::external(i)) } else { None }),
    );
}
/// Returns the id of the alias sequence of `item`'s production.
///
/// The sequence holds one optional alias per production step, with trailing
/// `None` values removed. An identical sequence that is already stored in the
/// parse table is reused; otherwise the sequence is appended and its new index
/// is returned.
fn get_alias_sequence_id(&mut self, item: &ParseItem) -> AliasSequenceId {
    let mut aliases: Vec<Option<Alias>> = Vec::new();
    for step in item.production.steps.iter() {
        aliases.push(step.alias.clone());
    }
    // Trailing steps without an alias carry no information.
    while let Some(None) = aliases.last() {
        aliases.pop();
    }

    let known_sequences = &mut self.parse_table.alias_sequences;
    match known_sequences.iter().position(|seq| *seq == aliases) {
        Some(existing_id) => existing_id,
        None => {
            let new_id = known_sequences.len();
            known_sequences.push(aliases);
            new_id
        }
    }
}
/// Returns the display name of `symbol` for use in messages.
///
/// The end symbol is shown as `EOF`. External tokens and non-terminals use
/// the name stored in the syntax grammar. Terminals use the name stored in
/// the lexical grammar, wrapped in double quotes unless the variable is of
/// kind `Named`.
fn symbol_name(&self, symbol: &Symbol) -> String {
    let index = symbol.index;
    match symbol.kind {
        SymbolType::NonTerminal => self.syntax_grammar.variables[index].name.clone(),
        SymbolType::External => self.syntax_grammar.external_tokens[index].name.clone(),
        SymbolType::End => "EOF".to_string(),
        SymbolType::Terminal => {
            let token = &self.lexical_grammar.variables[index];
            if token.kind != VariableType::Named {
                return format!("\"{}\"", &token.name);
            }
            token.name.clone()
        }
    }
}
}
/// Builds the parse table for the given grammars.
///
/// Sets up a `ParseTableBuilder` with empty working state and an empty table,
/// then runs it. Any error raised during construction is returned unchanged.
pub(crate) fn build_parse_table(
    syntax_grammar: &SyntaxGrammar,
    lexical_grammar: &LexicalGrammar,
    inlines: &InlinedProductionMap,
) -> Result<ParseTable> {
    let item_set_builder = ParseItemSetBuilder::new(syntax_grammar, lexical_grammar, inlines);
    let parse_table = ParseTable {
        states: Vec::new(),
        alias_sequences: Vec::new(),
        symbols: Vec::new(),
    };
    let builder = ParseTableBuilder {
        item_set_builder,
        syntax_grammar,
        lexical_grammar,
        inlines,
        state_ids_by_item_set: HashMap::new(),
        item_sets_by_state_id: Vec::new(),
        parse_state_queue: VecDeque::new(),
        parse_table,
    };
    builder.build()
}

View file

@ -1,607 +1,17 @@
use crate::error::Result;
use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar};
use crate::rules::{AliasMap, Symbol};
use crate::tables::{LexTable, ParseTable};
mod build_parse_table;
mod item;
mod item_set_builder;
mod lex_table_builder;
mod shrink_parse_table;
mod token_conflict_map;
use self::item::{LookaheadSet, ParseItem, ParseItemSet};
use self::item_set_builder::ParseItemSetBuilder;
use self::lex_table_builder::LexTableBuilder;
use crate::error::{Error, Result};
use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar, VariableType};
use crate::rules::Alias;
use crate::rules::{AliasMap, Associativity, Symbol, SymbolType};
use crate::tables::{
AliasSequenceId, LexTable, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry,
};
use core::ops::Range;
use std::collections::hash_map::Entry;
use std::collections::{HashMap, HashSet, VecDeque};
use std::fmt::Write;
/// Records one use of an auxiliary non-terminal (a repeat symbol) that was
/// encountered while building a parse state.
#[derive(Clone)]
struct AuxiliarySymbolInfo {
/// The auxiliary non-terminal that was encountered.
auxiliary_symbol: Symbol,
/// Parent symbols reported for it by `get_auxiliary_node_info`. They are
/// substituted for the auxiliary symbol when `handle_conflict` determines
/// which symbols are involved in a conflict.
parent_symbols: Vec<Symbol>,
}
/// A sequence of grammar symbols; used for the symbols recorded as preceding a
/// parse state.
type SymbolSequence = Vec<Symbol>;
/// A sequence of auxiliary-symbol records carried along with a parse state.
type AuxiliarySymbolSequence = Vec<AuxiliarySymbolInfo>;
/// A parse state that has been allocated by `add_parse_state` but whose
/// actions have not been computed yet.
struct ParseStateQueueEntry {
/// Copy of the symbol sequence passed to `add_parse_state` when the state
/// was first created.
preceding_symbols: SymbolSequence,
/// Copy of the auxiliary-symbol records passed at the same time.
preceding_auxiliary_symbols: AuxiliarySymbolSequence,
/// Index of the state in the parse table's `states` vector.
state_id: ParseStateId,
}
/// Working state used while constructing a `ParseTable` from a grammar.
struct ParseTableBuilder<'a> {
/// Computes transitive closures of item sets and first sets of symbols.
item_set_builder: ParseItemSetBuilder<'a>,
/// Grammar whose variables, extra tokens, external tokens and expected
/// conflicts are consulted during construction.
syntax_grammar: &'a SyntaxGrammar,
/// Grammar whose variables describe the terminal symbols.
lexical_grammar: &'a LexicalGrammar,
/// Inlined productions, forwarded to the transitive-closure computation.
inlines: &'a InlinedProductionMap,
/// Alias map supplied by the caller of `build_tables`.
simple_aliases: &'a AliasMap,
/// Maps each item set seen so far to the id of its parse state, so that an
/// item set is only ever given one state.
state_ids_by_item_set: HashMap<ParseItemSet<'a>, ParseStateId>,
/// The item set of every state, indexed by state id.
item_sets_by_state_id: Vec<ParseItemSet<'a>>,
/// States that still need their actions computed, in creation order.
parse_state_queue: VecDeque<ParseStateQueueEntry>,
/// The table being built.
parse_table: ParseTable,
}
impl<'a> ParseTableBuilder<'a> {
/// Builds the parse table and the lex tables, consuming the builder.
///
/// The empty alias sequence and the error state are registered first so that
/// each of them receives index 0. The start state is queued next, and every
/// queued state is then processed until the queue is empty. The lex tables
/// and the optional keyword capture token come from a `LexTableBuilder`.
///
/// # Errors
///
/// Propagates any error produced while processing the queue, such as an
/// unresolved conflict reported by `handle_conflict`.
fn build(mut self) -> Result<(ParseTable, LexTable, LexTable, Option<Symbol>)> {
    // Ensure that the empty alias sequence has index 0.
    self.parse_table.alias_sequences.push(Vec::new());

    // Ensure that the error state has index 0. Its id is not needed here, so
    // the return value is intentionally discarded.
    self.add_parse_state(&Vec::new(), &Vec::new(), ParseItemSet::default());

    // Queue the start state: the start item with end-of-input as lookahead.
    self.add_parse_state(
        &Vec::new(),
        &Vec::new(),
        ParseItemSet::with(
            [(ParseItem::start(), LookaheadSet::with(&[Symbol::end()]))]
                .iter()
                .cloned(),
        ),
    );

    self.process_part_state_queue()?;

    let lex_table_builder = LexTableBuilder::new(self.syntax_grammar, self.lexical_grammar);
    self.populate_used_symbols();
    let (main_lex_table, keyword_lex_table, keyword_capture_token) = lex_table_builder.build();
    Ok((
        self.parse_table,
        main_lex_table,
        keyword_lex_table,
        keyword_capture_token,
    ))
}
fn add_parse_state(
&mut self,
preceding_symbols: &SymbolSequence,
preceding_auxiliary_symbols: &AuxiliarySymbolSequence,
item_set: ParseItemSet<'a>,
) -> ParseStateId {
match self.state_ids_by_item_set.entry(item_set) {
Entry::Occupied(o) => {
// eprintln!("Item set already processed at state {}", *o.get());
*o.get()
}
Entry::Vacant(v) => {
// eprintln!("Item set not yet processed");
let state_id = self.parse_table.states.len();
self.item_sets_by_state_id.push(v.key().clone());
self.parse_table.states.push(ParseState {
lex_state_id: 0,
terminal_entries: HashMap::new(),
nonterminal_entries: HashMap::new(),
});
self.parse_state_queue.push_back(ParseStateQueueEntry {
state_id,
preceding_symbols: preceding_symbols.clone(),
preceding_auxiliary_symbols: preceding_auxiliary_symbols.clone(),
});
v.insert(state_id);
state_id
}
}
}
fn process_part_state_queue(&mut self) -> Result<()> {
while let Some(entry) = self.parse_state_queue.pop_front() {
let debug = false;
if debug {
println!(
"ITEM SET {}:\n{}",
entry.state_id,
self.item_sets_by_state_id[entry.state_id]
.display_with(&self.syntax_grammar, &self.lexical_grammar,)
);
}
let item_set = self.item_set_builder.transitive_closure(
&self.item_sets_by_state_id[entry.state_id],
self.syntax_grammar,
self.inlines,
);
if debug {
println!(
"TRANSITIVE CLOSURE:\n{}",
item_set.display_with(&self.syntax_grammar, &self.lexical_grammar)
);
}
self.add_actions(
entry.preceding_symbols,
entry.preceding_auxiliary_symbols,
item_set,
entry.state_id,
)?;
}
Ok(())
}
/// Fills in the parse-table entries of state `state_id` from its closed item set.
///
/// Unfinished items contribute successor item sets, grouped by the symbol
/// they expect next; finished items contribute ACCEPT or REDUCE actions for
/// each of their lookaheads. Successor sets then become SHIFT actions (for
/// terminals) or non-terminal entries, lookaheads that ended up with competing
/// actions are passed to `handle_conflict`, and finally every extra token
/// that has no entry yet receives a `ShiftExtra` action.
///
/// Returns the first error reported by `handle_conflict`, if any.
fn add_actions(
    &mut self,
    mut preceding_symbols: SymbolSequence,
    mut preceding_auxiliary_symbols: Vec<AuxiliarySymbolInfo>,
    item_set: ParseItemSet<'a>,
    state_id: ParseStateId,
) -> Result<()> {
    let mut terminal_successors = HashMap::new();
    let mut non_terminal_successors = HashMap::new();
    let mut lookaheads_with_conflicts = HashSet::new();

    for (item, lookaheads) in &item_set.entries {
        let next_symbol = match item.symbol() {
            Some(next_symbol) => next_symbol,
            None => {
                // The item is finished: record ACCEPT or REDUCE for each lookahead.
                let action = if item.is_augmented() {
                    ParseAction::Accept
                } else {
                    ParseAction::Reduce {
                        symbol: Symbol::non_terminal(item.variable_index as usize),
                        child_count: item.step_index as usize,
                        precedence: item.precedence(),
                        associativity: item.associativity(),
                        dynamic_precedence: item.production.dynamic_precedence,
                        alias_sequence_id: self.get_alias_sequence_id(item),
                    }
                };

                for lookahead in lookaheads.iter() {
                    let entry = self.parse_table.states[state_id]
                        .terminal_entries
                        .entry(lookahead)
                        .or_insert_with(|| ParseTableEntry::new());
                    let existing_precedence =
                        entry.actions.first().map(|first| first.precedence());
                    match existing_precedence {
                        None => entry.actions.push(action),
                        // A strictly higher precedence replaces what was there.
                        Some(existing) if action.precedence() > existing => {
                            entry.actions.clear();
                            entry.actions.push(action);
                            lookaheads_with_conflicts.remove(&lookahead);
                        }
                        // Equal precedence is a conflict to be examined later.
                        Some(existing) if action.precedence() == existing => {
                            entry.actions.push(action);
                            lookaheads_with_conflicts.insert(lookahead);
                        }
                        // A lower precedence action is dropped.
                        Some(_) => {}
                    }
                }
                continue;
            }
        };

        let successor = item.successor();
        let successors = if next_symbol.is_non_terminal() {
            // Keep track of where auxiliary non-terminals (repeat symbols) are
            // used within visible symbols. This information may be needed later
            // for conflict resolution.
            if self.syntax_grammar.variables[next_symbol.index].is_auxiliary() {
                preceding_auxiliary_symbols
                    .push(self.get_auxiliary_node_info(&item_set, next_symbol));
            }
            &mut non_terminal_successors
        } else {
            &mut terminal_successors
        };
        successors
            .entry(next_symbol)
            .or_insert_with(|| ParseItemSet::default())
            .entries
            .entry(successor)
            .or_insert_with(|| LookaheadSet::new())
            .insert_all(lookaheads);
    }

    for (symbol, next_item_set) in terminal_successors {
        preceding_symbols.push(symbol);
        let next_state_id = self.add_parse_state(
            &preceding_symbols,
            &preceding_auxiliary_symbols,
            next_item_set,
        );
        preceding_symbols.pop();

        let slot = self.parse_table.states[state_id]
            .terminal_entries
            .entry(symbol);
        // A SHIFT on a lookahead that already has actions is a conflict.
        let already_has_actions = match &slot {
            Entry::Occupied(occupied) => !occupied.get().actions.is_empty(),
            Entry::Vacant(_) => false,
        };
        if already_has_actions {
            lookaheads_with_conflicts.insert(symbol);
        }
        slot.or_insert_with(|| ParseTableEntry::new())
            .actions
            .push(ParseAction::Shift {
                state: next_state_id,
                is_repetition: false,
            });
    }

    for (symbol, next_item_set) in non_terminal_successors {
        preceding_symbols.push(symbol);
        let next_state_id = self.add_parse_state(
            &preceding_symbols,
            &preceding_auxiliary_symbols,
            next_item_set,
        );
        preceding_symbols.pop();
        self.parse_table.states[state_id]
            .nonterminal_entries
            .insert(symbol, next_state_id);
    }

    for symbol in lookaheads_with_conflicts {
        self.handle_conflict(
            &item_set,
            state_id,
            &preceding_symbols,
            &preceding_auxiliary_symbols,
            symbol,
        )?;
    }

    // Extra tokens never displace an entry that is already present.
    let state = &mut self.parse_table.states[state_id];
    for &extra_token in &self.syntax_grammar.extra_tokens {
        state
            .terminal_entries
            .entry(extra_token)
            .or_insert(ParseTableEntry {
                actions: vec![ParseAction::ShiftExtra],
                reusable: true,
            });
    }

    Ok(())
}
/// Attempts to resolve the conflict on `conflicting_lookahead` in state `state_id`.
///
/// Resolution is tried in this order:
///
/// 1. If the last action is a SHIFT and every conflicting item belongs to the
///    same auxiliary variable, the conflict is kept in the table and the SHIFT
///    is flagged with `is_repetition`.
/// 2. A SHIFT/REDUCE conflict is decided by precedence, and by the
///    associativity of the REDUCE actions when the precedences tie.
/// 3. Whatever remains is accepted if the set of parent symbols involved is
///    listed in the grammar's `expected_conflicts`.
///
/// # Errors
///
/// Returns `Error::ConflictError` with a description of the symbol sequence
/// and the possible interpretations when none of the above applies.
///
/// # Panics
///
/// Panics if the state has no entry for `conflicting_lookahead`, or if an
/// auxiliary symbol involved in the conflict has no record in
/// `preceding_auxiliary_symbols`.
fn handle_conflict(
    &mut self,
    item_set: &ParseItemSet,
    state_id: ParseStateId,
    preceding_symbols: &SymbolSequence,
    preceding_auxiliary_symbols: &Vec<AuxiliarySymbolInfo>,
    conflicting_lookahead: Symbol,
) -> Result<()> {
    let entry = self.parse_table.states[state_id]
        .terminal_entries
        .get_mut(&conflicting_lookahead)
        .unwrap();

    // Determine which items in the set conflict with each other, and the
    // precedences associated with SHIFT vs REDUCE actions. There won't
    // be multiple REDUCE actions with different precedences; that is
    // sorted out ahead of time in `add_actions`. But there can still be
    // REDUCE-REDUCE conflicts where all actions have the *same*
    // precedence, and there can still be SHIFT/REDUCE conflicts.
    let reduce_precedence = entry.actions[0].precedence();
    let mut shift_precedence: Option<Range<i32>> = None;
    let mut conflicting_items = HashSet::new();
    for (item, lookaheads) in &item_set.entries {
        if let Some(step) = item.step() {
            // An unfinished item takes part only if it has already consumed
            // at least one symbol and the lookahead can begin its next symbol.
            if item.step_index > 0
                && self
                    .item_set_builder
                    .first_set(&step.symbol)
                    .contains(&conflicting_lookahead)
            {
                conflicting_items.insert(item);

                // Track the lowest and highest precedence among these items.
                let precedence = item.precedence();
                if let Some(range) = &mut shift_precedence {
                    if precedence < range.start {
                        range.start = precedence;
                    } else if precedence > range.end {
                        range.end = precedence;
                    }
                } else {
                    shift_precedence = Some(precedence..precedence);
                }
            }
        } else if lookaheads.contains(&conflicting_lookahead) {
            conflicting_items.insert(item);
        }
    }

    if let ParseAction::Shift { is_repetition, .. } = entry.actions.last_mut().unwrap() {
        let shift_precedence = shift_precedence.unwrap_or(0..0);

        // If all of the items in the conflict have the same parent symbol,
        // and that parent symbol is auxiliary, then this is just the intentional
        // ambiguity associated with a repeat rule. Resolve that class of ambiguity
        // by leaving it in the parse table, but marking the SHIFT action with
        // an `is_repetition` flag.
        let conflicting_variable_index =
            conflicting_items.iter().next().unwrap().variable_index;
        if self.syntax_grammar.variables[conflicting_variable_index as usize].is_auxiliary() {
            if conflicting_items
                .iter()
                .all(|item| item.variable_index == conflicting_variable_index)
            {
                *is_repetition = true;
                return Ok(());
            }
        }

        // If the SHIFT action has higher precedence, remove all the REDUCE actions.
        if shift_precedence.start > reduce_precedence
            || (shift_precedence.start == reduce_precedence
                && shift_precedence.end > reduce_precedence)
        {
            entry.actions.drain(0..entry.actions.len() - 1);
        }
        // If the REDUCE actions have higher precedence, remove the SHIFT action.
        else if shift_precedence.end < reduce_precedence
            || (shift_precedence.end == reduce_precedence
                && shift_precedence.start < reduce_precedence)
        {
            entry.actions.pop();
            conflicting_items.retain(|item| item.is_done());
        }
        // If the SHIFT and REDUCE actions have the same precedence, consider
        // the REDUCE actions' associativity.
        else if shift_precedence == (reduce_precedence..reduce_precedence) {
            let mut has_left = false;
            let mut has_right = false;
            let mut has_non = false;
            for action in &entry.actions {
                if let ParseAction::Reduce { associativity, .. } = action {
                    match associativity {
                        Some(Associativity::Left) => has_left = true,
                        Some(Associativity::Right) => has_right = true,
                        None => has_non = true,
                    }
                }
            }

            // If all reduce actions are left associative, remove the SHIFT action.
            // If all reduce actions are right associative, remove the REDUCE actions.
            match (has_left, has_non, has_right) {
                (true, false, false) => {
                    entry.actions.pop();
                    conflicting_items.retain(|item| item.is_done());
                }
                (false, false, true) => {
                    entry.actions.drain(0..entry.actions.len() - 1);
                }
                _ => {}
            }
        }
    }

    // If all of the actions but one have been eliminated, then there's no problem.
    let entry = self.parse_table.states[state_id]
        .terminal_entries
        .get_mut(&conflicting_lookahead)
        .unwrap();
    if entry.actions.len() == 1 {
        return Ok(());
    }

    // Determine the set of parent symbols involved in this conflict. An
    // auxiliary symbol is replaced by the parent symbols of its most recent
    // record in `preceding_auxiliary_symbols`.
    let mut actual_conflict = Vec::new();
    for item in &conflicting_items {
        let symbol = Symbol::non_terminal(item.variable_index as usize);
        if self.syntax_grammar.variables[symbol.index].is_auxiliary() {
            actual_conflict.extend(
                preceding_auxiliary_symbols
                    .iter()
                    .rev()
                    .find_map(|info| {
                        if info.auxiliary_symbol == symbol {
                            Some(&info.parent_symbols)
                        } else {
                            None
                        }
                    })
                    .unwrap()
                    .iter(),
            );
        } else {
            actual_conflict.push(symbol);
        }
    }
    actual_conflict.sort_unstable();
    actual_conflict.dedup();

    // If this set of symbols has been whitelisted, then there's no error.
    if self
        .syntax_grammar
        .expected_conflicts
        .contains(&actual_conflict)
    {
        return Ok(());
    }

    let mut msg = "Unresolved conflict for symbol sequence:\n\n".to_string();
    for symbol in preceding_symbols {
        write!(&mut msg, " {}", self.symbol_name(symbol)).unwrap();
    }
    write!(
        &mut msg,
        " • {} …\n\n",
        self.symbol_name(&conflicting_lookahead)
    )
    .unwrap();

    write!(&mut msg, "Possible interpretations:\n").unwrap();
    for (i, item) in conflicting_items.iter().enumerate() {
        write!(&mut msg, "\n {}:", i).unwrap();

        // Symbols that come before the start of this item's production.
        // `saturating_sub` keeps a short sequence from underflowing.
        let outer_symbol_count = preceding_symbols
            .len()
            .saturating_sub(item.step_index as usize);
        for preceding_symbol in preceding_symbols.iter().take(outer_symbol_count) {
            write!(&mut msg, " {}", self.symbol_name(preceding_symbol)).unwrap();
        }

        write!(
            &mut msg,
            " ({}",
            &self.syntax_grammar.variables[item.variable_index as usize].name
        )
        .unwrap();
        for (j, step) in item.production.steps.iter().enumerate() {
            // Mark the item's current position inside its production.
            if j as u32 == item.step_index {
                write!(&mut msg, " •").unwrap();
            }
            write!(&mut msg, " {}", self.symbol_name(&step.symbol)).unwrap();
        }
        write!(&mut msg, ")").unwrap();

        // A finished item's position is after the production, at the lookahead.
        if item.is_done() {
            write!(
                &mut msg,
                " • {}",
                self.symbol_name(&conflicting_lookahead)
            )
            .unwrap();
        }

        let precedence = item.precedence();
        let associativity = item.associativity();
        if precedence != 0 || associativity.is_some() {
            write!(
                &mut msg,
                " (precedence: {}, associativity: {:?})",
                precedence, associativity
            )
            .unwrap();
        }
    }

    // TODO - generate suggested resolutions

    Err(Error::ConflictError(msg))
}
fn get_auxiliary_node_info(
&self,
item_set: &ParseItemSet,
symbol: Symbol,
) -> AuxiliarySymbolInfo {
let parent_symbols = item_set
.entries
.keys()
.filter_map(|item| {
if item.symbol() == Some(symbol) {
None
} else {
None
}
})
.collect();
AuxiliarySymbolInfo {
auxiliary_symbol: symbol,
parent_symbols,
}
}
/// Fills `parse_table.symbols` with every symbol that the table refers to.
///
/// The end symbol is always listed first. It is followed by the terminals,
/// non-terminals and external tokens that appear as keys in any state, each
/// group in ascending index order.
fn populate_used_symbols(&mut self) {
    let mut terminal_seen = vec![false; self.lexical_grammar.variables.len()];
    let mut non_terminal_seen = vec![false; self.syntax_grammar.variables.len()];
    let mut external_seen = vec![false; self.syntax_grammar.external_tokens.len()];

    for state in &self.parse_table.states {
        for symbol in state.terminal_entries.keys() {
            let seen = match symbol.kind {
                SymbolType::Terminal => &mut terminal_seen,
                SymbolType::External => &mut external_seen,
                _ => continue,
            };
            seen[symbol.index] = true;
        }
        for symbol in state.nonterminal_entries.keys() {
            non_terminal_seen[symbol.index] = true;
        }
    }

    let symbols = &mut self.parse_table.symbols;
    symbols.push(Symbol::end());
    symbols.extend(
        terminal_seen
            .into_iter()
            .enumerate()
            .filter_map(|(i, used)| if used { Some(Symbol::terminal(i)) } else { None }),
    );
    symbols.extend(
        non_terminal_seen
            .into_iter()
            .enumerate()
            .filter_map(|(i, used)| if used { Some(Symbol::non_terminal(i)) } else { None }),
    );
    symbols.extend(
        external_seen
            .into_iter()
            .enumerate()
            .filter_map(|(i, used)| if used { Some(Symbol::external(i)) } else { None }),
    );
}
/// Returns the id of the alias sequence of `item`'s production.
///
/// The sequence holds one optional alias per production step, with trailing
/// `None` values removed. An identical sequence that is already stored in the
/// parse table is reused; otherwise the sequence is appended and its new index
/// is returned.
fn get_alias_sequence_id(&mut self, item: &ParseItem) -> AliasSequenceId {
    let mut aliases: Vec<Option<Alias>> = Vec::new();
    for step in item.production.steps.iter() {
        aliases.push(step.alias.clone());
    }
    // Trailing steps without an alias carry no information.
    while let Some(None) = aliases.last() {
        aliases.pop();
    }

    let known_sequences = &mut self.parse_table.alias_sequences;
    match known_sequences.iter().position(|seq| *seq == aliases) {
        Some(existing_id) => existing_id,
        None => {
            let new_id = known_sequences.len();
            known_sequences.push(aliases);
            new_id
        }
    }
}
/// Returns the display name of `symbol` for use in messages.
///
/// The end symbol is shown as `EOF`. External tokens and non-terminals use
/// the name stored in the syntax grammar. Terminals use the name stored in
/// the lexical grammar, wrapped in double quotes unless the variable is of
/// kind `Named`.
fn symbol_name(&self, symbol: &Symbol) -> String {
    let index = symbol.index;
    match symbol.kind {
        SymbolType::NonTerminal => self.syntax_grammar.variables[index].name.clone(),
        SymbolType::External => self.syntax_grammar.external_tokens[index].name.clone(),
        SymbolType::End => "EOF".to_string(),
        SymbolType::Terminal => {
            let token = &self.lexical_grammar.variables[index];
            if token.kind != VariableType::Named {
                return format!("\"{}\"", &token.name);
            }
            token.name.clone()
        }
    }
}
}
use self::build_parse_table::build_parse_table;
use self::shrink_parse_table::shrink_parse_table;
/// Builds the parse table with `build_parse_table`, shrinks it in place with
/// `shrink_parse_table`, and returns it together with two default `LexTable`s
/// and `None` for the optional symbol.
///
/// NOTE(review): this span is a flattened diff hunk. The `@ …` marker line hides
/// part of the parameter list, and the `ParseTableBuilder { … }.build()`
/// expression looks like the pre-change body that the three statements at the
/// end replace — confirm against the real file before relying on it.
pub(crate) fn build_tables(
syntax_grammar: &SyntaxGrammar,
@ -609,20 +19,8 @@ pub(crate) fn build_tables(
simple_aliases: &AliasMap,
inlines: &InlinedProductionMap,
) -> Result<(ParseTable, LexTable, LexTable, Option<Symbol>)> {
ParseTableBuilder {
syntax_grammar,
lexical_grammar,
simple_aliases,
inlines,
item_set_builder: ParseItemSetBuilder::new(syntax_grammar, lexical_grammar, inlines),
state_ids_by_item_set: HashMap::new(),
item_sets_by_state_id: Vec::new(),
parse_state_queue: VecDeque::new(),
parse_table: ParseTable {
states: Vec::new(),
alias_sequences: Vec::new(),
symbols: Vec::new(),
},
}
.build()
// Build the full table first; a failure is propagated to the caller via `?`.
let mut parse_table = build_parse_table(syntax_grammar, lexical_grammar, inlines)?;
// Shrinking mutates the freshly built table in place.
shrink_parse_table(&mut parse_table, syntax_grammar, simple_aliases);
Ok((parse_table, LexTable::default(), LexTable::default(), None))
}

View file

@ -0,0 +1,117 @@
use crate::grammars::{SyntaxGrammar, VariableType};
use crate::rules::AliasMap;
use crate::tables::{ParseAction, ParseTable};
use std::collections::{HashMap, HashSet};
/// Shrinks `parse_table` in place.
///
/// Runs two passes in order: `remove_unit_reductions`, which redirects
/// transitions away from states that only perform unit reductions, followed by
/// `remove_unused_states`, which drops states that are no longer referenced and
/// renumbers the remaining ones.
pub(crate) fn shrink_parse_table(
parse_table: &mut ParseTable,
syntax_grammar: &SyntaxGrammar,
simple_aliases: &AliasMap,
) {
remove_unit_reductions(parse_table, syntax_grammar, simple_aliases);
// Must run second: the pass above is what leaves states unreferenced.
remove_unused_states(parse_table);
}
/// Redirects every transition that leads into a "unit reduction" state so that
/// it bypasses that state.
///
/// A state counts as a unit-reduction state when every action in its terminal
/// entries is either `ShiftExtra` or a `Reduce` with `child_count == 1` and
/// `alias_sequence_id == 0`, all of those reductions produce the same symbol,
/// and that symbol
///
/// * is not a key of `simple_aliases`,
/// * is not given an alias by any production step in the grammar, and
/// * belongs to a syntax variable whose kind is not `VariableType::Named`.
///
/// For each such state, any reference to it from another state is replaced by
/// the referring state's own `nonterminal_entries` target for the reduced
/// symbol. The replacement is repeated until no reference changes any more.
fn remove_unit_reductions(
    parse_table: &mut ParseTable,
    syntax_grammar: &SyntaxGrammar,
    simple_aliases: &AliasMap,
) {
    // Symbols that appear under an alias in at least one production step.
    let aliased_symbols: HashSet<_> = syntax_grammar
        .variables
        .iter()
        .flat_map(|variable| &variable.productions)
        .flat_map(|production| &production.steps)
        .filter(|step| step.alias.is_some())
        .map(|step| step.symbol)
        .collect();

    // Maps the id of each unit-reduction state to the symbol it reduces to.
    let mut unit_reduction_symbols_by_state = HashMap::new();
    for (state_id, state) in parse_table.states.iter().enumerate() {
        let mut candidate = None;
        let mut disqualified = false;

        'entries: for (_, entry) in &state.terminal_entries {
            for action in &entry.actions {
                match action {
                    // Extras do not affect whether the state qualifies.
                    ParseAction::ShiftExtra => {}
                    ParseAction::Reduce {
                        child_count: 1,
                        alias_sequence_id: 0,
                        symbol,
                        ..
                    } if !simple_aliases.contains_key(&symbol)
                        && !aliased_symbols.contains(&symbol)
                        && syntax_grammar.variables[symbol.index].kind != VariableType::Named
                        && (candidate.is_none() || candidate == Some(symbol)) =>
                    {
                        candidate = Some(symbol);
                    }
                    // Any other action (or an ineligible reduction) rules the
                    // whole state out; no need to look any further.
                    _ => {
                        disqualified = true;
                        break 'entries;
                    }
                }
            }
        }

        if !disqualified {
            if let Some(symbol) = candidate {
                unit_reduction_symbols_by_state.insert(state_id, *symbol);
            }
        }
    }

    for state in parse_table.states.iter_mut() {
        // A redirect may itself land on another unit-reduction state, so keep
        // rewriting until a full pass changes nothing.
        loop {
            let mut redirected = false;
            state.update_referenced_states(|target_id, referrer| {
                match unit_reduction_symbols_by_state.get(&target_id) {
                    Some(symbol) => {
                        redirected = true;
                        referrer.nonterminal_entries[symbol]
                    }
                    None => target_id,
                }
            });
            if !redirected {
                break;
            }
        }
    }
}
/// Deletes every parse state that nothing refers to and renumbers the
/// references held by the remaining states accordingly.
///
/// A state is kept when it is referenced by some state's shift action or
/// non-terminal entry, or when it is one of the table's entry points: the
/// parse table builder always creates the error state at index 0 and the start
/// state at index 1. Those two are never the target of a transition, so they
/// have to be marked as used explicitly; otherwise they would be removed and
/// every other state id would shift.
fn remove_unused_states(parse_table: &mut ParseTable) {
    let state_count = parse_table.states.len();

    let mut state_usage_map = vec![false; state_count];
    // Entry points (error state and start state). `take(2)` keeps this safe
    // for tables that contain fewer than two states.
    for is_used in state_usage_map.iter_mut().take(2) {
        *is_used = true;
    }
    for state in &parse_table.states {
        for referenced_state in state.referenced_states() {
            state_usage_map[referenced_state] = true;
        }
    }

    // New id of each state = old id minus the number of removed states that
    // precede it. Entries for removed states are never looked up.
    let mut removed_predecessor_count = 0;
    let mut state_replacement_map = vec![0; state_count];
    for (state_id, is_used) in state_usage_map.iter().enumerate() {
        state_replacement_map[state_id] = state_id - removed_predecessor_count;
        if !*is_used {
            removed_predecessor_count += 1;
        }
    }

    // Rewrite the references of the states that survive.
    for (state, is_used) in parse_table.states.iter_mut().zip(&state_usage_map) {
        if *is_used {
            state.update_referenced_states(|other_state_id, _| {
                state_replacement_map[other_state_id]
            });
        }
    }

    // Drop the unused states in a single pass; `retain` visits the elements in
    // their original order, so the usage flags line up one-to-one.
    let mut usage = state_usage_map.into_iter();
    parse_table
        .states
        .retain(|_| usage.next().unwrap_or(false));
}

View file

@ -0,0 +1,77 @@
use crate::grammars::{LexicalGrammar, LexicalVariable};
use crate::nfa::{CharacterSet, NfaCursor};
use std::collections::HashSet;
/// One cell of the `TokenConflictMap` status matrix; both flags default to `false`.
///
/// NOTE(review): nothing in this file reads or writes these flags yet, so the
/// field descriptions below are taken from the field names only — confirm once
/// the code that fills the matrix exists.
#[derive(Default)]
struct TokenConflictStatus {
// Presumably: both tokens can match exactly the same string.
matches_same_string: bool,
// Presumably: one token can match a longer string whose next character is
// also valid for the other token.
matches_longer_string_with_valid_next_char: bool,
}
/// Per-token lexical information derived from a `LexicalGrammar`.
pub(crate) struct TokenConflictMap {
// For each lexical variable, in grammar order, the set of characters that
// can start the token.
starting_chars_by_index: Vec<CharacterSet>,
// Storage for pairwise conflict statuses. `new` reserves room for one entry
// per ordered pair of tokens but does not populate it.
status_matrix: Vec<TokenConflictStatus>,
}
impl TokenConflictMap {
pub fn new(grammar: &LexicalGrammar) -> Self {
let mut cursor = NfaCursor::new(&grammar.nfa, Vec::new());
let mut starting_chars_by_index = Vec::with_capacity(grammar.variables.len());
for variable in &grammar.variables {
cursor.reset(vec![variable.start_state]);
let mut all_chars = CharacterSet::empty();
for (chars, _, _) in cursor.successors() {
all_chars = all_chars.add(chars);
}
starting_chars_by_index.push(all_chars);
}
let status_matrix =
Vec::with_capacity(grammar.variables.len() * grammar.variables.len());
TokenConflictMap {
starting_chars_by_index,
status_matrix,
}
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::grammars::{Variable, VariableType};
    use crate::prepare_grammar::{expand_tokens, ExtractedLexicalGrammar};
    use crate::rules::Rule;

    /// Builds a named token variable whose rule is the given regex pattern.
    fn named_token(name: &str, pattern: &str) -> Variable {
        Variable {
            name: name.to_string(),
            kind: VariableType::Named,
            rule: Rule::pattern(pattern),
        }
    }

    /// The starting characters of a token are the characters that can begin
    /// any of its alternatives, including those after an optional prefix.
    #[test]
    fn test_starting_characters() {
        let extracted = ExtractedLexicalGrammar {
            separators: Vec::new(),
            variables: vec![
                named_token("token_0", "[a-f]1|0x\\d"),
                named_token("token_1", "d*ef"),
            ],
        };
        let grammar = expand_tokens(extracted).unwrap();

        let token_map = TokenConflictMap::new(&grammar);

        // `[a-f]1|0x\d` starts with a letter in a..f or with the digit 0.
        let expected_token_0 = CharacterSet::empty().add_range('a', 'f').add_char('0');
        assert_eq!(token_map.starting_chars_by_index[0], expected_token_0);

        // `d*ef` starts with `d`, or with `e` when the repetition is empty.
        let expected_token_1 = CharacterSet::empty().add_range('d', 'e');
        assert_eq!(token_map.starting_chars_by_index[1], expected_token_1);
    }
}

View file

@ -1,7 +1,7 @@
use crate::nfa::CharacterSet;
use crate::rules::{Alias, Associativity, Symbol};
use std::collections::HashMap;
use std::ops::Range;
use crate::rules::{Associativity, Symbol, Alias};
use crate::nfa::CharacterSet;
pub(crate) type AliasSequenceId = usize;
pub(crate) type ParseStateId = usize;
@ -23,7 +23,7 @@ pub(crate) enum ParseAction {
dynamic_precedence: i32,
associativity: Option<Associativity>,
alias_sequence_id: AliasSequenceId,
}
},
}
#[derive(Clone, Debug, PartialEq, Eq)]
@ -86,6 +86,56 @@ impl Default for LexTable {
}
}
impl ParseState {
    /// Iterates over every state id this state refers to: first the targets of
    /// all `Shift` actions in the terminal entries, then the targets stored in
    /// the non-terminal entries. Ids may be yielded more than once.
    pub fn referenced_states<'a>(&'a self) -> impl Iterator<Item = ParseStateId> + 'a {
        let shift_targets = self.terminal_entries.iter().flat_map(|(_, entry)| {
            entry.actions.iter().filter_map(|action| {
                if let ParseAction::Shift { state, .. } = action {
                    Some(*state)
                } else {
                    None
                }
            })
        });
        let goto_targets = self.nonterminal_entries.iter().map(|(_, state)| *state);
        shift_targets.chain(goto_targets)
    }

    /// Rewrites the state ids referenced by this state.
    ///
    /// `f` is called once per reference — all `Shift` actions first, then all
    /// non-terminal entries — with the currently referenced id and this state
    /// (as it was before any rewrite). Whatever `f` returns becomes the new
    /// referenced id. Changes are collected first and applied afterwards, so
    /// `f` never observes a partially updated state.
    pub fn update_referenced_states<F>(&mut self, mut f: F)
    where
        F: FnMut(usize, &ParseState) -> usize,
    {
        // (symbol, index of the action within its entry, replacement id);
        // the action index is unused (0) for non-terminal entries.
        let mut pending = Vec::new();

        for (symbol, entry) in &self.terminal_entries {
            for (action_index, action) in entry.actions.iter().enumerate() {
                if let ParseAction::Shift { state, .. } = action {
                    let old_state = *state;
                    let new_state = f(old_state, self);
                    if new_state != old_state {
                        pending.push((*symbol, action_index, new_state));
                    }
                }
            }
        }

        for (symbol, old_state) in &self.nonterminal_entries {
            let new_state = f(*old_state, self);
            if new_state != *old_state {
                pending.push((*symbol, 0, new_state));
            }
        }

        for (symbol, action_index, new_state) in pending {
            if symbol.is_non_terminal() {
                self.nonterminal_entries.insert(symbol, new_state);
                continue;
            }
            let entry = self.terminal_entries.get_mut(&symbol).unwrap();
            // Only the target changes; `is_repetition` is left untouched.
            if let ParseAction::Shift { state, .. } = &mut entry.actions[action_index] {
                *state = new_state;
            }
        }
    }
}
impl ParseAction {
pub fn precedence(&self) -> i32 {
if let ParseAction::Reduce { precedence, .. } = self {