Move code into cli directory
This commit is contained in:
parent
b8dd5d2640
commit
5b0e12ea33
29 changed files with 32 additions and 26 deletions
278
cli/src/build_tables/build_lex_table.rs
Normal file
278
cli/src/build_tables/build_lex_table.rs
Normal file
|
|
@ -0,0 +1,278 @@
|
|||
use super::item::LookaheadSet;
|
||||
use super::token_conflicts::TokenConflictMap;
|
||||
use crate::grammars::{LexicalGrammar, SyntaxGrammar};
|
||||
use crate::nfa::{CharacterSet, NfaCursor, NfaTransition};
|
||||
use crate::rules::Symbol;
|
||||
use crate::tables::{AdvanceAction, LexState, LexTable, ParseTable};
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::{BTreeMap, HashMap, VecDeque};
|
||||
|
||||
pub(crate) fn build_lex_table(
|
||||
parse_table: &mut ParseTable,
|
||||
syntax_grammar: &SyntaxGrammar,
|
||||
lexical_grammar: &LexicalGrammar,
|
||||
keywords: &LookaheadSet,
|
||||
minimize: bool,
|
||||
) -> (LexTable, LexTable) {
|
||||
let keyword_lex_table;
|
||||
if syntax_grammar.word_token.is_some() {
|
||||
let mut builder = LexTableBuilder::new(lexical_grammar);
|
||||
builder.add_state_for_tokens(keywords);
|
||||
keyword_lex_table = builder.table;
|
||||
} else {
|
||||
keyword_lex_table = LexTable::default();
|
||||
}
|
||||
|
||||
let mut builder = LexTableBuilder::new(lexical_grammar);
|
||||
for state in parse_table.states.iter_mut() {
|
||||
let tokens = LookaheadSet::with(state.terminal_entries.keys().filter_map(|token| {
|
||||
if token.is_terminal() {
|
||||
if keywords.contains(&token) {
|
||||
syntax_grammar.word_token
|
||||
} else {
|
||||
Some(*token)
|
||||
}
|
||||
} else if token.is_eof() {
|
||||
Some(*token)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}));
|
||||
state.lex_state_id = builder.add_state_for_tokens(&tokens);
|
||||
}
|
||||
|
||||
let mut table = builder.table;
|
||||
|
||||
if minimize {
|
||||
minimize_lex_table(&mut table, parse_table);
|
||||
}
|
||||
|
||||
(table, keyword_lex_table)
|
||||
}
|
||||
|
||||
struct QueueEntry {
|
||||
state_id: usize,
|
||||
nfa_states: Vec<u32>,
|
||||
eof_valid: bool,
|
||||
}
|
||||
|
||||
struct LexTableBuilder<'a> {
|
||||
lexical_grammar: &'a LexicalGrammar,
|
||||
cursor: NfaCursor<'a>,
|
||||
table: LexTable,
|
||||
state_queue: VecDeque<QueueEntry>,
|
||||
state_ids_by_nfa_state_set: HashMap<(Vec<u32>, bool), usize>,
|
||||
}
|
||||
|
||||
impl<'a> LexTableBuilder<'a> {
|
||||
fn new(lexical_grammar: &'a LexicalGrammar) -> Self {
|
||||
Self {
|
||||
lexical_grammar,
|
||||
cursor: NfaCursor::new(&lexical_grammar.nfa, vec![]),
|
||||
table: LexTable::default(),
|
||||
state_queue: VecDeque::new(),
|
||||
state_ids_by_nfa_state_set: HashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn add_state_for_tokens(&mut self, tokens: &LookaheadSet) -> usize {
|
||||
let mut eof_valid = false;
|
||||
let nfa_states = tokens
|
||||
.iter()
|
||||
.filter_map(|token| {
|
||||
if token.is_terminal() {
|
||||
Some(self.lexical_grammar.variables[token.index].start_state)
|
||||
} else {
|
||||
eof_valid = true;
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
let (state_id, is_new) = self.add_state(nfa_states, eof_valid);
|
||||
|
||||
if is_new {
|
||||
info!(
|
||||
"entry point state: {}, tokens: {:?}",
|
||||
state_id,
|
||||
tokens
|
||||
.iter()
|
||||
.map(|t| &self.lexical_grammar.variables[t.index].name)
|
||||
.collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
while let Some(QueueEntry {
|
||||
state_id,
|
||||
nfa_states,
|
||||
eof_valid,
|
||||
}) = self.state_queue.pop_front()
|
||||
{
|
||||
self.populate_state(state_id, nfa_states, eof_valid);
|
||||
}
|
||||
state_id
|
||||
}
|
||||
|
||||
fn add_state(&mut self, nfa_states: Vec<u32>, eof_valid: bool) -> (usize, bool) {
|
||||
self.cursor.reset(nfa_states);
|
||||
match self
|
||||
.state_ids_by_nfa_state_set
|
||||
.entry((self.cursor.state_ids.clone(), eof_valid))
|
||||
{
|
||||
Entry::Occupied(o) => (*o.get(), false),
|
||||
Entry::Vacant(v) => {
|
||||
let state_id = self.table.states.len();
|
||||
self.table.states.push(LexState::default());
|
||||
self.state_queue.push_back(QueueEntry {
|
||||
state_id,
|
||||
nfa_states: v.key().0.clone(),
|
||||
eof_valid,
|
||||
});
|
||||
v.insert(state_id);
|
||||
(state_id, true)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn populate_state(&mut self, state_id: usize, nfa_states: Vec<u32>, eof_valid: bool) {
|
||||
self.cursor.force_reset(nfa_states);
|
||||
|
||||
// The EOF state is represented as an empty list of NFA states.
|
||||
let mut completion = None;
|
||||
for (id, prec) in self.cursor.completions() {
|
||||
if let Some((prev_id, prev_precedence)) = completion {
|
||||
if TokenConflictMap::prefer_token(
|
||||
self.lexical_grammar,
|
||||
(prev_precedence, prev_id),
|
||||
(prec, id),
|
||||
) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
completion = Some((id, prec));
|
||||
}
|
||||
|
||||
info!(
|
||||
"lex state: {}, completion: {:?}",
|
||||
state_id,
|
||||
completion.map(|(id, prec)| (&self.lexical_grammar.variables[id].name, prec))
|
||||
);
|
||||
|
||||
let transitions = self.cursor.transitions();
|
||||
info!("lex state: {}, transitions: {:?}", state_id, transitions);
|
||||
|
||||
// If EOF is a valid lookahead token, add a transition predicated on the null
|
||||
// character that leads to the empty set of NFA states.
|
||||
if eof_valid {
|
||||
let (next_state_id, _) = self.add_state(Vec::new(), false);
|
||||
info!("lex state: {}, successor: EOF", state_id);
|
||||
self.table.states[state_id].advance_actions.push((
|
||||
CharacterSet::empty().add_char('\0'),
|
||||
AdvanceAction {
|
||||
state: Some(next_state_id),
|
||||
in_main_token: true,
|
||||
},
|
||||
));
|
||||
}
|
||||
|
||||
for NfaTransition {
|
||||
characters,
|
||||
precedence,
|
||||
states,
|
||||
is_separator,
|
||||
} in transitions
|
||||
{
|
||||
if let Some((_, completed_precedence)) = completion {
|
||||
if precedence < completed_precedence
|
||||
|| (precedence == completed_precedence && is_separator)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
}
|
||||
let (next_state_id, _) = self.add_state(states, eof_valid && is_separator);
|
||||
let next_state = if next_state_id == state_id {
|
||||
None
|
||||
} else {
|
||||
Some(next_state_id)
|
||||
};
|
||||
self.table.states[state_id].advance_actions.push((
|
||||
characters,
|
||||
AdvanceAction {
|
||||
state: next_state,
|
||||
in_main_token: !is_separator,
|
||||
},
|
||||
));
|
||||
}
|
||||
|
||||
if let Some((complete_id, _)) = completion {
|
||||
self.table.states[state_id].accept_action = Some(Symbol::terminal(complete_id));
|
||||
} else if self.cursor.state_ids.is_empty() {
|
||||
self.table.states[state_id].accept_action = Some(Symbol::end());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn minimize_lex_table(table: &mut LexTable, parse_table: &mut ParseTable) {
|
||||
let mut state_replacements = BTreeMap::new();
|
||||
let mut done = false;
|
||||
while !done {
|
||||
done = true;
|
||||
for (i, state_i) in table.states.iter().enumerate() {
|
||||
if state_replacements.contains_key(&i) {
|
||||
continue;
|
||||
}
|
||||
for (j, state_j) in table.states.iter().enumerate() {
|
||||
if j == i {
|
||||
break;
|
||||
}
|
||||
if state_replacements.contains_key(&j) {
|
||||
continue;
|
||||
}
|
||||
if state_i == state_j {
|
||||
info!("replace state {} with state {}", i, j);
|
||||
state_replacements.insert(i, j);
|
||||
done = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
for state in table.states.iter_mut() {
|
||||
for (_, advance_action) in state.advance_actions.iter_mut() {
|
||||
advance_action.state = advance_action
|
||||
.state
|
||||
.map(|s| state_replacements.get(&s).cloned().unwrap_or(s))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let final_state_replacements = (0..table.states.len())
|
||||
.into_iter()
|
||||
.map(|state_id| {
|
||||
let replacement = state_replacements
|
||||
.get(&state_id)
|
||||
.cloned()
|
||||
.unwrap_or(state_id);
|
||||
let prior_removed = state_replacements
|
||||
.iter()
|
||||
.take_while(|i| *i.0 < replacement)
|
||||
.count();
|
||||
replacement - prior_removed
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
for state in parse_table.states.iter_mut() {
|
||||
state.lex_state_id = final_state_replacements[state.lex_state_id];
|
||||
}
|
||||
|
||||
for state in table.states.iter_mut() {
|
||||
for (_, advance_action) in state.advance_actions.iter_mut() {
|
||||
advance_action.state = advance_action.state.map(|s| final_state_replacements[s]);
|
||||
}
|
||||
}
|
||||
|
||||
let mut i = 0;
|
||||
table.states.retain(|_| {
|
||||
let result = !state_replacements.contains_key(&i);
|
||||
i += 1;
|
||||
result
|
||||
});
|
||||
}
|
||||
735
cli/src/build_tables/build_parse_table.rs
Normal file
735
cli/src/build_tables/build_parse_table.rs
Normal file
|
|
@ -0,0 +1,735 @@
|
|||
use super::item::{LookaheadSet, ParseItem, ParseItemSet};
|
||||
use super::item_set_builder::ParseItemSetBuilder;
|
||||
use crate::error::{Error, Result};
|
||||
use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar, VariableType};
|
||||
use crate::rules::{Alias, Associativity, Symbol, SymbolType};
|
||||
use crate::tables::{
|
||||
AliasSequenceId, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry,
|
||||
};
|
||||
use core::ops::Range;
|
||||
use hashbrown::hash_map::Entry;
|
||||
use hashbrown::{HashMap, HashSet};
|
||||
use std::collections::hash_map::DefaultHasher;
|
||||
use std::collections::VecDeque;
|
||||
|
||||
use std::fmt::Write;
|
||||
use std::hash::Hasher;
|
||||
|
||||
#[derive(Clone)]
|
||||
struct AuxiliarySymbolInfo {
|
||||
auxiliary_symbol: Symbol,
|
||||
parent_symbols: Vec<Symbol>,
|
||||
}
|
||||
|
||||
type SymbolSequence = Vec<Symbol>;
|
||||
type AuxiliarySymbolSequence = Vec<AuxiliarySymbolInfo>;
|
||||
|
||||
struct ParseStateQueueEntry {
|
||||
preceding_symbols: SymbolSequence,
|
||||
preceding_auxiliary_symbols: AuxiliarySymbolSequence,
|
||||
state_id: ParseStateId,
|
||||
}
|
||||
|
||||
struct ParseTableBuilder<'a> {
|
||||
item_set_builder: ParseItemSetBuilder<'a>,
|
||||
syntax_grammar: &'a SyntaxGrammar,
|
||||
lexical_grammar: &'a LexicalGrammar,
|
||||
state_ids_by_item_set: HashMap<ParseItemSet<'a>, ParseStateId>,
|
||||
item_sets_by_state_id: Vec<ParseItemSet<'a>>,
|
||||
parse_state_queue: VecDeque<ParseStateQueueEntry>,
|
||||
parse_table: ParseTable,
|
||||
following_tokens: Vec<LookaheadSet>,
|
||||
state_ids_to_log: Vec<ParseStateId>,
|
||||
}
|
||||
|
||||
impl<'a> ParseTableBuilder<'a> {
|
||||
fn build(mut self) -> Result<(ParseTable, Vec<LookaheadSet>)> {
|
||||
// Ensure that the empty alias sequence has index 0.
|
||||
self.parse_table.alias_sequences.push(Vec::new());
|
||||
|
||||
// Add the error state at index 0.
|
||||
self.add_parse_state(&Vec::new(), &Vec::new(), ParseItemSet::default());
|
||||
|
||||
// Add the starting state at index 1.
|
||||
self.add_parse_state(
|
||||
&Vec::new(),
|
||||
&Vec::new(),
|
||||
ParseItemSet::with(
|
||||
[(
|
||||
ParseItem::start(),
|
||||
LookaheadSet::with([Symbol::end()].iter().cloned()),
|
||||
)]
|
||||
.iter()
|
||||
.cloned(),
|
||||
),
|
||||
);
|
||||
|
||||
while let Some(entry) = self.parse_state_queue.pop_front() {
|
||||
let item_set = self
|
||||
.item_set_builder
|
||||
.transitive_closure(&self.item_sets_by_state_id[entry.state_id]);
|
||||
|
||||
if self.state_ids_to_log.contains(&entry.state_id) {
|
||||
eprintln!(
|
||||
"state: {}\n\ninitial item set:\n\n{}closed item set:\n\n{}",
|
||||
entry.state_id,
|
||||
super::item::ParseItemSetDisplay(
|
||||
&self.item_sets_by_state_id[entry.state_id],
|
||||
self.syntax_grammar,
|
||||
self.lexical_grammar,
|
||||
),
|
||||
super::item::ParseItemSetDisplay(
|
||||
&item_set,
|
||||
self.syntax_grammar,
|
||||
self.lexical_grammar,
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
self.add_actions(
|
||||
entry.preceding_symbols,
|
||||
entry.preceding_auxiliary_symbols,
|
||||
entry.state_id,
|
||||
item_set,
|
||||
)?;
|
||||
}
|
||||
|
||||
self.populate_used_symbols();
|
||||
self.remove_precedences();
|
||||
|
||||
Ok((self.parse_table, self.following_tokens))
|
||||
}
|
||||
|
||||
fn add_parse_state(
|
||||
&mut self,
|
||||
preceding_symbols: &SymbolSequence,
|
||||
preceding_auxiliary_symbols: &AuxiliarySymbolSequence,
|
||||
item_set: ParseItemSet<'a>,
|
||||
) -> ParseStateId {
|
||||
if preceding_symbols.len() > 1 {
|
||||
let left_tokens = self
|
||||
.item_set_builder
|
||||
.last_set(&preceding_symbols[preceding_symbols.len() - 2]);
|
||||
let right_tokens = self
|
||||
.item_set_builder
|
||||
.first_set(&preceding_symbols[preceding_symbols.len() - 1]);
|
||||
for left_token in left_tokens.iter() {
|
||||
if left_token.is_terminal() {
|
||||
self.following_tokens[left_token.index].insert_all(right_tokens);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut hasher = DefaultHasher::new();
|
||||
item_set.hash_unfinished_items(&mut hasher);
|
||||
let unfinished_item_signature = hasher.finish();
|
||||
|
||||
match self.state_ids_by_item_set.entry(item_set) {
|
||||
Entry::Occupied(o) => *o.get(),
|
||||
Entry::Vacant(v) => {
|
||||
let state_id = self.parse_table.states.len();
|
||||
self.item_sets_by_state_id.push(v.key().clone());
|
||||
self.parse_table.states.push(ParseState {
|
||||
lex_state_id: 0,
|
||||
terminal_entries: HashMap::new(),
|
||||
nonterminal_entries: HashMap::new(),
|
||||
unfinished_item_signature,
|
||||
});
|
||||
self.parse_state_queue.push_back(ParseStateQueueEntry {
|
||||
state_id,
|
||||
preceding_symbols: preceding_symbols.clone(),
|
||||
preceding_auxiliary_symbols: preceding_auxiliary_symbols.clone(),
|
||||
});
|
||||
v.insert(state_id);
|
||||
state_id
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn add_actions(
|
||||
&mut self,
|
||||
mut preceding_symbols: SymbolSequence,
|
||||
mut preceding_auxiliary_symbols: Vec<AuxiliarySymbolInfo>,
|
||||
state_id: ParseStateId,
|
||||
item_set: ParseItemSet<'a>,
|
||||
) -> Result<()> {
|
||||
let mut terminal_successors = HashMap::new();
|
||||
let mut non_terminal_successors = HashMap::new();
|
||||
let mut lookaheads_with_conflicts = HashSet::new();
|
||||
|
||||
for (item, lookaheads) in &item_set.entries {
|
||||
if let Some(next_symbol) = item.symbol() {
|
||||
let successor = item.successor();
|
||||
if next_symbol.is_non_terminal() {
|
||||
// Keep track of where auxiliary non-terminals (repeat symbols) are
|
||||
// used within visible symbols. This information may be needed later
|
||||
// for conflict resolution.
|
||||
if self.syntax_grammar.variables[next_symbol.index].is_auxiliary() {
|
||||
preceding_auxiliary_symbols
|
||||
.push(self.get_auxiliary_node_info(&item_set, next_symbol));
|
||||
}
|
||||
|
||||
non_terminal_successors
|
||||
.entry(next_symbol)
|
||||
.or_insert_with(|| ParseItemSet::default())
|
||||
.entries
|
||||
.entry(successor)
|
||||
.or_insert_with(|| LookaheadSet::new())
|
||||
.insert_all(lookaheads);
|
||||
} else {
|
||||
terminal_successors
|
||||
.entry(next_symbol)
|
||||
.or_insert_with(|| ParseItemSet::default())
|
||||
.entries
|
||||
.entry(successor)
|
||||
.or_insert_with(|| LookaheadSet::new())
|
||||
.insert_all(lookaheads);
|
||||
}
|
||||
} else {
|
||||
let action = if item.is_augmented() {
|
||||
ParseAction::Accept
|
||||
} else {
|
||||
ParseAction::Reduce {
|
||||
symbol: Symbol::non_terminal(item.variable_index as usize),
|
||||
child_count: item.step_index as usize,
|
||||
precedence: item.precedence(),
|
||||
associativity: item.associativity(),
|
||||
dynamic_precedence: item.production.dynamic_precedence,
|
||||
alias_sequence_id: self.get_alias_sequence_id(item),
|
||||
}
|
||||
};
|
||||
|
||||
for lookahead in lookaheads.iter() {
|
||||
let entry = self.parse_table.states[state_id]
|
||||
.terminal_entries
|
||||
.entry(lookahead);
|
||||
let entry = entry.or_insert_with(|| ParseTableEntry::new());
|
||||
if entry.actions.is_empty() {
|
||||
entry.actions.push(action);
|
||||
} else if action.precedence() > entry.actions[0].precedence() {
|
||||
entry.actions.clear();
|
||||
entry.actions.push(action);
|
||||
lookaheads_with_conflicts.remove(&lookahead);
|
||||
} else if action.precedence() == entry.actions[0].precedence() {
|
||||
entry.actions.push(action);
|
||||
lookaheads_with_conflicts.insert(lookahead);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (symbol, next_item_set) in terminal_successors {
|
||||
preceding_symbols.push(symbol);
|
||||
let next_state_id = self.add_parse_state(
|
||||
&preceding_symbols,
|
||||
&preceding_auxiliary_symbols,
|
||||
next_item_set,
|
||||
);
|
||||
preceding_symbols.pop();
|
||||
|
||||
let entry = self.parse_table.states[state_id]
|
||||
.terminal_entries
|
||||
.entry(symbol);
|
||||
if let Entry::Occupied(e) = &entry {
|
||||
if !e.get().actions.is_empty() {
|
||||
lookaheads_with_conflicts.insert(symbol);
|
||||
}
|
||||
}
|
||||
|
||||
entry
|
||||
.or_insert_with(|| ParseTableEntry::new())
|
||||
.actions
|
||||
.push(ParseAction::Shift {
|
||||
state: next_state_id,
|
||||
is_repetition: false,
|
||||
});
|
||||
}
|
||||
|
||||
for (symbol, next_item_set) in non_terminal_successors {
|
||||
preceding_symbols.push(symbol);
|
||||
let next_state_id = self.add_parse_state(
|
||||
&preceding_symbols,
|
||||
&preceding_auxiliary_symbols,
|
||||
next_item_set,
|
||||
);
|
||||
preceding_symbols.pop();
|
||||
self.parse_table.states[state_id]
|
||||
.nonterminal_entries
|
||||
.insert(symbol, next_state_id);
|
||||
}
|
||||
|
||||
for symbol in lookaheads_with_conflicts {
|
||||
self.handle_conflict(
|
||||
&item_set,
|
||||
state_id,
|
||||
&preceding_symbols,
|
||||
&preceding_auxiliary_symbols,
|
||||
symbol,
|
||||
)?;
|
||||
}
|
||||
|
||||
let state = &mut self.parse_table.states[state_id];
|
||||
for extra_token in &self.syntax_grammar.extra_tokens {
|
||||
state
|
||||
.terminal_entries
|
||||
.entry(*extra_token)
|
||||
.or_insert(ParseTableEntry {
|
||||
reusable: true,
|
||||
actions: vec![ParseAction::ShiftExtra],
|
||||
});
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn handle_conflict(
|
||||
&mut self,
|
||||
item_set: &ParseItemSet,
|
||||
state_id: ParseStateId,
|
||||
preceding_symbols: &SymbolSequence,
|
||||
preceding_auxiliary_symbols: &Vec<AuxiliarySymbolInfo>,
|
||||
conflicting_lookahead: Symbol,
|
||||
) -> Result<()> {
|
||||
let entry = self.parse_table.states[state_id]
|
||||
.terminal_entries
|
||||
.get_mut(&conflicting_lookahead)
|
||||
.unwrap();
|
||||
|
||||
// Determine which items in the set conflict with each other, and the
|
||||
// precedences associated with SHIFT vs REDUCE actions. There won't
|
||||
// be multiple REDUCE actions with different precedences; that is
|
||||
// sorted out ahead of time in `add_actions`. But there can still be
|
||||
// REDUCE-REDUCE conflicts where all actions have the *same*
|
||||
// precedence, and there can still be SHIFT/REDUCE conflicts.
|
||||
let reduce_precedence = entry.actions[0].precedence();
|
||||
let mut considered_associativity = false;
|
||||
let mut shift_precedence: Option<Range<i32>> = None;
|
||||
let mut conflicting_items = HashSet::new();
|
||||
for (item, lookaheads) in &item_set.entries {
|
||||
if let Some(step) = item.step() {
|
||||
if item.step_index > 0 {
|
||||
if self
|
||||
.item_set_builder
|
||||
.first_set(&step.symbol)
|
||||
.contains(&conflicting_lookahead)
|
||||
{
|
||||
conflicting_items.insert(item);
|
||||
let precedence = item.precedence();
|
||||
if let Some(range) = &mut shift_precedence {
|
||||
if precedence < range.start {
|
||||
range.start = precedence;
|
||||
} else if precedence > range.end {
|
||||
range.end = precedence;
|
||||
}
|
||||
} else {
|
||||
shift_precedence = Some(precedence..precedence);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if lookaheads.contains(&conflicting_lookahead) {
|
||||
conflicting_items.insert(item);
|
||||
}
|
||||
}
|
||||
|
||||
if let ParseAction::Shift { is_repetition, .. } = entry.actions.last_mut().unwrap() {
|
||||
let shift_precedence = shift_precedence.unwrap_or(0..0);
|
||||
|
||||
// If all of the items in the conflict have the same parent symbol,
|
||||
// and that parent symbols is auxiliary, then this is just the intentional
|
||||
// ambiguity associated with a repeat rule. Resolve that class of ambiguity
|
||||
// by leaving it in the parse table, but marking the SHIFT action with
|
||||
// an `is_repetition` flag.
|
||||
let conflicting_variable_index =
|
||||
conflicting_items.iter().next().unwrap().variable_index;
|
||||
if self.syntax_grammar.variables[conflicting_variable_index as usize].is_auxiliary() {
|
||||
if conflicting_items
|
||||
.iter()
|
||||
.all(|item| item.variable_index == conflicting_variable_index)
|
||||
{
|
||||
*is_repetition = true;
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
// If the SHIFT action has higher precedence, remove all the REDUCE actions.
|
||||
if shift_precedence.start > reduce_precedence
|
||||
|| (shift_precedence.start == reduce_precedence
|
||||
&& shift_precedence.end > reduce_precedence)
|
||||
{
|
||||
entry.actions.drain(0..entry.actions.len() - 1);
|
||||
}
|
||||
// If the REDUCE actions have higher precedence, remove the SHIFT action.
|
||||
else if shift_precedence.end < reduce_precedence
|
||||
|| (shift_precedence.end == reduce_precedence
|
||||
&& shift_precedence.start < reduce_precedence)
|
||||
{
|
||||
entry.actions.pop();
|
||||
conflicting_items.retain(|item| item.is_done());
|
||||
}
|
||||
// If the SHIFT and REDUCE actions have the same predence, consider
|
||||
// the REDUCE actions' associativity.
|
||||
else if shift_precedence == (reduce_precedence..reduce_precedence) {
|
||||
considered_associativity = true;
|
||||
let mut has_left = false;
|
||||
let mut has_right = false;
|
||||
let mut has_non = false;
|
||||
for action in &entry.actions {
|
||||
if let ParseAction::Reduce { associativity, .. } = action {
|
||||
match associativity {
|
||||
Some(Associativity::Left) => has_left = true,
|
||||
Some(Associativity::Right) => has_right = true,
|
||||
None => has_non = true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If all reduce actions are left associative, remove the SHIFT action.
|
||||
// If all reduce actions are right associative, remove the REDUCE actions.
|
||||
match (has_left, has_non, has_right) {
|
||||
(true, false, false) => {
|
||||
entry.actions.pop();
|
||||
conflicting_items.retain(|item| item.is_done());
|
||||
}
|
||||
(false, false, true) => {
|
||||
entry.actions.drain(0..entry.actions.len() - 1);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If all of the actions but one have been eliminated, then there's no problem.
|
||||
let entry = self.parse_table.states[state_id]
|
||||
.terminal_entries
|
||||
.get_mut(&conflicting_lookahead)
|
||||
.unwrap();
|
||||
if entry.actions.len() == 1 {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Determine the set of parent symbols involved in this conflict.
|
||||
let mut actual_conflict = Vec::new();
|
||||
for item in &conflicting_items {
|
||||
let symbol = Symbol::non_terminal(item.variable_index as usize);
|
||||
if self.syntax_grammar.variables[symbol.index].is_auxiliary() {
|
||||
actual_conflict.extend(
|
||||
preceding_auxiliary_symbols
|
||||
.iter()
|
||||
.rev()
|
||||
.find_map(|info| {
|
||||
if info.auxiliary_symbol == symbol {
|
||||
Some(&info.parent_symbols)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.unwrap()
|
||||
.iter(),
|
||||
);
|
||||
} else {
|
||||
actual_conflict.push(symbol);
|
||||
}
|
||||
}
|
||||
actual_conflict.sort_unstable();
|
||||
actual_conflict.dedup();
|
||||
|
||||
// If this set of symbols has been whitelisted, then there's no error.
|
||||
if self
|
||||
.syntax_grammar
|
||||
.expected_conflicts
|
||||
.contains(&actual_conflict)
|
||||
{
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let mut msg = "Unresolved conflict for symbol sequence:\n\n".to_string();
|
||||
for symbol in preceding_symbols {
|
||||
write!(&mut msg, " {}", self.symbol_name(symbol)).unwrap();
|
||||
}
|
||||
|
||||
write!(
|
||||
&mut msg,
|
||||
" • {} …\n\n",
|
||||
self.symbol_name(&conflicting_lookahead)
|
||||
)
|
||||
.unwrap();
|
||||
write!(&mut msg, "Possible interpretations:\n\n").unwrap();
|
||||
for (i, item) in conflicting_items.iter().enumerate() {
|
||||
write!(&mut msg, " {}:", i + 1).unwrap();
|
||||
|
||||
for preceding_symbol in preceding_symbols
|
||||
.iter()
|
||||
.take(preceding_symbols.len() - item.step_index as usize)
|
||||
{
|
||||
write!(&mut msg, " {}", self.symbol_name(preceding_symbol)).unwrap();
|
||||
}
|
||||
|
||||
write!(
|
||||
&mut msg,
|
||||
" ({}",
|
||||
&self.syntax_grammar.variables[item.variable_index as usize].name
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
for (j, step) in item.production.steps.iter().enumerate() {
|
||||
if j as u32 == item.step_index {
|
||||
write!(&mut msg, " •").unwrap();
|
||||
}
|
||||
write!(&mut msg, " {}", self.symbol_name(&step.symbol)).unwrap();
|
||||
}
|
||||
|
||||
write!(&mut msg, ")").unwrap();
|
||||
|
||||
if item.is_done() {
|
||||
write!(
|
||||
&mut msg,
|
||||
" • {}",
|
||||
self.symbol_name(&conflicting_lookahead)
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
let precedence = item.precedence();
|
||||
let associativity = item.associativity();
|
||||
if precedence != 0 || associativity.is_some() {
|
||||
write!(
|
||||
&mut msg,
|
||||
"(precedence: {}, associativity: {:?})",
|
||||
precedence, associativity
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
write!(&mut msg, "\n").unwrap();
|
||||
}
|
||||
|
||||
let mut resolution_count = 0;
|
||||
write!(&mut msg, "\nPossible resolutions:\n\n").unwrap();
|
||||
let shift_items = conflicting_items
|
||||
.iter()
|
||||
.filter(|i| !i.is_done())
|
||||
.cloned()
|
||||
.collect::<Vec<_>>();
|
||||
if shift_items.len() > 0 {
|
||||
resolution_count += 1;
|
||||
write!(
|
||||
&mut msg,
|
||||
" {}: Specify a higher precedence in",
|
||||
resolution_count
|
||||
)
|
||||
.unwrap();
|
||||
for (i, item) in shift_items.iter().enumerate() {
|
||||
if i > 0 {
|
||||
write!(&mut msg, " and").unwrap();
|
||||
}
|
||||
write!(
|
||||
&mut msg,
|
||||
" `{}`",
|
||||
self.symbol_name(&Symbol::non_terminal(item.variable_index as usize))
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
write!(&mut msg, " than in the other rules.\n").unwrap();
|
||||
}
|
||||
|
||||
if considered_associativity {
|
||||
resolution_count += 1;
|
||||
write!(
|
||||
&mut msg,
|
||||
" {}: Specify a left or right associativity in ",
|
||||
resolution_count
|
||||
)
|
||||
.unwrap();
|
||||
for (i, item) in conflicting_items.iter().filter(|i| i.is_done()).enumerate() {
|
||||
if i > 0 {
|
||||
write!(&mut msg, " and ").unwrap();
|
||||
}
|
||||
write!(
|
||||
&mut msg,
|
||||
"{}",
|
||||
self.symbol_name(&Symbol::non_terminal(item.variable_index as usize))
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
write!(&mut msg, "\n").unwrap();
|
||||
}
|
||||
|
||||
for item in &conflicting_items {
|
||||
if item.is_done() {
|
||||
resolution_count += 1;
|
||||
write!(
|
||||
&mut msg,
|
||||
" {}: Specify a higher precedence in `{}` than in the other rules.\n",
|
||||
resolution_count,
|
||||
self.symbol_name(&Symbol::non_terminal(item.variable_index as usize))
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
resolution_count += 1;
|
||||
write!(
|
||||
&mut msg,
|
||||
" {}: Add a conflict for these rules: ",
|
||||
resolution_count
|
||||
)
|
||||
.unwrap();
|
||||
for (i, symbol) in actual_conflict.iter().enumerate() {
|
||||
if i > 0 {
|
||||
write!(&mut msg, ", ").unwrap();
|
||||
}
|
||||
write!(&mut msg, "{}", self.symbol_name(symbol)).unwrap();
|
||||
}
|
||||
write!(&mut msg, "\n").unwrap();
|
||||
|
||||
Err(Error(msg))
|
||||
}
|
||||
|
||||
fn get_auxiliary_node_info(
|
||||
&self,
|
||||
item_set: &ParseItemSet,
|
||||
symbol: Symbol,
|
||||
) -> AuxiliarySymbolInfo {
|
||||
let parent_symbols = item_set
|
||||
.entries
|
||||
.keys()
|
||||
.filter_map(|item| {
|
||||
let variable_index = item.variable_index as usize;
|
||||
if item.symbol() == Some(symbol)
|
||||
&& !self.syntax_grammar.variables[variable_index].is_auxiliary()
|
||||
{
|
||||
Some(Symbol::non_terminal(variable_index))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
AuxiliarySymbolInfo {
|
||||
auxiliary_symbol: symbol,
|
||||
parent_symbols,
|
||||
}
|
||||
}
|
||||
|
||||
fn populate_used_symbols(&mut self) {
|
||||
let mut terminal_usages = vec![false; self.lexical_grammar.variables.len()];
|
||||
let mut non_terminal_usages = vec![false; self.syntax_grammar.variables.len()];
|
||||
let mut external_usages = vec![false; self.syntax_grammar.external_tokens.len()];
|
||||
for state in &self.parse_table.states {
|
||||
for symbol in state.terminal_entries.keys() {
|
||||
match symbol.kind {
|
||||
SymbolType::Terminal => terminal_usages[symbol.index] = true,
|
||||
SymbolType::External => external_usages[symbol.index] = true,
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
for symbol in state.nonterminal_entries.keys() {
|
||||
non_terminal_usages[symbol.index] = true;
|
||||
}
|
||||
}
|
||||
for (i, value) in external_usages.into_iter().enumerate() {
|
||||
if value {
|
||||
self.parse_table.symbols.push(Symbol::external(i));
|
||||
}
|
||||
}
|
||||
self.parse_table.symbols.push(Symbol::end());
|
||||
for (i, value) in terminal_usages.into_iter().enumerate() {
|
||||
if value {
|
||||
self.parse_table.symbols.push(Symbol::terminal(i));
|
||||
}
|
||||
}
|
||||
for (i, value) in non_terminal_usages.into_iter().enumerate() {
|
||||
if value {
|
||||
self.parse_table.symbols.push(Symbol::non_terminal(i));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn remove_precedences(&mut self) {
|
||||
for state in self.parse_table.states.iter_mut() {
|
||||
for (_, entry) in state.terminal_entries.iter_mut() {
|
||||
for action in entry.actions.iter_mut() {
|
||||
match action {
|
||||
ParseAction::Reduce {
|
||||
precedence,
|
||||
associativity,
|
||||
..
|
||||
} => {
|
||||
*precedence = 0;
|
||||
*associativity = None;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn get_alias_sequence_id(&mut self, item: &ParseItem) -> AliasSequenceId {
|
||||
let mut alias_sequence: Vec<Option<Alias>> = item
|
||||
.production
|
||||
.steps
|
||||
.iter()
|
||||
.map(|s| s.alias.clone())
|
||||
.collect();
|
||||
while alias_sequence.last() == Some(&None) {
|
||||
alias_sequence.pop();
|
||||
}
|
||||
if item.production.steps.len() > self.parse_table.max_aliased_production_length {
|
||||
self.parse_table.max_aliased_production_length = item.production.steps.len()
|
||||
}
|
||||
if let Some(index) = self
|
||||
.parse_table
|
||||
.alias_sequences
|
||||
.iter()
|
||||
.position(|seq| *seq == alias_sequence)
|
||||
{
|
||||
index
|
||||
} else {
|
||||
self.parse_table.alias_sequences.push(alias_sequence);
|
||||
self.parse_table.alias_sequences.len() - 1
|
||||
}
|
||||
}
|
||||
|
||||
fn symbol_name(&self, symbol: &Symbol) -> String {
|
||||
match symbol.kind {
|
||||
SymbolType::End => "EOF".to_string(),
|
||||
SymbolType::External => self.syntax_grammar.external_tokens[symbol.index]
|
||||
.name
|
||||
.clone(),
|
||||
SymbolType::NonTerminal => self.syntax_grammar.variables[symbol.index].name.clone(),
|
||||
SymbolType::Terminal => {
|
||||
let variable = &self.lexical_grammar.variables[symbol.index];
|
||||
if variable.kind == VariableType::Named {
|
||||
variable.name.clone()
|
||||
} else {
|
||||
format!("\"{}\"", &variable.name)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn build_parse_table(
|
||||
syntax_grammar: &SyntaxGrammar,
|
||||
lexical_grammar: &LexicalGrammar,
|
||||
inlines: &InlinedProductionMap,
|
||||
state_ids_to_log: Vec<usize>,
|
||||
) -> Result<(ParseTable, Vec<LookaheadSet>)> {
|
||||
ParseTableBuilder {
|
||||
syntax_grammar,
|
||||
lexical_grammar,
|
||||
state_ids_to_log,
|
||||
item_set_builder: ParseItemSetBuilder::new(syntax_grammar, lexical_grammar, inlines),
|
||||
state_ids_by_item_set: HashMap::new(),
|
||||
item_sets_by_state_id: Vec::new(),
|
||||
parse_state_queue: VecDeque::new(),
|
||||
parse_table: ParseTable {
|
||||
states: Vec::new(),
|
||||
symbols: Vec::new(),
|
||||
alias_sequences: Vec::new(),
|
||||
max_aliased_production_length: 0,
|
||||
},
|
||||
following_tokens: vec![LookaheadSet::new(); lexical_grammar.variables.len()],
|
||||
}
|
||||
.build()
|
||||
}
|
||||
71
cli/src/build_tables/coincident_tokens.rs
Normal file
71
cli/src/build_tables/coincident_tokens.rs
Normal file
|
|
@ -0,0 +1,71 @@
|
|||
use crate::grammars::LexicalGrammar;
|
||||
use crate::rules::Symbol;
|
||||
use crate::tables::{ParseStateId, ParseTable};
|
||||
use std::fmt;
|
||||
|
||||
pub(crate) struct CoincidentTokenIndex<'a> {
|
||||
entries: Vec<Vec<ParseStateId>>,
|
||||
grammar: &'a LexicalGrammar,
|
||||
n: usize,
|
||||
}
|
||||
|
||||
impl<'a> CoincidentTokenIndex<'a> {
|
||||
pub fn new(table: &ParseTable, lexical_grammar: &'a LexicalGrammar) -> Self {
|
||||
let n = lexical_grammar.variables.len();
|
||||
let mut result = Self {
|
||||
n,
|
||||
grammar: lexical_grammar,
|
||||
entries: vec![Vec::new(); n * n],
|
||||
};
|
||||
for (i, state) in table.states.iter().enumerate() {
|
||||
for symbol in state.terminal_entries.keys() {
|
||||
for other_symbol in state.terminal_entries.keys() {
|
||||
let index = result.index(symbol.index, other_symbol.index);
|
||||
if result.entries[index].last().cloned() != Some(i) {
|
||||
result.entries[index].push(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
pub fn states_with(&self, a: Symbol, b: Symbol) -> &Vec<ParseStateId> {
|
||||
&self.entries[self.index(a.index, b.index)]
|
||||
}
|
||||
|
||||
pub fn contains(&self, a: Symbol, b: Symbol) -> bool {
|
||||
!self.entries[self.index(a.index, b.index)].is_empty()
|
||||
}
|
||||
|
||||
fn index(&self, a: usize, b: usize) -> usize {
|
||||
if a < b {
|
||||
a * self.n + b
|
||||
} else {
|
||||
b * self.n + a
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> fmt::Debug for CoincidentTokenIndex<'a> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "CoincidentTokenIndex {{\n")?;
|
||||
|
||||
write!(f, " entries: {{\n")?;
|
||||
for i in 0..self.n {
|
||||
write!(f, " {}: {{\n", self.grammar.variables[i].name)?;
|
||||
for j in 0..self.n {
|
||||
write!(
|
||||
f,
|
||||
" {}: {:?},\n",
|
||||
self.grammar.variables[j].name,
|
||||
self.entries[self.index(i, j)].len()
|
||||
)?;
|
||||
}
|
||||
write!(f, " }},\n")?;
|
||||
}
|
||||
write!(f, " }},")?;
|
||||
write!(f, "}}")?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
446
cli/src/build_tables/item.rs
Normal file
446
cli/src/build_tables/item.rs
Normal file
|
|
@ -0,0 +1,446 @@
|
|||
use crate::grammars::{LexicalGrammar, Production, ProductionStep, SyntaxGrammar};
|
||||
use crate::rules::Associativity;
|
||||
use crate::rules::{Symbol, SymbolType};
|
||||
use smallbitvec::SmallBitVec;
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::BTreeMap;
|
||||
use std::fmt;
|
||||
use std::hash::{Hash, Hasher};
|
||||
use std::u32;
|
||||
|
||||
lazy_static! {
|
||||
static ref START_PRODUCTION: Production = Production {
|
||||
dynamic_precedence: 0,
|
||||
steps: vec![ProductionStep {
|
||||
symbol: Symbol {
|
||||
index: 0,
|
||||
kind: SymbolType::NonTerminal,
|
||||
},
|
||||
precedence: 0,
|
||||
associativity: None,
|
||||
alias: None,
|
||||
}],
|
||||
};
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||
pub(crate) struct LookaheadSet {
|
||||
terminal_bits: SmallBitVec,
|
||||
external_bits: SmallBitVec,
|
||||
eof: bool,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
pub(crate) struct ParseItem<'a> {
|
||||
pub variable_index: u32,
|
||||
pub step_index: u32,
|
||||
pub production: &'a Production,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Eq)]
|
||||
pub(crate) struct ParseItemSet<'a> {
|
||||
pub entries: BTreeMap<ParseItem<'a>, LookaheadSet>,
|
||||
}
|
||||
|
||||
pub(crate) struct ParseItemDisplay<'a>(
|
||||
pub &'a ParseItem<'a>,
|
||||
pub &'a SyntaxGrammar,
|
||||
pub &'a LexicalGrammar,
|
||||
);
|
||||
|
||||
pub(crate) struct LookaheadSetDisplay<'a>(&'a LookaheadSet, &'a SyntaxGrammar, &'a LexicalGrammar);
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub(crate) struct ParseItemSetDisplay<'a>(
|
||||
pub &'a ParseItemSet<'a>,
|
||||
pub &'a SyntaxGrammar,
|
||||
pub &'a LexicalGrammar,
|
||||
);
|
||||
|
||||
impl LookaheadSet {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
terminal_bits: SmallBitVec::new(),
|
||||
external_bits: SmallBitVec::new(),
|
||||
eof: false,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn iter<'a>(&'a self) -> impl Iterator<Item = Symbol> + 'a {
|
||||
self.terminal_bits
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter_map(|(i, value)| {
|
||||
if value {
|
||||
Some(Symbol::terminal(i))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.chain(
|
||||
self.external_bits
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter_map(|(i, value)| {
|
||||
if value {
|
||||
Some(Symbol::external(i))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}),
|
||||
)
|
||||
.chain(if self.eof { Some(Symbol::end()) } else { None })
|
||||
}
|
||||
|
||||
pub fn with(symbols: impl IntoIterator<Item = Symbol>) -> Self {
|
||||
let mut result = Self::new();
|
||||
for symbol in symbols {
|
||||
result.insert(symbol);
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
pub fn contains(&self, symbol: &Symbol) -> bool {
|
||||
match symbol.kind {
|
||||
SymbolType::NonTerminal => panic!("Cannot store non-terminals in a LookaheadSet"),
|
||||
SymbolType::Terminal => self.terminal_bits.get(symbol.index).unwrap_or(false),
|
||||
SymbolType::External => self.external_bits.get(symbol.index).unwrap_or(false),
|
||||
SymbolType::End => self.eof,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert(&mut self, other: Symbol) {
|
||||
let vec = match other.kind {
|
||||
SymbolType::NonTerminal => panic!("Cannot store non-terminals in a LookaheadSet"),
|
||||
SymbolType::Terminal => &mut self.terminal_bits,
|
||||
SymbolType::External => &mut self.external_bits,
|
||||
SymbolType::End => {
|
||||
self.eof = true;
|
||||
return;
|
||||
}
|
||||
};
|
||||
if other.index >= vec.len() {
|
||||
vec.resize(other.index + 1, false);
|
||||
}
|
||||
vec.set(other.index, true);
|
||||
}
|
||||
|
||||
pub fn insert_all(&mut self, other: &LookaheadSet) -> bool {
|
||||
let mut result = false;
|
||||
if other.terminal_bits.len() > self.terminal_bits.len() {
|
||||
self.terminal_bits.resize(other.terminal_bits.len(), false);
|
||||
}
|
||||
if other.external_bits.len() > self.external_bits.len() {
|
||||
self.external_bits.resize(other.external_bits.len(), false);
|
||||
}
|
||||
for (i, element) in other.terminal_bits.iter().enumerate() {
|
||||
if element {
|
||||
result |= !self.terminal_bits[i];
|
||||
self.terminal_bits.set(i, element);
|
||||
}
|
||||
}
|
||||
for (i, element) in other.external_bits.iter().enumerate() {
|
||||
if element {
|
||||
result |= !self.external_bits[i];
|
||||
self.external_bits.set(i, element);
|
||||
}
|
||||
}
|
||||
if other.eof {
|
||||
result |= !self.eof;
|
||||
self.eof = true;
|
||||
}
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> ParseItem<'a> {
|
||||
pub fn start() -> Self {
|
||||
ParseItem {
|
||||
variable_index: u32::MAX,
|
||||
production: &START_PRODUCTION,
|
||||
step_index: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn step(&self) -> Option<&'a ProductionStep> {
|
||||
self.production.steps.get(self.step_index as usize)
|
||||
}
|
||||
|
||||
pub fn symbol(&self) -> Option<Symbol> {
|
||||
self.step().map(|step| step.symbol)
|
||||
}
|
||||
|
||||
pub fn associativity(&self) -> Option<Associativity> {
|
||||
self.prev_step().and_then(|step| step.associativity)
|
||||
}
|
||||
|
||||
pub fn precedence(&self) -> i32 {
|
||||
self.prev_step().map_or(0, |step| step.precedence)
|
||||
}
|
||||
|
||||
pub fn prev_step(&self) -> Option<&'a ProductionStep> {
|
||||
if self.step_index > 0 {
|
||||
Some(&self.production.steps[self.step_index as usize - 1])
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_done(&self) -> bool {
|
||||
self.step_index as usize == self.production.steps.len()
|
||||
}
|
||||
|
||||
pub fn is_augmented(&self) -> bool {
|
||||
self.variable_index == u32::MAX
|
||||
}
|
||||
|
||||
pub fn successor(&self) -> ParseItem<'a> {
|
||||
ParseItem {
|
||||
variable_index: self.variable_index,
|
||||
production: self.production,
|
||||
step_index: self.step_index + 1,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> ParseItemSet<'a> {
|
||||
pub fn with(elements: impl IntoIterator<Item = (ParseItem<'a>, LookaheadSet)>) -> Self {
|
||||
let mut result = Self::default();
|
||||
for (item, lookaheads) in elements {
|
||||
result.entries.insert(item, lookaheads);
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
pub fn hash_unfinished_items(&self, h: &mut impl Hasher) {
|
||||
let mut previous_variable_index = u32::MAX;
|
||||
let mut previous_step_index = u32::MAX;
|
||||
for item in self.entries.keys() {
|
||||
if item.step().is_none() && item.variable_index != previous_variable_index
|
||||
|| item.step_index != previous_step_index
|
||||
{
|
||||
h.write_u32(item.variable_index);
|
||||
h.write_u32(item.step_index);
|
||||
previous_variable_index = item.variable_index;
|
||||
previous_step_index = item.step_index;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Default for ParseItemSet<'a> {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
entries: BTreeMap::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
impl<'a> fmt::Display for ParseItemDisplay<'a> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
|
||||
if self.0.is_augmented() {
|
||||
write!(f, "START →")?;
|
||||
} else {
|
||||
write!(
|
||||
f,
|
||||
"{} →",
|
||||
&self.1.variables[self.0.variable_index as usize].name
|
||||
)?;
|
||||
}
|
||||
|
||||
for (i, step) in self.0.production.steps.iter().enumerate() {
|
||||
if i == self.0.step_index as usize {
|
||||
write!(f, " •")?;
|
||||
if step.precedence != 0 || step.associativity.is_some() {
|
||||
write!(
|
||||
f,
|
||||
" (prec {:?} assoc {:?})",
|
||||
step.precedence, step.associativity
|
||||
)?;
|
||||
}
|
||||
}
|
||||
|
||||
write!(f, " ")?;
|
||||
if step.symbol.is_terminal() {
|
||||
if let Some(variable) = self.2.variables.get(step.symbol.index) {
|
||||
write!(f, "{}", &variable.name)?;
|
||||
} else {
|
||||
write!(f, "{}-{}", "terminal", step.symbol.index)?;
|
||||
}
|
||||
} else if step.symbol.is_external() {
|
||||
write!(f, "{}", &self.1.external_tokens[step.symbol.index].name)?;
|
||||
} else {
|
||||
write!(f, "{}", &self.1.variables[step.symbol.index].name)?;
|
||||
}
|
||||
|
||||
if let Some(alias) = &step.alias {
|
||||
write!(f, " (alias {})", alias.value)?;
|
||||
}
|
||||
}
|
||||
|
||||
if self.0.is_done() {
|
||||
write!(f, " •")?;
|
||||
if let Some(step) = self.0.production.steps.last() {
|
||||
if step.precedence != 0 || step.associativity.is_some() {
|
||||
write!(
|
||||
f,
|
||||
" (prec {:?} assoc {:?})",
|
||||
step.precedence, step.associativity
|
||||
)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> fmt::Display for LookaheadSetDisplay<'a> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
|
||||
write!(f, "[")?;
|
||||
for (i, symbol) in self.0.iter().enumerate() {
|
||||
if i > 0 {
|
||||
write!(f, ", ")?;
|
||||
}
|
||||
|
||||
if symbol.is_terminal() {
|
||||
if let Some(variable) = self.2.variables.get(symbol.index) {
|
||||
write!(f, "{}", &variable.name)?;
|
||||
} else {
|
||||
write!(f, "{}-{}", "terminal", symbol.index)?;
|
||||
}
|
||||
} else if symbol.is_external() {
|
||||
write!(f, "{}", &self.1.external_tokens[symbol.index].name)?;
|
||||
} else {
|
||||
write!(f, "{}", &self.1.variables[symbol.index].name)?;
|
||||
}
|
||||
}
|
||||
write!(f, "]")?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> fmt::Display for ParseItemSetDisplay<'a> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
|
||||
for (item, lookaheads) in self.0.entries.iter() {
|
||||
writeln!(
|
||||
f,
|
||||
"{}\t{}",
|
||||
ParseItemDisplay(item, self.1, self.2),
|
||||
LookaheadSetDisplay(lookaheads, self.1, self.2)
|
||||
)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Hash for ParseItem<'a> {
|
||||
fn hash<H: Hasher>(&self, hasher: &mut H) {
|
||||
hasher.write_u32(self.variable_index);
|
||||
hasher.write_u32(self.step_index);
|
||||
hasher.write_i32(self.production.dynamic_precedence);
|
||||
hasher.write_usize(self.production.steps.len());
|
||||
hasher.write_i32(self.precedence());
|
||||
self.associativity().hash(hasher);
|
||||
for step in &self.production.steps[0..self.step_index as usize] {
|
||||
step.alias.hash(hasher);
|
||||
}
|
||||
for step in &self.production.steps[self.step_index as usize..] {
|
||||
step.hash(hasher);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> PartialEq for ParseItem<'a> {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
if self.variable_index != other.variable_index
|
||||
|| self.step_index != other.step_index
|
||||
|| self.production.dynamic_precedence != other.production.dynamic_precedence
|
||||
|| self.production.steps.len() != other.production.steps.len()
|
||||
|| self.precedence() != other.precedence()
|
||||
|| self.associativity() != other.associativity()
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
for (i, step) in self.production.steps.iter().enumerate() {
|
||||
if i < self.step_index as usize {
|
||||
if step.alias != other.production.steps[i].alias {
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
if *step != other.production.steps[i] {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Ord for ParseItem<'a> {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
let o = self.variable_index.cmp(&other.variable_index);
|
||||
if o != Ordering::Equal {
|
||||
return o;
|
||||
}
|
||||
let o = self.step_index.cmp(&other.step_index);
|
||||
if o != Ordering::Equal {
|
||||
return o;
|
||||
}
|
||||
let o = self
|
||||
.production
|
||||
.dynamic_precedence
|
||||
.cmp(&other.production.dynamic_precedence);
|
||||
if o != Ordering::Equal {
|
||||
return o;
|
||||
}
|
||||
let o = self
|
||||
.production
|
||||
.steps
|
||||
.len()
|
||||
.cmp(&other.production.steps.len());
|
||||
if o != Ordering::Equal {
|
||||
return o;
|
||||
}
|
||||
let o = self.precedence().cmp(&other.precedence());
|
||||
if o != Ordering::Equal {
|
||||
return o;
|
||||
}
|
||||
let o = self.associativity().cmp(&other.associativity());
|
||||
if o != Ordering::Equal {
|
||||
return o;
|
||||
}
|
||||
for (i, step) in self.production.steps.iter().enumerate() {
|
||||
let o = if i < self.step_index as usize {
|
||||
step.alias.cmp(&other.production.steps[i].alias)
|
||||
} else {
|
||||
step.cmp(&other.production.steps[i])
|
||||
};
|
||||
if o != Ordering::Equal {
|
||||
return o;
|
||||
}
|
||||
}
|
||||
return Ordering::Equal;
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> PartialOrd for ParseItem<'a> {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Eq for ParseItem<'a> {}
|
||||
|
||||
impl<'a> Hash for ParseItemSet<'a> {
|
||||
fn hash<H: Hasher>(&self, hasher: &mut H) {
|
||||
hasher.write_usize(self.entries.len());
|
||||
for (item, lookaheads) in self.entries.iter() {
|
||||
item.hash(hasher);
|
||||
lookaheads.hash(hasher);
|
||||
}
|
||||
}
|
||||
}
|
||||
330
cli/src/build_tables/item_set_builder.rs
Normal file
330
cli/src/build_tables/item_set_builder.rs
Normal file
|
|
@ -0,0 +1,330 @@
|
|||
use super::item::{LookaheadSet, ParseItem, ParseItemDisplay, ParseItemSet};
|
||||
use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar};
|
||||
use crate::rules::Symbol;
|
||||
use hashbrown::{HashMap, HashSet};
|
||||
use std::fmt;
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Eq)]
|
||||
struct TransitiveClosureAddition<'a> {
|
||||
item: ParseItem<'a>,
|
||||
info: FollowSetInfo,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Eq)]
|
||||
struct FollowSetInfo {
|
||||
lookaheads: LookaheadSet,
|
||||
propagates_lookaheads: bool,
|
||||
}
|
||||
|
||||
pub(crate) struct ParseItemSetBuilder<'a> {
|
||||
syntax_grammar: &'a SyntaxGrammar,
|
||||
lexical_grammar: &'a LexicalGrammar,
|
||||
first_sets: HashMap<Symbol, LookaheadSet>,
|
||||
last_sets: HashMap<Symbol, LookaheadSet>,
|
||||
inlines: &'a InlinedProductionMap,
|
||||
transitive_closure_additions: Vec<Vec<TransitiveClosureAddition<'a>>>,
|
||||
}
|
||||
|
||||
fn find_or_push<T: Eq>(vector: &mut Vec<T>, value: T) {
|
||||
if !vector.contains(&value) {
|
||||
vector.push(value);
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> ParseItemSetBuilder<'a> {
|
||||
pub fn new(
|
||||
syntax_grammar: &'a SyntaxGrammar,
|
||||
lexical_grammar: &'a LexicalGrammar,
|
||||
inlines: &'a InlinedProductionMap,
|
||||
) -> Self {
|
||||
let mut result = Self {
|
||||
syntax_grammar,
|
||||
lexical_grammar,
|
||||
first_sets: HashMap::new(),
|
||||
last_sets: HashMap::new(),
|
||||
inlines,
|
||||
transitive_closure_additions: vec![Vec::new(); syntax_grammar.variables.len()],
|
||||
};
|
||||
|
||||
// For each grammar symbol, populate the FIRST and LAST sets: the set of
|
||||
// terminals that appear at the beginning and end that symbol's productions,
|
||||
// respectively.
|
||||
//
|
||||
// For a terminal symbol, the FIRST and LAST set just consists of the
|
||||
// terminal itself.
|
||||
for i in 0..lexical_grammar.variables.len() {
|
||||
let symbol = Symbol::terminal(i);
|
||||
let mut set = LookaheadSet::new();
|
||||
set.insert(symbol);
|
||||
result.first_sets.insert(symbol, set.clone());
|
||||
result.last_sets.insert(symbol, set);
|
||||
}
|
||||
|
||||
for i in 0..syntax_grammar.external_tokens.len() {
|
||||
let symbol = Symbol::external(i);
|
||||
let mut set = LookaheadSet::new();
|
||||
set.insert(symbol);
|
||||
result.first_sets.insert(symbol, set.clone());
|
||||
result.last_sets.insert(symbol, set);
|
||||
}
|
||||
|
||||
// The FIRST set of a non-terminal `i` is the union of the following sets:
|
||||
// * the set of all terminals that appear at the beginings of i's productions
|
||||
// * the FIRST sets of all the non-terminals that appear at the beginnings
|
||||
// of i's productions
|
||||
//
|
||||
// Rather than computing these sets using recursion, we use an explicit stack
|
||||
// called `symbols_to_process`.
|
||||
let mut symbols_to_process = Vec::new();
|
||||
let mut processed_non_terminals = HashSet::new();
|
||||
for i in 0..syntax_grammar.variables.len() {
|
||||
let symbol = Symbol::non_terminal(i);
|
||||
|
||||
let first_set = &mut result
|
||||
.first_sets
|
||||
.entry(symbol)
|
||||
.or_insert(LookaheadSet::new());
|
||||
processed_non_terminals.clear();
|
||||
symbols_to_process.clear();
|
||||
symbols_to_process.push(symbol);
|
||||
while let Some(current_symbol) = symbols_to_process.pop() {
|
||||
if current_symbol.is_terminal() || current_symbol.is_external() {
|
||||
first_set.insert(current_symbol);
|
||||
} else if processed_non_terminals.insert(current_symbol) {
|
||||
for production in syntax_grammar.variables[current_symbol.index]
|
||||
.productions
|
||||
.iter()
|
||||
{
|
||||
if let Some(step) = production.steps.first() {
|
||||
symbols_to_process.push(step.symbol);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// The LAST set is defined in a similar way to the FIRST set.
|
||||
let last_set = &mut result
|
||||
.last_sets
|
||||
.entry(symbol)
|
||||
.or_insert(LookaheadSet::new());
|
||||
processed_non_terminals.clear();
|
||||
symbols_to_process.clear();
|
||||
symbols_to_process.push(symbol);
|
||||
while let Some(current_symbol) = symbols_to_process.pop() {
|
||||
if current_symbol.is_terminal() || current_symbol.is_external() {
|
||||
last_set.insert(current_symbol);
|
||||
} else if processed_non_terminals.insert(current_symbol) {
|
||||
for production in syntax_grammar.variables[current_symbol.index]
|
||||
.productions
|
||||
.iter()
|
||||
{
|
||||
if let Some(step) = production.steps.last() {
|
||||
symbols_to_process.push(step.symbol);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// To compute an item set's transitive closure, we find each item in the set
|
||||
// whose next symbol is a non-terminal, and we add new items to the set for
|
||||
// each of that symbols' productions. These productions might themselves begin
|
||||
// with non-terminals, so the process continues recursively. In this process,
|
||||
// the total set of entries that get added depends only on two things:
|
||||
// * the set of non-terminal symbols that occur at each item's current position
|
||||
// * the set of terminals that occurs after each of these non-terminal symbols
|
||||
//
|
||||
// So we can avoid a lot of duplicated recursive work by precomputing, for each
|
||||
// non-terminal symbol `i`, a final list of *additions* that must be made to an
|
||||
// item set when `i` occurs as the next symbol in one if its core items. The
|
||||
// structure of an *addition* is as follows:
|
||||
// * `item` - the new item that must be added as part of the expansion of `i`
|
||||
// * `lookaheads` - lookahead tokens that can always come after that item in
|
||||
// the expansion of `i`
|
||||
// * `propagates_lookaheads` - a boolean indicating whether or not `item` can
|
||||
// occur at the *end* of the expansion of `i`, so that i's own current
|
||||
// lookahead tokens can occur after `item`.
|
||||
//
|
||||
// Again, rather than computing these additions recursively, we use an explicit
|
||||
// stack called `entries_to_process`.
|
||||
for i in 0..syntax_grammar.variables.len() {
|
||||
let empty_lookaheads = LookaheadSet::new();
|
||||
let mut entries_to_process = vec![(i, &empty_lookaheads, true)];
|
||||
|
||||
// First, build up a map whose keys are all of the non-terminals that can
|
||||
// appear at the beginning of non-terminal `i`, and whose values store
|
||||
// information about the tokens that can follow each non-terminal.
|
||||
let mut follow_set_info_by_non_terminal = HashMap::new();
|
||||
while let Some(entry) = entries_to_process.pop() {
|
||||
let (variable_index, lookaheads, propagates_lookaheads) = entry;
|
||||
let existing_info = follow_set_info_by_non_terminal
|
||||
.entry(variable_index)
|
||||
.or_insert_with(|| FollowSetInfo {
|
||||
lookaheads: LookaheadSet::new(),
|
||||
propagates_lookaheads: false,
|
||||
});
|
||||
|
||||
let did_add_follow_set_info;
|
||||
if propagates_lookaheads {
|
||||
did_add_follow_set_info = !existing_info.propagates_lookaheads;
|
||||
existing_info.propagates_lookaheads = true;
|
||||
} else {
|
||||
did_add_follow_set_info = existing_info.lookaheads.insert_all(lookaheads);
|
||||
}
|
||||
|
||||
if did_add_follow_set_info {
|
||||
for production in &syntax_grammar.variables[variable_index].productions {
|
||||
if let Some(symbol) = production.first_symbol() {
|
||||
if symbol.is_non_terminal() {
|
||||
if production.steps.len() == 1 {
|
||||
entries_to_process.push((
|
||||
symbol.index,
|
||||
lookaheads,
|
||||
propagates_lookaheads,
|
||||
));
|
||||
} else {
|
||||
entries_to_process.push((
|
||||
symbol.index,
|
||||
&result.first_sets[&production.steps[1].symbol],
|
||||
false,
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Store all of those non-terminals' productions, along with their associated
|
||||
// lookahead info, as *additions* associated with non-terminal `i`.
|
||||
let additions_for_non_terminal = &mut result.transitive_closure_additions[i];
|
||||
for (variable_index, follow_set_info) in follow_set_info_by_non_terminal {
|
||||
let variable = &syntax_grammar.variables[variable_index];
|
||||
let non_terminal = Symbol::non_terminal(variable_index);
|
||||
let variable_index = variable_index as u32;
|
||||
if syntax_grammar.variables_to_inline.contains(&non_terminal) {
|
||||
continue;
|
||||
}
|
||||
for production in &variable.productions {
|
||||
let item = ParseItem {
|
||||
variable_index,
|
||||
production,
|
||||
step_index: 0,
|
||||
};
|
||||
|
||||
if let Some(inlined_productions) =
|
||||
inlines.inlined_productions(item.production, item.step_index)
|
||||
{
|
||||
for production in inlined_productions {
|
||||
find_or_push(
|
||||
additions_for_non_terminal,
|
||||
TransitiveClosureAddition {
|
||||
item: ParseItem {
|
||||
variable_index,
|
||||
production,
|
||||
step_index: item.step_index,
|
||||
},
|
||||
info: follow_set_info.clone(),
|
||||
},
|
||||
);
|
||||
}
|
||||
} else {
|
||||
find_or_push(
|
||||
additions_for_non_terminal,
|
||||
TransitiveClosureAddition {
|
||||
item,
|
||||
info: follow_set_info.clone(),
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
pub(crate) fn transitive_closure(&mut self, item_set: &ParseItemSet<'a>) -> ParseItemSet<'a> {
|
||||
let mut result = ParseItemSet::default();
|
||||
for (item, lookaheads) in &item_set.entries {
|
||||
if let Some(productions) = self
|
||||
.inlines
|
||||
.inlined_productions(item.production, item.step_index)
|
||||
{
|
||||
for production in productions {
|
||||
self.add_item(
|
||||
&mut result,
|
||||
ParseItem {
|
||||
variable_index: item.variable_index,
|
||||
production,
|
||||
step_index: item.step_index,
|
||||
},
|
||||
lookaheads,
|
||||
);
|
||||
}
|
||||
} else {
|
||||
self.add_item(&mut result, *item, lookaheads);
|
||||
}
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
pub fn first_set(&self, symbol: &Symbol) -> &LookaheadSet {
|
||||
&self.first_sets[symbol]
|
||||
}
|
||||
|
||||
pub fn last_set(&self, symbol: &Symbol) -> &LookaheadSet {
|
||||
&self.first_sets[symbol]
|
||||
}
|
||||
|
||||
fn add_item(&self, set: &mut ParseItemSet<'a>, item: ParseItem<'a>, lookaheads: &LookaheadSet) {
|
||||
if let Some(step) = item.step() {
|
||||
if step.symbol.is_non_terminal() {
|
||||
let next_step = item.successor().step();
|
||||
|
||||
// Determine which tokens can follow this non-terminal.
|
||||
let following_tokens = if let Some(next_step) = next_step {
|
||||
self.first_sets.get(&next_step.symbol).unwrap()
|
||||
} else {
|
||||
&lookaheads
|
||||
};
|
||||
|
||||
// Use the pre-computed *additions* to expand the non-terminal.
|
||||
for addition in &self.transitive_closure_additions[step.symbol.index] {
|
||||
let lookaheads = set
|
||||
.entries
|
||||
.entry(addition.item)
|
||||
.or_insert_with(|| LookaheadSet::new());
|
||||
lookaheads.insert_all(&addition.info.lookaheads);
|
||||
if addition.info.propagates_lookaheads {
|
||||
lookaheads.insert_all(following_tokens);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
set.entries.insert(item, lookaheads.clone());
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> fmt::Debug for ParseItemSetBuilder<'a> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "ParseItemSetBuilder {{\n")?;
|
||||
|
||||
write!(f, " additions: {{\n")?;
|
||||
for (i, variable) in self.syntax_grammar.variables.iter().enumerate() {
|
||||
write!(f, " {}: {{\n", variable.name)?;
|
||||
for addition in &self.transitive_closure_additions[i] {
|
||||
write!(
|
||||
f,
|
||||
" {}\n",
|
||||
ParseItemDisplay(&addition.item, self.syntax_grammar, self.lexical_grammar)
|
||||
)?;
|
||||
}
|
||||
write!(f, " }},\n")?;
|
||||
}
|
||||
write!(f, " }},")?;
|
||||
|
||||
write!(f, "}}")?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
281
cli/src/build_tables/minimize_parse_table.rs
Normal file
281
cli/src/build_tables/minimize_parse_table.rs
Normal file
|
|
@ -0,0 +1,281 @@
|
|||
use super::item::LookaheadSet;
|
||||
use super::token_conflicts::TokenConflictMap;
|
||||
use crate::grammars::{SyntaxGrammar, VariableType};
|
||||
use crate::rules::{AliasMap, Symbol};
|
||||
use crate::tables::{ParseAction, ParseState, ParseTable, ParseTableEntry};
|
||||
use hashbrown::{HashMap, HashSet};
|
||||
|
||||
pub(crate) fn minimize_parse_table(
|
||||
parse_table: &mut ParseTable,
|
||||
syntax_grammar: &SyntaxGrammar,
|
||||
simple_aliases: &AliasMap,
|
||||
token_conflict_map: &TokenConflictMap,
|
||||
keywords: &LookaheadSet,
|
||||
) {
|
||||
let mut minimizer = Minimizer {
|
||||
parse_table,
|
||||
syntax_grammar,
|
||||
token_conflict_map,
|
||||
keywords,
|
||||
simple_aliases,
|
||||
};
|
||||
minimizer.remove_unit_reductions();
|
||||
minimizer.merge_compatible_states();
|
||||
minimizer.remove_unused_states();
|
||||
}
|
||||
|
||||
struct Minimizer<'a> {
|
||||
parse_table: &'a mut ParseTable,
|
||||
syntax_grammar: &'a SyntaxGrammar,
|
||||
token_conflict_map: &'a TokenConflictMap<'a>,
|
||||
keywords: &'a LookaheadSet,
|
||||
simple_aliases: &'a AliasMap,
|
||||
}
|
||||
|
||||
impl<'a> Minimizer<'a> {
|
||||
fn remove_unit_reductions(&mut self) {
|
||||
let mut aliased_symbols = HashSet::new();
|
||||
for variable in &self.syntax_grammar.variables {
|
||||
for production in &variable.productions {
|
||||
for step in &production.steps {
|
||||
if step.alias.is_some() {
|
||||
aliased_symbols.insert(step.symbol);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut unit_reduction_symbols_by_state = HashMap::new();
|
||||
for (i, state) in self.parse_table.states.iter().enumerate() {
|
||||
let mut only_unit_reductions = true;
|
||||
let mut unit_reduction_symbol = None;
|
||||
for (_, entry) in &state.terminal_entries {
|
||||
for action in &entry.actions {
|
||||
match action {
|
||||
ParseAction::ShiftExtra => continue,
|
||||
ParseAction::Reduce {
|
||||
child_count: 1,
|
||||
alias_sequence_id: 0,
|
||||
symbol,
|
||||
..
|
||||
} => {
|
||||
if !self.simple_aliases.contains_key(&symbol)
|
||||
&& !aliased_symbols.contains(&symbol)
|
||||
&& self.syntax_grammar.variables[symbol.index].kind
|
||||
!= VariableType::Named
|
||||
&& (unit_reduction_symbol.is_none()
|
||||
|| unit_reduction_symbol == Some(symbol))
|
||||
{
|
||||
unit_reduction_symbol = Some(symbol);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
only_unit_reductions = false;
|
||||
break;
|
||||
}
|
||||
|
||||
if !only_unit_reductions {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(symbol) = unit_reduction_symbol {
|
||||
if only_unit_reductions {
|
||||
unit_reduction_symbols_by_state.insert(i, *symbol);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for state in self.parse_table.states.iter_mut() {
|
||||
let mut done = false;
|
||||
while !done {
|
||||
done = true;
|
||||
state.update_referenced_states(|other_state_id, state| {
|
||||
if let Some(symbol) = unit_reduction_symbols_by_state.get(&other_state_id) {
|
||||
done = false;
|
||||
state.nonterminal_entries[symbol]
|
||||
} else {
|
||||
other_state_id
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn merge_compatible_states(&mut self) {
|
||||
let mut state_ids_by_signature = HashMap::new();
|
||||
for (i, state) in self.parse_table.states.iter().enumerate() {
|
||||
state_ids_by_signature
|
||||
.entry(state.unfinished_item_signature)
|
||||
.or_insert(Vec::new())
|
||||
.push(i);
|
||||
}
|
||||
|
||||
let mut deleted_states = HashSet::new();
|
||||
loop {
|
||||
let mut state_replacements = HashMap::new();
|
||||
for (_, state_ids) in &state_ids_by_signature {
|
||||
for i in state_ids {
|
||||
for j in state_ids {
|
||||
if j == i {
|
||||
break;
|
||||
}
|
||||
if deleted_states.contains(j) || deleted_states.contains(i) {
|
||||
continue;
|
||||
}
|
||||
if self.merge_parse_state(*j, *i) {
|
||||
deleted_states.insert(*i);
|
||||
state_replacements.insert(*i, *j);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if state_replacements.is_empty() {
|
||||
break;
|
||||
}
|
||||
|
||||
for state in self.parse_table.states.iter_mut() {
|
||||
state.update_referenced_states(|other_state_id, _| {
|
||||
*state_replacements
|
||||
.get(&other_state_id)
|
||||
.unwrap_or(&other_state_id)
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn merge_parse_state(&mut self, left: usize, right: usize) -> bool {
|
||||
let left_state = &self.parse_table.states[left];
|
||||
let right_state = &self.parse_table.states[right];
|
||||
|
||||
if left_state.nonterminal_entries != right_state.nonterminal_entries {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (symbol, left_entry) in &left_state.terminal_entries {
|
||||
if let Some(right_entry) = right_state.terminal_entries.get(symbol) {
|
||||
if right_entry.actions != left_entry.actions {
|
||||
return false;
|
||||
}
|
||||
} else if !self.can_add_entry_to_state(right_state, *symbol, left_entry) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
let mut symbols_to_add = Vec::new();
|
||||
for (symbol, right_entry) in &right_state.terminal_entries {
|
||||
if !left_state.terminal_entries.contains_key(&symbol) {
|
||||
if !self.can_add_entry_to_state(left_state, *symbol, right_entry) {
|
||||
return false;
|
||||
}
|
||||
symbols_to_add.push(*symbol);
|
||||
}
|
||||
}
|
||||
|
||||
for symbol in symbols_to_add {
|
||||
let entry = self.parse_table.states[right].terminal_entries[&symbol].clone();
|
||||
self.parse_table.states[left]
|
||||
.terminal_entries
|
||||
.insert(symbol, entry);
|
||||
}
|
||||
|
||||
true
|
||||
}
|
||||
|
||||
fn can_add_entry_to_state(
|
||||
&self,
|
||||
state: &ParseState,
|
||||
token: Symbol,
|
||||
entry: &ParseTableEntry,
|
||||
) -> bool {
|
||||
// Do not add external tokens; they could conflict lexically with any of the state's
|
||||
// existing lookahead tokens.
|
||||
if token.is_external() {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Only merge_compatible_states parse states by allowing existing reductions to happen
|
||||
// with additional lookahead tokens. Do not alter parse states in ways
|
||||
// that allow entirely new types of actions to happen.
|
||||
if state.terminal_entries.iter().all(|(_, e)| e != entry) {
|
||||
return false;
|
||||
}
|
||||
match entry.actions.last() {
|
||||
Some(ParseAction::Reduce { .. }) => {}
|
||||
_ => return false,
|
||||
}
|
||||
|
||||
// Do not add tokens which are both internal and external. Their validity could
|
||||
// influence the behavior of the external scanner.
|
||||
if self
|
||||
.syntax_grammar
|
||||
.external_tokens
|
||||
.iter()
|
||||
.any(|t| t.corresponding_internal_token == Some(token))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
let is_word_token = self.syntax_grammar.word_token == Some(token);
|
||||
let is_keyword = self.keywords.contains(&token);
|
||||
|
||||
// Do not add a token if it conflicts with an existing token.
|
||||
if token.is_terminal() {
|
||||
for existing_token in state.terminal_entries.keys() {
|
||||
if (is_word_token && self.keywords.contains(existing_token))
|
||||
|| is_keyword && self.syntax_grammar.word_token.as_ref() == Some(existing_token)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
if self
|
||||
.token_conflict_map
|
||||
.does_conflict(token.index, existing_token.index)
|
||||
|| self
|
||||
.token_conflict_map
|
||||
.does_match_same_string(token.index, existing_token.index)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
true
|
||||
}
|
||||
|
||||
fn remove_unused_states(&mut self) {
|
||||
let mut state_usage_map = vec![false; self.parse_table.states.len()];
|
||||
|
||||
state_usage_map[0] = true;
|
||||
state_usage_map[1] = true;
|
||||
|
||||
for state in &self.parse_table.states {
|
||||
for referenced_state in state.referenced_states() {
|
||||
state_usage_map[referenced_state] = true;
|
||||
}
|
||||
}
|
||||
let mut removed_predecessor_count = 0;
|
||||
let mut state_replacement_map = vec![0; self.parse_table.states.len()];
|
||||
for state_id in 0..self.parse_table.states.len() {
|
||||
state_replacement_map[state_id] = state_id - removed_predecessor_count;
|
||||
if !state_usage_map[state_id] {
|
||||
removed_predecessor_count += 1;
|
||||
}
|
||||
}
|
||||
let mut state_id = 0;
|
||||
let mut original_state_id = 0;
|
||||
while state_id < self.parse_table.states.len() {
|
||||
if state_usage_map[original_state_id] {
|
||||
self.parse_table.states[state_id].update_referenced_states(|other_state_id, _| {
|
||||
state_replacement_map[other_state_id]
|
||||
});
|
||||
state_id += 1;
|
||||
} else {
|
||||
self.parse_table.states.remove(state_id);
|
||||
}
|
||||
original_state_id += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
285
cli/src/build_tables/mod.rs
Normal file
285
cli/src/build_tables/mod.rs
Normal file
|
|
@ -0,0 +1,285 @@
|
|||
mod build_lex_table;
|
||||
mod build_parse_table;
|
||||
mod coincident_tokens;
|
||||
mod item;
|
||||
mod item_set_builder;
|
||||
mod minimize_parse_table;
|
||||
mod token_conflicts;
|
||||
|
||||
use self::build_lex_table::build_lex_table;
|
||||
use self::build_parse_table::build_parse_table;
|
||||
use self::coincident_tokens::CoincidentTokenIndex;
|
||||
use self::item::LookaheadSet;
|
||||
use self::minimize_parse_table::minimize_parse_table;
|
||||
use self::token_conflicts::TokenConflictMap;
|
||||
use crate::error::Result;
|
||||
use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar};
|
||||
use crate::nfa::{CharacterSet, NfaCursor};
|
||||
use crate::rules::{AliasMap, Symbol};
|
||||
use crate::tables::{LexTable, ParseAction, ParseTable, ParseTableEntry};
|
||||
|
||||
pub(crate) fn build_tables(
|
||||
syntax_grammar: &SyntaxGrammar,
|
||||
lexical_grammar: &LexicalGrammar,
|
||||
simple_aliases: &AliasMap,
|
||||
inlines: &InlinedProductionMap,
|
||||
minimize: bool,
|
||||
state_ids_to_log: Vec<usize>,
|
||||
) -> Result<(ParseTable, LexTable, LexTable, Option<Symbol>)> {
|
||||
let (mut parse_table, following_tokens) =
|
||||
build_parse_table(syntax_grammar, lexical_grammar, inlines, state_ids_to_log)?;
|
||||
let token_conflict_map = TokenConflictMap::new(lexical_grammar, following_tokens);
|
||||
let coincident_token_index = CoincidentTokenIndex::new(&parse_table, lexical_grammar);
|
||||
let keywords = identify_keywords(
|
||||
lexical_grammar,
|
||||
&parse_table,
|
||||
syntax_grammar.word_token,
|
||||
&token_conflict_map,
|
||||
&coincident_token_index,
|
||||
);
|
||||
populate_error_state(
|
||||
&mut parse_table,
|
||||
syntax_grammar,
|
||||
lexical_grammar,
|
||||
&coincident_token_index,
|
||||
&token_conflict_map,
|
||||
);
|
||||
mark_fragile_tokens(
|
||||
&mut parse_table,
|
||||
lexical_grammar,
|
||||
&token_conflict_map,
|
||||
);
|
||||
if minimize {
|
||||
minimize_parse_table(
|
||||
&mut parse_table,
|
||||
syntax_grammar,
|
||||
simple_aliases,
|
||||
&token_conflict_map,
|
||||
&keywords,
|
||||
);
|
||||
}
|
||||
let (main_lex_table, keyword_lex_table) = build_lex_table(
|
||||
&mut parse_table,
|
||||
syntax_grammar,
|
||||
lexical_grammar,
|
||||
&keywords,
|
||||
minimize,
|
||||
);
|
||||
Ok((
|
||||
parse_table,
|
||||
main_lex_table,
|
||||
keyword_lex_table,
|
||||
syntax_grammar.word_token,
|
||||
))
|
||||
}
|
||||
|
||||
fn populate_error_state(
|
||||
parse_table: &mut ParseTable,
|
||||
syntax_grammar: &SyntaxGrammar,
|
||||
lexical_grammar: &LexicalGrammar,
|
||||
coincident_token_index: &CoincidentTokenIndex,
|
||||
token_conflict_map: &TokenConflictMap,
|
||||
) {
|
||||
let state = &mut parse_table.states[0];
|
||||
let n = lexical_grammar.variables.len();
|
||||
|
||||
// First identify the *conflict-free tokens*: tokens that do not overlap with
|
||||
// any other token in any way.
|
||||
let conflict_free_tokens = LookaheadSet::with((0..n).into_iter().filter_map(|i| {
|
||||
let conflicts_with_other_tokens = (0..n).into_iter().any(|j| {
|
||||
j != i
|
||||
&& !coincident_token_index.contains(Symbol::terminal(i), Symbol::terminal(j))
|
||||
&& token_conflict_map.does_conflict(i, j)
|
||||
});
|
||||
if conflicts_with_other_tokens {
|
||||
None
|
||||
} else {
|
||||
info!(
|
||||
"error recovery - token {} has no conflicts",
|
||||
lexical_grammar.variables[i].name
|
||||
);
|
||||
Some(Symbol::terminal(i))
|
||||
}
|
||||
}));
|
||||
|
||||
let recover_entry = ParseTableEntry {
|
||||
reusable: false,
|
||||
actions: vec![ParseAction::Recover],
|
||||
};
|
||||
|
||||
// Exclude from the error-recovery state any token that conflicts with one of
|
||||
// the *conflict-free tokens* identified above.
|
||||
for i in 0..n {
|
||||
let symbol = Symbol::terminal(i);
|
||||
if !conflict_free_tokens.contains(&symbol) {
|
||||
if syntax_grammar.word_token != Some(symbol) {
|
||||
if let Some(t) = conflict_free_tokens.iter().find(|t| {
|
||||
!coincident_token_index.contains(symbol, *t)
|
||||
&& token_conflict_map.does_conflict(symbol.index, t.index)
|
||||
}) {
|
||||
info!(
|
||||
"error recovery - exclude token {} because of conflict with {}",
|
||||
lexical_grammar.variables[i].name, lexical_grammar.variables[t.index].name
|
||||
);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
info!(
|
||||
"error recovery - include token {}",
|
||||
lexical_grammar.variables[i].name
|
||||
);
|
||||
state
|
||||
.terminal_entries
|
||||
.entry(symbol)
|
||||
.or_insert_with(|| recover_entry.clone());
|
||||
}
|
||||
|
||||
for (i, external_token) in syntax_grammar.external_tokens.iter().enumerate() {
|
||||
if external_token.corresponding_internal_token.is_none() {
|
||||
state
|
||||
.terminal_entries
|
||||
.entry(Symbol::external(i))
|
||||
.or_insert_with(|| recover_entry.clone());
|
||||
}
|
||||
}
|
||||
|
||||
state.terminal_entries.insert(Symbol::end(), recover_entry);
|
||||
}
|
||||
|
||||
fn identify_keywords(
|
||||
lexical_grammar: &LexicalGrammar,
|
||||
parse_table: &ParseTable,
|
||||
word_token: Option<Symbol>,
|
||||
token_conflict_map: &TokenConflictMap,
|
||||
coincident_token_index: &CoincidentTokenIndex,
|
||||
) -> LookaheadSet {
|
||||
if word_token.is_none() {
|
||||
return LookaheadSet::new();
|
||||
}
|
||||
|
||||
let word_token = word_token.unwrap();
|
||||
let mut cursor = NfaCursor::new(&lexical_grammar.nfa, Vec::new());
|
||||
|
||||
// First find all of the candidate keyword tokens: tokens that start with
|
||||
// letters or underscore and can match the same string as a word token.
|
||||
let keywords = LookaheadSet::with(lexical_grammar.variables.iter().enumerate().filter_map(
|
||||
|(i, variable)| {
|
||||
cursor.reset(vec![variable.start_state]);
|
||||
if all_chars_are_alphabetical(&cursor)
|
||||
&& token_conflict_map.does_match_same_string(i, word_token.index)
|
||||
{
|
||||
info!(
|
||||
"Keywords - add candidate {}",
|
||||
lexical_grammar.variables[i].name
|
||||
);
|
||||
Some(Symbol::terminal(i))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
},
|
||||
));
|
||||
|
||||
// Exclude keyword candidates that shadow another keyword candidate.
|
||||
let keywords = LookaheadSet::with(keywords.iter().filter(|token| {
|
||||
for other_token in keywords.iter() {
|
||||
if other_token != *token
|
||||
&& token_conflict_map.does_match_same_string(token.index, other_token.index)
|
||||
{
|
||||
info!(
|
||||
"Keywords - exclude {} because it matches the same string as {}",
|
||||
lexical_grammar.variables[token.index].name,
|
||||
lexical_grammar.variables[other_token.index].name
|
||||
);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
true
|
||||
}));
|
||||
|
||||
// Exclude keyword candidates for which substituting the keyword capture
|
||||
// token would introduce new lexical conflicts with other tokens.
|
||||
let keywords = LookaheadSet::with(keywords.iter().filter(|token| {
|
||||
for other_index in 0..lexical_grammar.variables.len() {
|
||||
if keywords.contains(&Symbol::terminal(other_index)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// If the word token was already valid in every state containing
|
||||
// this keyword candidate, then substituting the word token won't
|
||||
// introduce any new lexical conflicts.
|
||||
if coincident_token_index
|
||||
.states_with(*token, Symbol::terminal(other_index))
|
||||
.iter()
|
||||
.all(|state_id| {
|
||||
parse_table.states[*state_id]
|
||||
.terminal_entries
|
||||
.contains_key(&word_token)
|
||||
})
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
if !token_conflict_map.has_same_conflict_status(
|
||||
token.index,
|
||||
word_token.index,
|
||||
other_index,
|
||||
) {
|
||||
info!(
|
||||
"Keywords - exclude {} because of conflict with {}",
|
||||
lexical_grammar.variables[token.index].name,
|
||||
lexical_grammar.variables[other_index].name
|
||||
);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
info!(
|
||||
"Keywords - include {}",
|
||||
lexical_grammar.variables[token.index].name,
|
||||
);
|
||||
true
|
||||
}));
|
||||
|
||||
keywords
|
||||
}
|
||||
|
||||
fn mark_fragile_tokens(
|
||||
parse_table: &mut ParseTable,
|
||||
lexical_grammar: &LexicalGrammar,
|
||||
token_conflict_map: &TokenConflictMap,
|
||||
) {
|
||||
let n = lexical_grammar.variables.len();
|
||||
let mut valid_tokens_mask = Vec::with_capacity(n);
|
||||
for state in parse_table.states.iter_mut() {
|
||||
valid_tokens_mask.clear();
|
||||
valid_tokens_mask.resize(n, false);
|
||||
for token in state.terminal_entries.keys() {
|
||||
if token.is_terminal() {
|
||||
valid_tokens_mask[token.index] = true;
|
||||
}
|
||||
}
|
||||
for (token, entry) in state.terminal_entries.iter_mut() {
|
||||
for i in 0..n {
|
||||
if token_conflict_map.does_overlap(i, token.index) {
|
||||
if valid_tokens_mask[i] {
|
||||
entry.reusable = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn all_chars_are_alphabetical(cursor: &NfaCursor) -> bool {
|
||||
cursor.transition_chars().all(|(chars, is_sep)| {
|
||||
if is_sep {
|
||||
true
|
||||
} else if let CharacterSet::Include(chars) = chars {
|
||||
chars.iter().all(|c| c.is_alphabetic() || *c == '_')
|
||||
} else {
|
||||
false
|
||||
}
|
||||
})
|
||||
}
|
||||
382
cli/src/build_tables/token_conflicts.rs
Normal file
382
cli/src/build_tables/token_conflicts.rs
Normal file
|
|
@ -0,0 +1,382 @@
|
|||
use crate::build_tables::item::LookaheadSet;
|
||||
use crate::grammars::LexicalGrammar;
|
||||
use crate::nfa::{CharacterSet, NfaCursor, NfaTransition};
|
||||
use hashbrown::HashSet;
|
||||
use std::cmp::Ordering;
|
||||
use std::fmt;
|
||||
|
||||
#[derive(Clone, Debug, Default, PartialEq, Eq)]
|
||||
struct TokenConflictStatus {
|
||||
does_overlap: bool,
|
||||
does_match_valid_continuation: bool,
|
||||
does_match_separators: bool,
|
||||
matches_same_string: bool,
|
||||
}
|
||||
|
||||
pub(crate) struct TokenConflictMap<'a> {
|
||||
n: usize,
|
||||
status_matrix: Vec<TokenConflictStatus>,
|
||||
starting_chars_by_index: Vec<CharacterSet>,
|
||||
following_chars_by_index: Vec<CharacterSet>,
|
||||
grammar: &'a LexicalGrammar,
|
||||
}
|
||||
|
||||
impl<'a> TokenConflictMap<'a> {
|
||||
pub fn new(grammar: &'a LexicalGrammar, following_tokens: Vec<LookaheadSet>) -> Self {
|
||||
let mut cursor = NfaCursor::new(&grammar.nfa, Vec::new());
|
||||
let starting_chars = get_starting_chars(&mut cursor, grammar);
|
||||
let following_chars = get_following_chars(&starting_chars, following_tokens);
|
||||
|
||||
let n = grammar.variables.len();
|
||||
let mut status_matrix = vec![TokenConflictStatus::default(); n * n];
|
||||
for i in 0..grammar.variables.len() {
|
||||
for j in 0..i {
|
||||
let status = compute_conflict_status(&mut cursor, grammar, &following_chars, i, j);
|
||||
status_matrix[matrix_index(n, i, j)] = status.0;
|
||||
status_matrix[matrix_index(n, j, i)] = status.1;
|
||||
}
|
||||
}
|
||||
|
||||
TokenConflictMap {
|
||||
n,
|
||||
status_matrix,
|
||||
starting_chars_by_index: starting_chars,
|
||||
following_chars_by_index: following_chars,
|
||||
grammar,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn has_same_conflict_status(&self, a: usize, b: usize, other: usize) -> bool {
|
||||
let left = &self.status_matrix[matrix_index(self.n, a, other)];
|
||||
let right = &self.status_matrix[matrix_index(self.n, b, other)];
|
||||
left == right
|
||||
}
|
||||
|
||||
pub fn does_match_same_string(&self, i: usize, j: usize) -> bool {
|
||||
self.status_matrix[matrix_index(self.n, i, j)].matches_same_string
|
||||
}
|
||||
|
||||
pub fn does_conflict(&self, i: usize, j: usize) -> bool {
|
||||
let entry = &self.status_matrix[matrix_index(self.n, i, j)];
|
||||
entry.does_match_valid_continuation || entry.does_match_separators
|
||||
}
|
||||
|
||||
pub fn does_overlap(&self, i: usize, j: usize) -> bool {
|
||||
self.status_matrix[matrix_index(self.n, i, j)].does_overlap
|
||||
}
|
||||
|
||||
pub fn prefer_token(grammar: &LexicalGrammar, left: (i32, usize), right: (i32, usize)) -> bool {
|
||||
if left.0 > right.0 {
|
||||
return true;
|
||||
} else if left.0 < right.0 {
|
||||
return false;
|
||||
}
|
||||
|
||||
match grammar.variables[left.1]
|
||||
.implicit_precedence
|
||||
.cmp(&grammar.variables[right.1].implicit_precedence)
|
||||
{
|
||||
Ordering::Less => false,
|
||||
Ordering::Greater => true,
|
||||
Ordering::Equal => left.1 < right.1,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> fmt::Debug for TokenConflictMap<'a> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "TokenConflictMap {{\n")?;
|
||||
|
||||
write!(f, " starting_characters: {{\n")?;
|
||||
for i in 0..self.n {
|
||||
write!(f, " {}: {:?},\n", i, self.starting_chars_by_index[i])?;
|
||||
}
|
||||
write!(f, " }},\n")?;
|
||||
|
||||
write!(f, " following_characters: {{\n")?;
|
||||
for i in 0..self.n {
|
||||
write!(
|
||||
f,
|
||||
" {}: {:?},\n",
|
||||
self.grammar.variables[i].name, self.following_chars_by_index[i]
|
||||
)?;
|
||||
}
|
||||
write!(f, " }},\n")?;
|
||||
|
||||
write!(f, " status_matrix: {{\n")?;
|
||||
for i in 0..self.n {
|
||||
write!(f, " {}: {{\n", self.grammar.variables[i].name)?;
|
||||
for j in 0..self.n {
|
||||
write!(
|
||||
f,
|
||||
" {}: {:?},\n",
|
||||
self.grammar.variables[j].name,
|
||||
self.status_matrix[matrix_index(self.n, i, j)]
|
||||
)?;
|
||||
}
|
||||
write!(f, " }},\n")?;
|
||||
}
|
||||
write!(f, " }},")?;
|
||||
write!(f, "}}")?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn matrix_index(variable_count: usize, i: usize, j: usize) -> usize {
|
||||
variable_count * i + j
|
||||
}
|
||||
|
||||
fn get_starting_chars(cursor: &mut NfaCursor, grammar: &LexicalGrammar) -> Vec<CharacterSet> {
|
||||
let mut result = Vec::with_capacity(grammar.variables.len());
|
||||
for variable in &grammar.variables {
|
||||
cursor.reset(vec![variable.start_state]);
|
||||
let mut all_chars = CharacterSet::empty();
|
||||
for (chars, _) in cursor.transition_chars() {
|
||||
all_chars = all_chars.add(chars);
|
||||
}
|
||||
result.push(all_chars);
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
fn get_following_chars(
|
||||
starting_chars: &Vec<CharacterSet>,
|
||||
following_tokens: Vec<LookaheadSet>,
|
||||
) -> Vec<CharacterSet> {
|
||||
following_tokens
|
||||
.into_iter()
|
||||
.map(|following_tokens| {
|
||||
let mut chars = CharacterSet::empty();
|
||||
for token in following_tokens.iter() {
|
||||
if token.is_terminal() {
|
||||
chars = chars.add(&starting_chars[token.index]);
|
||||
}
|
||||
}
|
||||
chars
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn compute_conflict_status(
|
||||
cursor: &mut NfaCursor,
|
||||
grammar: &LexicalGrammar,
|
||||
following_chars: &Vec<CharacterSet>,
|
||||
i: usize,
|
||||
j: usize,
|
||||
) -> (TokenConflictStatus, TokenConflictStatus) {
|
||||
let mut visited_state_sets = HashSet::new();
|
||||
let mut state_set_queue = vec![vec![
|
||||
grammar.variables[i].start_state,
|
||||
grammar.variables[j].start_state,
|
||||
]];
|
||||
let mut result = (
|
||||
TokenConflictStatus::default(),
|
||||
TokenConflictStatus::default(),
|
||||
);
|
||||
|
||||
while let Some(state_set) = state_set_queue.pop() {
|
||||
// Don't pursue states where there's no potential for conflict.
|
||||
if variable_ids_for_states(&state_set, grammar).count() > 1 {
|
||||
cursor.reset(state_set);
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
|
||||
let mut completion = None;
|
||||
for (id, precedence) in cursor.completions() {
|
||||
if let Some((prev_id, prev_precedence)) = completion {
|
||||
if id == prev_id {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Prefer tokens with higher precedence. For tokens with equal precedence,
|
||||
// prefer those listed earlier in the grammar.
|
||||
let winning_id;
|
||||
if TokenConflictMap::prefer_token(
|
||||
grammar,
|
||||
(prev_precedence, prev_id),
|
||||
(precedence, id),
|
||||
) {
|
||||
winning_id = prev_id;
|
||||
} else {
|
||||
winning_id = id;
|
||||
completion = Some((id, precedence));
|
||||
}
|
||||
|
||||
if winning_id == i {
|
||||
result.0.matches_same_string = true;
|
||||
result.0.does_overlap = true;
|
||||
} else {
|
||||
result.1.matches_same_string = true;
|
||||
result.1.does_overlap = true;
|
||||
}
|
||||
} else {
|
||||
completion = Some((id, precedence));
|
||||
}
|
||||
}
|
||||
|
||||
for NfaTransition {
|
||||
characters,
|
||||
precedence,
|
||||
states,
|
||||
is_separator,
|
||||
} in cursor.transitions()
|
||||
{
|
||||
let mut can_advance = true;
|
||||
if let Some((completed_id, completed_precedence)) = completion {
|
||||
let mut other_id = None;
|
||||
let mut successor_contains_completed_id = false;
|
||||
for variable_id in variable_ids_for_states(&states, grammar) {
|
||||
if variable_id == completed_id {
|
||||
successor_contains_completed_id = true;
|
||||
break;
|
||||
} else {
|
||||
other_id = Some(variable_id);
|
||||
}
|
||||
}
|
||||
|
||||
if let (Some(other_id), false) = (other_id, successor_contains_completed_id) {
|
||||
let winning_id;
|
||||
if precedence < completed_precedence {
|
||||
winning_id = completed_id;
|
||||
can_advance = false;
|
||||
} else {
|
||||
winning_id = other_id;
|
||||
}
|
||||
|
||||
if winning_id == i {
|
||||
result.0.does_overlap = true;
|
||||
if characters.does_intersect(&following_chars[j]) {
|
||||
result.0.does_match_valid_continuation = true;
|
||||
}
|
||||
if is_separator {
|
||||
result.0.does_match_separators = true;
|
||||
}
|
||||
} else {
|
||||
result.1.does_overlap = true;
|
||||
if characters.does_intersect(&following_chars[i]) {
|
||||
result.1.does_match_valid_continuation = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if can_advance && visited_state_sets.insert(states.clone()) {
|
||||
state_set_queue.push(states);
|
||||
}
|
||||
}
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
fn variable_ids_for_states<'a>(
|
||||
state_ids: &'a Vec<u32>,
|
||||
grammar: &'a LexicalGrammar,
|
||||
) -> impl Iterator<Item = usize> + 'a {
|
||||
let mut prev = None;
|
||||
state_ids.iter().filter_map(move |state_id| {
|
||||
let variable_id = grammar.variable_index_for_nfa_state(*state_id);
|
||||
if prev != Some(variable_id) {
|
||||
prev = Some(variable_id);
|
||||
prev
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::grammars::{Variable, VariableType};
|
||||
use crate::prepare_grammar::{expand_tokens, ExtractedLexicalGrammar};
|
||||
use crate::rules::{Rule, Symbol};
|
||||
|
||||
#[test]
|
||||
fn test_starting_characters() {
|
||||
let grammar = expand_tokens(ExtractedLexicalGrammar {
|
||||
separators: Vec::new(),
|
||||
variables: vec![
|
||||
Variable {
|
||||
name: "token_0".to_string(),
|
||||
kind: VariableType::Named,
|
||||
rule: Rule::pattern("[a-f]1|0x\\d"),
|
||||
},
|
||||
Variable {
|
||||
name: "token_1".to_string(),
|
||||
kind: VariableType::Named,
|
||||
rule: Rule::pattern("d*ef"),
|
||||
},
|
||||
],
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let token_map = TokenConflictMap::new(&grammar, Vec::new());
|
||||
|
||||
assert_eq!(
|
||||
token_map.starting_chars_by_index[0],
|
||||
CharacterSet::empty().add_range('a', 'f').add_char('0')
|
||||
);
|
||||
assert_eq!(
|
||||
token_map.starting_chars_by_index[1],
|
||||
CharacterSet::empty().add_range('d', 'e')
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_token_conflicts() {
|
||||
let grammar = expand_tokens(ExtractedLexicalGrammar {
|
||||
separators: Vec::new(),
|
||||
variables: vec![
|
||||
Variable {
|
||||
name: "in".to_string(),
|
||||
kind: VariableType::Named,
|
||||
rule: Rule::string("in"),
|
||||
},
|
||||
Variable {
|
||||
name: "identifier".to_string(),
|
||||
kind: VariableType::Named,
|
||||
rule: Rule::pattern("\\w+"),
|
||||
},
|
||||
Variable {
|
||||
name: "instanceof".to_string(),
|
||||
kind: VariableType::Named,
|
||||
rule: Rule::string("instanceof"),
|
||||
},
|
||||
],
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let var = |name| index_of_var(&grammar, name);
|
||||
|
||||
let token_map = TokenConflictMap::new(
|
||||
&grammar,
|
||||
vec![
|
||||
LookaheadSet::with([Symbol::terminal(var("identifier"))].iter().cloned()),
|
||||
LookaheadSet::with([Symbol::terminal(var("in"))].iter().cloned()),
|
||||
LookaheadSet::with([Symbol::terminal(var("identifier"))].iter().cloned()),
|
||||
],
|
||||
);
|
||||
|
||||
// Given the string "in", the `in` token is preferred over the `identifier` token
|
||||
assert!(token_map.does_match_same_string(var("in"), var("identifier")));
|
||||
assert!(!token_map.does_match_same_string(var("identifier"), var("in")));
|
||||
|
||||
// Depending on what character follows, the string "in" may be treated as part of an
|
||||
// `identifier` token.
|
||||
assert!(token_map.does_conflict(var("identifier"), var("in")));
|
||||
|
||||
// Depending on what character follows, the string "instanceof" may be treated as part of
|
||||
// an `identifier` token.
|
||||
assert!(token_map.does_conflict(var("identifier"), var("instanceof")));
|
||||
assert!(token_map.does_conflict(var("instanceof"), var("in")));
|
||||
}
|
||||
|
||||
fn index_of_var(grammar: &LexicalGrammar, name: &str) -> usize {
|
||||
grammar
|
||||
.variables
|
||||
.iter()
|
||||
.position(|v| v.name == name)
|
||||
.unwrap()
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue