Move code into cli directory

Max Brunsfeld 2019-01-04 16:50:52 -08:00
parent b8dd5d2640
commit 5b0e12ea33
29 changed files with 32 additions and 26 deletions

cli/src/build_tables/build_lex_table.rs Normal file
@@ -0,0 +1,278 @@
use super::item::LookaheadSet;
use super::token_conflicts::TokenConflictMap;
use crate::grammars::{LexicalGrammar, SyntaxGrammar};
use crate::nfa::{CharacterSet, NfaCursor, NfaTransition};
use crate::rules::Symbol;
use crate::tables::{AdvanceAction, LexState, LexTable, ParseTable};
use std::collections::hash_map::Entry;
use std::collections::{BTreeMap, HashMap, VecDeque};
pub(crate) fn build_lex_table(
parse_table: &mut ParseTable,
syntax_grammar: &SyntaxGrammar,
lexical_grammar: &LexicalGrammar,
keywords: &LookaheadSet,
minimize: bool,
) -> (LexTable, LexTable) {
let keyword_lex_table;
if syntax_grammar.word_token.is_some() {
let mut builder = LexTableBuilder::new(lexical_grammar);
builder.add_state_for_tokens(keywords);
keyword_lex_table = builder.table;
} else {
keyword_lex_table = LexTable::default();
}
let mut builder = LexTableBuilder::new(lexical_grammar);
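// For each parse state, compute the set of tokens that can appear as the next
// lookahead. Keywords are replaced with the word token here: a keyword is lexed
// by first matching the word token, then re-checking the result against the
// separate keyword lex table built above.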
for state in parse_table.states.iter_mut() {
let tokens = LookaheadSet::with(state.terminal_entries.keys().filter_map(|token| {
if token.is_terminal() {
if keywords.contains(&token) {
syntax_grammar.word_token
} else {
Some(*token)
}
} else if token.is_eof() {
Some(*token)
} else {
None
}
}));
state.lex_state_id = builder.add_state_for_tokens(&tokens);
}
let mut table = builder.table;
if minimize {
minimize_lex_table(&mut table, parse_table);
}
(table, keyword_lex_table)
}
struct QueueEntry {
state_id: usize,
nfa_states: Vec<u32>,
eof_valid: bool,
}
struct LexTableBuilder<'a> {
lexical_grammar: &'a LexicalGrammar,
cursor: NfaCursor<'a>,
table: LexTable,
state_queue: VecDeque<QueueEntry>,
state_ids_by_nfa_state_set: HashMap<(Vec<u32>, bool), usize>,
}
impl<'a> LexTableBuilder<'a> {
fn new(lexical_grammar: &'a LexicalGrammar) -> Self {
Self {
lexical_grammar,
cursor: NfaCursor::new(&lexical_grammar.nfa, vec![]),
table: LexTable::default(),
state_queue: VecDeque::new(),
state_ids_by_nfa_state_set: HashMap::new(),
}
}
fn add_state_for_tokens(&mut self, tokens: &LookaheadSet) -> usize {
let mut eof_valid = false;
let nfa_states = tokens
.iter()
.filter_map(|token| {
if token.is_terminal() {
Some(self.lexical_grammar.variables[token.index].start_state)
} else {
eof_valid = true;
None
}
})
.collect();
let (state_id, is_new) = self.add_state(nfa_states, eof_valid);
if is_new {
info!(
"entry point state: {}, tokens: {:?}",
state_id,
tokens
.iter()
.map(|t| &self.lexical_grammar.variables[t.index].name)
.collect::<Vec<_>>()
);
}
while let Some(QueueEntry {
state_id,
nfa_states,
eof_valid,
}) = self.state_queue.pop_front()
{
self.populate_state(state_id, nfa_states, eof_valid);
}
state_id
}
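// Add a lex state for the given set of NFA states, reusing an existing state if
// one was already created for the same (NFA state set, eof_valid) pair. Returns
// the state id and whether or not the state is new.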
fn add_state(&mut self, nfa_states: Vec<u32>, eof_valid: bool) -> (usize, bool) {
self.cursor.reset(nfa_states);
match self
.state_ids_by_nfa_state_set
.entry((self.cursor.state_ids.clone(), eof_valid))
{
Entry::Occupied(o) => (*o.get(), false),
Entry::Vacant(v) => {
let state_id = self.table.states.len();
self.table.states.push(LexState::default());
self.state_queue.push_back(QueueEntry {
state_id,
nfa_states: v.key().0.clone(),
eof_valid,
});
v.insert(state_id);
(state_id, true)
}
}
}
fn populate_state(&mut self, state_id: usize, nfa_states: Vec<u32>, eof_valid: bool) {
self.cursor.force_reset(nfa_states);
// The EOF state is represented as an empty list of NFA states.
let mut completion = None;
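// Find the single completed token that should be preferred, based on precedence
// and the grammar's token-preference rules.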
for (id, prec) in self.cursor.completions() {
if let Some((prev_id, prev_precedence)) = completion {
if TokenConflictMap::prefer_token(
self.lexical_grammar,
(prev_precedence, prev_id),
(prec, id),
) {
continue;
}
}
completion = Some((id, prec));
}
info!(
"lex state: {}, completion: {:?}",
state_id,
completion.map(|(id, prec)| (&self.lexical_grammar.variables[id].name, prec))
);
let transitions = self.cursor.transitions();
info!("lex state: {}, transitions: {:?}", state_id, transitions);
// If EOF is a valid lookahead token, add a transition predicated on the null
// character that leads to the empty set of NFA states.
if eof_valid {
let (next_state_id, _) = self.add_state(Vec::new(), false);
info!("lex state: {}, successor: EOF", state_id);
self.table.states[state_id].advance_actions.push((
CharacterSet::empty().add_char('\0'),
AdvanceAction {
state: Some(next_state_id),
in_main_token: true,
},
));
}
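// Add an advance action for each transition, skipping any transition that
// cannot take precedence over a token that has already been completed.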
for NfaTransition {
characters,
precedence,
states,
is_separator,
} in transitions
{
if let Some((_, completed_precedence)) = completion {
if precedence < completed_precedence
|| (precedence == completed_precedence && is_separator)
{
continue;
}
}
let (next_state_id, _) = self.add_state(states, eof_valid && is_separator);
let next_state = if next_state_id == state_id {
None
} else {
Some(next_state_id)
};
self.table.states[state_id].advance_actions.push((
characters,
AdvanceAction {
state: next_state,
in_main_token: !is_separator,
},
));
}
if let Some((complete_id, _)) = completion {
self.table.states[state_id].accept_action = Some(Symbol::terminal(complete_id));
} else if self.cursor.state_ids.is_empty() {
self.table.states[state_id].accept_action = Some(Symbol::end());
}
}
}
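// Repeatedly merge lex states that are exactly identical until a fixed point is
// reached, then renumber the surviving states and rewrite all references to the
// removed states in both the lex table and the parse table.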
fn minimize_lex_table(table: &mut LexTable, parse_table: &mut ParseTable) {
let mut state_replacements = BTreeMap::new();
let mut done = false;
while !done {
done = true;
for (i, state_i) in table.states.iter().enumerate() {
if state_replacements.contains_key(&i) {
continue;
}
for (j, state_j) in table.states.iter().enumerate() {
if j == i {
break;
}
if state_replacements.contains_key(&j) {
continue;
}
if state_i == state_j {
info!("replace state {} with state {}", i, j);
state_replacements.insert(i, j);
done = false;
break;
}
}
}
for state in table.states.iter_mut() {
for (_, advance_action) in state.advance_actions.iter_mut() {
advance_action.state = advance_action
.state
.map(|s| state_replacements.get(&s).cloned().unwrap_or(s))
}
}
}
let final_state_replacements = (0..table.states.len())
.into_iter()
.map(|state_id| {
let replacement = state_replacements
.get(&state_id)
.cloned()
.unwrap_or(state_id);
let prior_removed = state_replacements
.iter()
.take_while(|i| *i.0 < replacement)
.count();
replacement - prior_removed
})
.collect::<Vec<_>>();
for state in parse_table.states.iter_mut() {
state.lex_state_id = final_state_replacements[state.lex_state_id];
}
for state in table.states.iter_mut() {
for (_, advance_action) in state.advance_actions.iter_mut() {
advance_action.state = advance_action.state.map(|s| final_state_replacements[s]);
}
}
let mut i = 0;
table.states.retain(|_| {
let result = !state_replacements.contains_key(&i);
i += 1;
result
});
}

cli/src/build_tables/build_parse_table.rs Normal file
@@ -0,0 +1,735 @@
use super::item::{LookaheadSet, ParseItem, ParseItemSet};
use super::item_set_builder::ParseItemSetBuilder;
use crate::error::{Error, Result};
use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar, VariableType};
use crate::rules::{Alias, Associativity, Symbol, SymbolType};
use crate::tables::{
AliasSequenceId, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry,
};
use core::ops::Range;
use hashbrown::hash_map::Entry;
use hashbrown::{HashMap, HashSet};
use std::collections::hash_map::DefaultHasher;
use std::collections::VecDeque;
use std::fmt::Write;
use std::hash::Hasher;
#[derive(Clone)]
struct AuxiliarySymbolInfo {
auxiliary_symbol: Symbol,
parent_symbols: Vec<Symbol>,
}
type SymbolSequence = Vec<Symbol>;
type AuxiliarySymbolSequence = Vec<AuxiliarySymbolInfo>;
struct ParseStateQueueEntry {
preceding_symbols: SymbolSequence,
preceding_auxiliary_symbols: AuxiliarySymbolSequence,
state_id: ParseStateId,
}
struct ParseTableBuilder<'a> {
item_set_builder: ParseItemSetBuilder<'a>,
syntax_grammar: &'a SyntaxGrammar,
lexical_grammar: &'a LexicalGrammar,
state_ids_by_item_set: HashMap<ParseItemSet<'a>, ParseStateId>,
item_sets_by_state_id: Vec<ParseItemSet<'a>>,
parse_state_queue: VecDeque<ParseStateQueueEntry>,
parse_table: ParseTable,
following_tokens: Vec<LookaheadSet>,
state_ids_to_log: Vec<ParseStateId>,
}
impl<'a> ParseTableBuilder<'a> {
fn build(mut self) -> Result<(ParseTable, Vec<LookaheadSet>)> {
// Ensure that the empty alias sequence has index 0.
self.parse_table.alias_sequences.push(Vec::new());
// Add the error state at index 0.
self.add_parse_state(&Vec::new(), &Vec::new(), ParseItemSet::default());
// Add the starting state at index 1.
self.add_parse_state(
&Vec::new(),
&Vec::new(),
ParseItemSet::with(
[(
ParseItem::start(),
LookaheadSet::with([Symbol::end()].iter().cloned()),
)]
.iter()
.cloned(),
),
);
while let Some(entry) = self.parse_state_queue.pop_front() {
let item_set = self
.item_set_builder
.transitive_closure(&self.item_sets_by_state_id[entry.state_id]);
if self.state_ids_to_log.contains(&entry.state_id) {
eprintln!(
"state: {}\n\ninitial item set:\n\n{}closed item set:\n\n{}",
entry.state_id,
super::item::ParseItemSetDisplay(
&self.item_sets_by_state_id[entry.state_id],
self.syntax_grammar,
self.lexical_grammar,
),
super::item::ParseItemSetDisplay(
&item_set,
self.syntax_grammar,
self.lexical_grammar,
)
);
}
self.add_actions(
entry.preceding_symbols,
entry.preceding_auxiliary_symbols,
entry.state_id,
item_set,
)?;
}
self.populate_used_symbols();
self.remove_precedences();
Ok((self.parse_table, self.following_tokens))
}
fn add_parse_state(
&mut self,
preceding_symbols: &SymbolSequence,
preceding_auxiliary_symbols: &AuxiliarySymbolSequence,
item_set: ParseItemSet<'a>,
) -> ParseStateId {
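// Track which tokens can follow one another: each token in the LAST set of the
// second-to-last preceding symbol can be followed by each token in the FIRST
// set of the last one. This information feeds the token conflict analysis.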
if preceding_symbols.len() > 1 {
let left_tokens = self
.item_set_builder
.last_set(&preceding_symbols[preceding_symbols.len() - 2]);
let right_tokens = self
.item_set_builder
.first_set(&preceding_symbols[preceding_symbols.len() - 1]);
for left_token in left_tokens.iter() {
if left_token.is_terminal() {
self.following_tokens[left_token.index].insert_all(right_tokens);
}
}
}
let mut hasher = DefaultHasher::new();
item_set.hash_unfinished_items(&mut hasher);
let unfinished_item_signature = hasher.finish();
match self.state_ids_by_item_set.entry(item_set) {
Entry::Occupied(o) => *o.get(),
Entry::Vacant(v) => {
let state_id = self.parse_table.states.len();
self.item_sets_by_state_id.push(v.key().clone());
self.parse_table.states.push(ParseState {
lex_state_id: 0,
terminal_entries: HashMap::new(),
nonterminal_entries: HashMap::new(),
unfinished_item_signature,
});
self.parse_state_queue.push_back(ParseStateQueueEntry {
state_id,
preceding_symbols: preceding_symbols.clone(),
preceding_auxiliary_symbols: preceding_auxiliary_symbols.clone(),
});
v.insert(state_id);
state_id
}
}
}
fn add_actions(
&mut self,
mut preceding_symbols: SymbolSequence,
mut preceding_auxiliary_symbols: Vec<AuxiliarySymbolInfo>,
state_id: ParseStateId,
item_set: ParseItemSet<'a>,
) -> Result<()> {
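// Group the items in this set by their next symbol, accumulating the core item
// set for each successor state, and add a REDUCE or ACCEPT action for each item
// whose production is already complete.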
let mut terminal_successors = HashMap::new();
let mut non_terminal_successors = HashMap::new();
let mut lookaheads_with_conflicts = HashSet::new();
for (item, lookaheads) in &item_set.entries {
if let Some(next_symbol) = item.symbol() {
let successor = item.successor();
if next_symbol.is_non_terminal() {
// Keep track of where auxiliary non-terminals (repeat symbols) are
// used within visible symbols. This information may be needed later
// for conflict resolution.
if self.syntax_grammar.variables[next_symbol.index].is_auxiliary() {
preceding_auxiliary_symbols
.push(self.get_auxiliary_node_info(&item_set, next_symbol));
}
non_terminal_successors
.entry(next_symbol)
.or_insert_with(|| ParseItemSet::default())
.entries
.entry(successor)
.or_insert_with(|| LookaheadSet::new())
.insert_all(lookaheads);
} else {
terminal_successors
.entry(next_symbol)
.or_insert_with(|| ParseItemSet::default())
.entries
.entry(successor)
.or_insert_with(|| LookaheadSet::new())
.insert_all(lookaheads);
}
} else {
let action = if item.is_augmented() {
ParseAction::Accept
} else {
ParseAction::Reduce {
symbol: Symbol::non_terminal(item.variable_index as usize),
child_count: item.step_index as usize,
precedence: item.precedence(),
associativity: item.associativity(),
dynamic_precedence: item.production.dynamic_precedence,
alias_sequence_id: self.get_alias_sequence_id(item),
}
};
for lookahead in lookaheads.iter() {
let entry = self.parse_table.states[state_id]
.terminal_entries
.entry(lookahead);
let entry = entry.or_insert_with(|| ParseTableEntry::new());
if entry.actions.is_empty() {
entry.actions.push(action);
} else if action.precedence() > entry.actions[0].precedence() {
entry.actions.clear();
entry.actions.push(action);
lookaheads_with_conflicts.remove(&lookahead);
} else if action.precedence() == entry.actions[0].precedence() {
entry.actions.push(action);
lookaheads_with_conflicts.insert(lookahead);
}
}
}
}
for (symbol, next_item_set) in terminal_successors {
preceding_symbols.push(symbol);
let next_state_id = self.add_parse_state(
&preceding_symbols,
&preceding_auxiliary_symbols,
next_item_set,
);
preceding_symbols.pop();
let entry = self.parse_table.states[state_id]
.terminal_entries
.entry(symbol);
if let Entry::Occupied(e) = &entry {
if !e.get().actions.is_empty() {
lookaheads_with_conflicts.insert(symbol);
}
}
entry
.or_insert_with(|| ParseTableEntry::new())
.actions
.push(ParseAction::Shift {
state: next_state_id,
is_repetition: false,
});
}
for (symbol, next_item_set) in non_terminal_successors {
preceding_symbols.push(symbol);
let next_state_id = self.add_parse_state(
&preceding_symbols,
&preceding_auxiliary_symbols,
next_item_set,
);
preceding_symbols.pop();
self.parse_table.states[state_id]
.nonterminal_entries
.insert(symbol, next_state_id);
}
for symbol in lookaheads_with_conflicts {
self.handle_conflict(
&item_set,
state_id,
&preceding_symbols,
&preceding_auxiliary_symbols,
symbol,
)?;
}
let state = &mut self.parse_table.states[state_id];
for extra_token in &self.syntax_grammar.extra_tokens {
state
.terminal_entries
.entry(*extra_token)
.or_insert(ParseTableEntry {
reusable: true,
actions: vec![ParseAction::ShiftExtra],
});
}
Ok(())
}
fn handle_conflict(
&mut self,
item_set: &ParseItemSet,
state_id: ParseStateId,
preceding_symbols: &SymbolSequence,
preceding_auxiliary_symbols: &Vec<AuxiliarySymbolInfo>,
conflicting_lookahead: Symbol,
) -> Result<()> {
let entry = self.parse_table.states[state_id]
.terminal_entries
.get_mut(&conflicting_lookahead)
.unwrap();
// Determine which items in the set conflict with each other, and the
// precedences associated with SHIFT vs REDUCE actions. There won't
// be multiple REDUCE actions with different precedences; that is
// sorted out ahead of time in `add_actions`. But there can still be
// REDUCE-REDUCE conflicts where all actions have the *same*
// precedence, and there can still be SHIFT/REDUCE conflicts.
let reduce_precedence = entry.actions[0].precedence();
let mut considered_associativity = false;
let mut shift_precedence: Option<Range<i32>> = None;
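// The shift precedence is a range because several different unfinished items
// can contribute to the same SHIFT action, each with its own precedence.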
let mut conflicting_items = HashSet::new();
for (item, lookaheads) in &item_set.entries {
if let Some(step) = item.step() {
if item.step_index > 0 {
if self
.item_set_builder
.first_set(&step.symbol)
.contains(&conflicting_lookahead)
{
conflicting_items.insert(item);
let precedence = item.precedence();
if let Some(range) = &mut shift_precedence {
if precedence < range.start {
range.start = precedence;
} else if precedence > range.end {
range.end = precedence;
}
} else {
shift_precedence = Some(precedence..precedence);
}
}
}
} else if lookaheads.contains(&conflicting_lookahead) {
conflicting_items.insert(item);
}
}
if let ParseAction::Shift { is_repetition, .. } = entry.actions.last_mut().unwrap() {
let shift_precedence = shift_precedence.unwrap_or(0..0);
// If all of the items in the conflict have the same parent symbol,
// and that parent symbol is auxiliary, then this is just the intentional
// ambiguity associated with a repeat rule. Resolve that class of ambiguity
// by leaving it in the parse table, but marking the SHIFT action with
// an `is_repetition` flag.
let conflicting_variable_index =
conflicting_items.iter().next().unwrap().variable_index;
if self.syntax_grammar.variables[conflicting_variable_index as usize].is_auxiliary() {
if conflicting_items
.iter()
.all(|item| item.variable_index == conflicting_variable_index)
{
*is_repetition = true;
return Ok(());
}
}
// If the SHIFT action has higher precedence, remove all the REDUCE actions.
if shift_precedence.start > reduce_precedence
|| (shift_precedence.start == reduce_precedence
&& shift_precedence.end > reduce_precedence)
{
entry.actions.drain(0..entry.actions.len() - 1);
}
// If the REDUCE actions have higher precedence, remove the SHIFT action.
else if shift_precedence.end < reduce_precedence
|| (shift_precedence.end == reduce_precedence
&& shift_precedence.start < reduce_precedence)
{
entry.actions.pop();
conflicting_items.retain(|item| item.is_done());
}
// If the SHIFT and REDUCE actions have the same precedence, consider
// the REDUCE actions' associativity.
else if shift_precedence == (reduce_precedence..reduce_precedence) {
considered_associativity = true;
let mut has_left = false;
let mut has_right = false;
let mut has_non = false;
for action in &entry.actions {
if let ParseAction::Reduce { associativity, .. } = action {
match associativity {
Some(Associativity::Left) => has_left = true,
Some(Associativity::Right) => has_right = true,
None => has_non = true,
}
}
}
// If all reduce actions are left associative, remove the SHIFT action.
// If all reduce actions are right associative, remove the REDUCE actions.
match (has_left, has_non, has_right) {
(true, false, false) => {
entry.actions.pop();
conflicting_items.retain(|item| item.is_done());
}
(false, false, true) => {
entry.actions.drain(0..entry.actions.len() - 1);
}
_ => {}
}
}
}
// If all of the actions but one have been eliminated, then there's no problem.
let entry = self.parse_table.states[state_id]
.terminal_entries
.get_mut(&conflicting_lookahead)
.unwrap();
if entry.actions.len() == 1 {
return Ok(());
}
// Determine the set of parent symbols involved in this conflict.
let mut actual_conflict = Vec::new();
for item in &conflicting_items {
let symbol = Symbol::non_terminal(item.variable_index as usize);
if self.syntax_grammar.variables[symbol.index].is_auxiliary() {
actual_conflict.extend(
preceding_auxiliary_symbols
.iter()
.rev()
.find_map(|info| {
if info.auxiliary_symbol == symbol {
Some(&info.parent_symbols)
} else {
None
}
})
.unwrap()
.iter(),
);
} else {
actual_conflict.push(symbol);
}
}
actual_conflict.sort_unstable();
actual_conflict.dedup();
// If this set of symbols has been whitelisted, then there's no error.
if self
.syntax_grammar
.expected_conflicts
.contains(&actual_conflict)
{
return Ok(());
}
let mut msg = "Unresolved conflict for symbol sequence:\n\n".to_string();
for symbol in preceding_symbols {
write!(&mut msg, " {}", self.symbol_name(symbol)).unwrap();
}
write!(
&mut msg,
" • {} …\n\n",
self.symbol_name(&conflicting_lookahead)
)
.unwrap();
write!(&mut msg, "Possible interpretations:\n\n").unwrap();
for (i, item) in conflicting_items.iter().enumerate() {
write!(&mut msg, " {}:", i + 1).unwrap();
for preceding_symbol in preceding_symbols
.iter()
.take(preceding_symbols.len() - item.step_index as usize)
{
write!(&mut msg, " {}", self.symbol_name(preceding_symbol)).unwrap();
}
write!(
&mut msg,
" ({}",
&self.syntax_grammar.variables[item.variable_index as usize].name
)
.unwrap();
for (j, step) in item.production.steps.iter().enumerate() {
if j as u32 == item.step_index {
write!(&mut msg, "").unwrap();
}
write!(&mut msg, " {}", self.symbol_name(&step.symbol)).unwrap();
}
write!(&mut msg, ")").unwrap();
if item.is_done() {
write!(
&mut msg,
" • {}",
self.symbol_name(&conflicting_lookahead)
)
.unwrap();
}
let precedence = item.precedence();
let associativity = item.associativity();
if precedence != 0 || associativity.is_some() {
write!(
&mut msg,
"(precedence: {}, associativity: {:?})",
precedence, associativity
)
.unwrap();
}
write!(&mut msg, "\n").unwrap();
}
let mut resolution_count = 0;
write!(&mut msg, "\nPossible resolutions:\n\n").unwrap();
let shift_items = conflicting_items
.iter()
.filter(|i| !i.is_done())
.cloned()
.collect::<Vec<_>>();
if shift_items.len() > 0 {
resolution_count += 1;
write!(
&mut msg,
" {}: Specify a higher precedence in",
resolution_count
)
.unwrap();
for (i, item) in shift_items.iter().enumerate() {
if i > 0 {
write!(&mut msg, " and").unwrap();
}
write!(
&mut msg,
" `{}`",
self.symbol_name(&Symbol::non_terminal(item.variable_index as usize))
)
.unwrap();
}
write!(&mut msg, " than in the other rules.\n").unwrap();
}
if considered_associativity {
resolution_count += 1;
write!(
&mut msg,
" {}: Specify a left or right associativity in ",
resolution_count
)
.unwrap();
for (i, item) in conflicting_items.iter().filter(|i| i.is_done()).enumerate() {
if i > 0 {
write!(&mut msg, " and ").unwrap();
}
write!(
&mut msg,
"{}",
self.symbol_name(&Symbol::non_terminal(item.variable_index as usize))
)
.unwrap();
}
write!(&mut msg, "\n").unwrap();
}
for item in &conflicting_items {
if item.is_done() {
resolution_count += 1;
write!(
&mut msg,
" {}: Specify a higher precedence in `{}` than in the other rules.\n",
resolution_count,
self.symbol_name(&Symbol::non_terminal(item.variable_index as usize))
)
.unwrap();
}
}
resolution_count += 1;
write!(
&mut msg,
" {}: Add a conflict for these rules: ",
resolution_count
)
.unwrap();
for (i, symbol) in actual_conflict.iter().enumerate() {
if i > 0 {
write!(&mut msg, ", ").unwrap();
}
write!(&mut msg, "{}", self.symbol_name(symbol)).unwrap();
}
write!(&mut msg, "\n").unwrap();
Err(Error(msg))
}
fn get_auxiliary_node_info(
&self,
item_set: &ParseItemSet,
symbol: Symbol,
) -> AuxiliarySymbolInfo {
let parent_symbols = item_set
.entries
.keys()
.filter_map(|item| {
let variable_index = item.variable_index as usize;
if item.symbol() == Some(symbol)
&& !self.syntax_grammar.variables[variable_index].is_auxiliary()
{
Some(Symbol::non_terminal(variable_index))
} else {
None
}
})
.collect();
AuxiliarySymbolInfo {
auxiliary_symbol: symbol,
parent_symbols,
}
}
fn populate_used_symbols(&mut self) {
let mut terminal_usages = vec![false; self.lexical_grammar.variables.len()];
let mut non_terminal_usages = vec![false; self.syntax_grammar.variables.len()];
let mut external_usages = vec![false; self.syntax_grammar.external_tokens.len()];
for state in &self.parse_table.states {
for symbol in state.terminal_entries.keys() {
match symbol.kind {
SymbolType::Terminal => terminal_usages[symbol.index] = true,
SymbolType::External => external_usages[symbol.index] = true,
_ => {}
}
}
for symbol in state.nonterminal_entries.keys() {
non_terminal_usages[symbol.index] = true;
}
}
for (i, value) in external_usages.into_iter().enumerate() {
if value {
self.parse_table.symbols.push(Symbol::external(i));
}
}
self.parse_table.symbols.push(Symbol::end());
for (i, value) in terminal_usages.into_iter().enumerate() {
if value {
self.parse_table.symbols.push(Symbol::terminal(i));
}
}
for (i, value) in non_terminal_usages.into_iter().enumerate() {
if value {
self.parse_table.symbols.push(Symbol::non_terminal(i));
}
}
}
fn remove_precedences(&mut self) {
for state in self.parse_table.states.iter_mut() {
for (_, entry) in state.terminal_entries.iter_mut() {
for action in entry.actions.iter_mut() {
match action {
ParseAction::Reduce {
precedence,
associativity,
..
} => {
*precedence = 0;
*associativity = None;
}
_ => {}
}
}
}
}
}
fn get_alias_sequence_id(&mut self, item: &ParseItem) -> AliasSequenceId {
let mut alias_sequence: Vec<Option<Alias>> = item
.production
.steps
.iter()
.map(|s| s.alias.clone())
.collect();
while alias_sequence.last() == Some(&None) {
alias_sequence.pop();
}
if item.production.steps.len() > self.parse_table.max_aliased_production_length {
self.parse_table.max_aliased_production_length = item.production.steps.len()
}
if let Some(index) = self
.parse_table
.alias_sequences
.iter()
.position(|seq| *seq == alias_sequence)
{
index
} else {
self.parse_table.alias_sequences.push(alias_sequence);
self.parse_table.alias_sequences.len() - 1
}
}
fn symbol_name(&self, symbol: &Symbol) -> String {
match symbol.kind {
SymbolType::End => "EOF".to_string(),
SymbolType::External => self.syntax_grammar.external_tokens[symbol.index]
.name
.clone(),
SymbolType::NonTerminal => self.syntax_grammar.variables[symbol.index].name.clone(),
SymbolType::Terminal => {
let variable = &self.lexical_grammar.variables[symbol.index];
if variable.kind == VariableType::Named {
variable.name.clone()
} else {
format!("\"{}\"", &variable.name)
}
}
}
}
}
pub(crate) fn build_parse_table(
syntax_grammar: &SyntaxGrammar,
lexical_grammar: &LexicalGrammar,
inlines: &InlinedProductionMap,
state_ids_to_log: Vec<usize>,
) -> Result<(ParseTable, Vec<LookaheadSet>)> {
ParseTableBuilder {
syntax_grammar,
lexical_grammar,
state_ids_to_log,
item_set_builder: ParseItemSetBuilder::new(syntax_grammar, lexical_grammar, inlines),
state_ids_by_item_set: HashMap::new(),
item_sets_by_state_id: Vec::new(),
parse_state_queue: VecDeque::new(),
parse_table: ParseTable {
states: Vec::new(),
symbols: Vec::new(),
alias_sequences: Vec::new(),
max_aliased_production_length: 0,
},
following_tokens: vec![LookaheadSet::new(); lexical_grammar.variables.len()],
}
.build()
}

cli/src/build_tables/coincident_tokens.rs Normal file
@@ -0,0 +1,71 @@
use crate::grammars::LexicalGrammar;
use crate::rules::Symbol;
use crate::tables::{ParseStateId, ParseTable};
use std::fmt;
pub(crate) struct CoincidentTokenIndex<'a> {
entries: Vec<Vec<ParseStateId>>,
grammar: &'a LexicalGrammar,
n: usize,
}
impl<'a> CoincidentTokenIndex<'a> {
pub fn new(table: &ParseTable, lexical_grammar: &'a LexicalGrammar) -> Self {
let n = lexical_grammar.variables.len();
let mut result = Self {
n,
grammar: lexical_grammar,
entries: vec![Vec::new(); n * n],
};
for (i, state) in table.states.iter().enumerate() {
for symbol in state.terminal_entries.keys() {
for other_symbol in state.terminal_entries.keys() {
let index = result.index(symbol.index, other_symbol.index);
if result.entries[index].last().cloned() != Some(i) {
result.entries[index].push(i);
}
}
}
}
result
}
pub fn states_with(&self, a: Symbol, b: Symbol) -> &Vec<ParseStateId> {
&self.entries[self.index(a.index, b.index)]
}
pub fn contains(&self, a: Symbol, b: Symbol) -> bool {
!self.entries[self.index(a.index, b.index)].is_empty()
}
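// Index token pairs symmetrically, so that (a, b) and (b, a) share an entry.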
fn index(&self, a: usize, b: usize) -> usize {
if a < b {
a * self.n + b
} else {
b * self.n + a
}
}
}
impl<'a> fmt::Debug for CoincidentTokenIndex<'a> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "CoincidentTokenIndex {{\n")?;
write!(f, " entries: {{\n")?;
for i in 0..self.n {
write!(f, " {}: {{\n", self.grammar.variables[i].name)?;
for j in 0..self.n {
write!(
f,
" {}: {:?},\n",
self.grammar.variables[j].name,
self.entries[self.index(i, j)].len()
)?;
}
write!(f, " }},\n")?;
}
write!(f, " }},")?;
write!(f, "}}")?;
Ok(())
}
}

cli/src/build_tables/item.rs Normal file
@@ -0,0 +1,446 @@
use crate::grammars::{LexicalGrammar, Production, ProductionStep, SyntaxGrammar};
use crate::rules::Associativity;
use crate::rules::{Symbol, SymbolType};
use smallbitvec::SmallBitVec;
use std::cmp::Ordering;
use std::collections::BTreeMap;
use std::fmt;
use std::hash::{Hash, Hasher};
use std::u32;
lazy_static! {
static ref START_PRODUCTION: Production = Production {
dynamic_precedence: 0,
steps: vec![ProductionStep {
symbol: Symbol {
index: 0,
kind: SymbolType::NonTerminal,
},
precedence: 0,
associativity: None,
alias: None,
}],
};
}
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub(crate) struct LookaheadSet {
terminal_bits: SmallBitVec,
external_bits: SmallBitVec,
eof: bool,
}
#[derive(Clone, Copy, Debug)]
pub(crate) struct ParseItem<'a> {
pub variable_index: u32,
pub step_index: u32,
pub production: &'a Production,
}
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) struct ParseItemSet<'a> {
pub entries: BTreeMap<ParseItem<'a>, LookaheadSet>,
}
pub(crate) struct ParseItemDisplay<'a>(
pub &'a ParseItem<'a>,
pub &'a SyntaxGrammar,
pub &'a LexicalGrammar,
);
pub(crate) struct LookaheadSetDisplay<'a>(&'a LookaheadSet, &'a SyntaxGrammar, &'a LexicalGrammar);
#[allow(dead_code)]
pub(crate) struct ParseItemSetDisplay<'a>(
pub &'a ParseItemSet<'a>,
pub &'a SyntaxGrammar,
pub &'a LexicalGrammar,
);
impl LookaheadSet {
pub fn new() -> Self {
Self {
terminal_bits: SmallBitVec::new(),
external_bits: SmallBitVec::new(),
eof: false,
}
}
pub fn iter<'a>(&'a self) -> impl Iterator<Item = Symbol> + 'a {
self.terminal_bits
.iter()
.enumerate()
.filter_map(|(i, value)| {
if value {
Some(Symbol::terminal(i))
} else {
None
}
})
.chain(
self.external_bits
.iter()
.enumerate()
.filter_map(|(i, value)| {
if value {
Some(Symbol::external(i))
} else {
None
}
}),
)
.chain(if self.eof { Some(Symbol::end()) } else { None })
}
pub fn with(symbols: impl IntoIterator<Item = Symbol>) -> Self {
let mut result = Self::new();
for symbol in symbols {
result.insert(symbol);
}
result
}
pub fn contains(&self, symbol: &Symbol) -> bool {
match symbol.kind {
SymbolType::NonTerminal => panic!("Cannot store non-terminals in a LookaheadSet"),
SymbolType::Terminal => self.terminal_bits.get(symbol.index).unwrap_or(false),
SymbolType::External => self.external_bits.get(symbol.index).unwrap_or(false),
SymbolType::End => self.eof,
}
}
pub fn insert(&mut self, other: Symbol) {
let vec = match other.kind {
SymbolType::NonTerminal => panic!("Cannot store non-terminals in a LookaheadSet"),
SymbolType::Terminal => &mut self.terminal_bits,
SymbolType::External => &mut self.external_bits,
SymbolType::End => {
self.eof = true;
return;
}
};
if other.index >= vec.len() {
vec.resize(other.index + 1, false);
}
vec.set(other.index, true);
}
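// Insert every symbol from `other` into this set, returning true if any symbol
// was newly added.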
pub fn insert_all(&mut self, other: &LookaheadSet) -> bool {
let mut result = false;
if other.terminal_bits.len() > self.terminal_bits.len() {
self.terminal_bits.resize(other.terminal_bits.len(), false);
}
if other.external_bits.len() > self.external_bits.len() {
self.external_bits.resize(other.external_bits.len(), false);
}
for (i, element) in other.terminal_bits.iter().enumerate() {
if element {
result |= !self.terminal_bits[i];
self.terminal_bits.set(i, element);
}
}
for (i, element) in other.external_bits.iter().enumerate() {
if element {
result |= !self.external_bits[i];
self.external_bits.set(i, element);
}
}
if other.eof {
result |= !self.eof;
self.eof = true;
}
result
}
}
impl<'a> ParseItem<'a> {
pub fn start() -> Self {
ParseItem {
variable_index: u32::MAX,
production: &START_PRODUCTION,
step_index: 0,
}
}
pub fn step(&self) -> Option<&'a ProductionStep> {
self.production.steps.get(self.step_index as usize)
}
pub fn symbol(&self) -> Option<Symbol> {
self.step().map(|step| step.symbol)
}
pub fn associativity(&self) -> Option<Associativity> {
self.prev_step().and_then(|step| step.associativity)
}
pub fn precedence(&self) -> i32 {
self.prev_step().map_or(0, |step| step.precedence)
}
pub fn prev_step(&self) -> Option<&'a ProductionStep> {
if self.step_index > 0 {
Some(&self.production.steps[self.step_index as usize - 1])
} else {
None
}
}
pub fn is_done(&self) -> bool {
self.step_index as usize == self.production.steps.len()
}
pub fn is_augmented(&self) -> bool {
self.variable_index == u32::MAX
}
pub fn successor(&self) -> ParseItem<'a> {
ParseItem {
variable_index: self.variable_index,
production: self.production,
step_index: self.step_index + 1,
}
}
}
impl<'a> ParseItemSet<'a> {
pub fn with(elements: impl IntoIterator<Item = (ParseItem<'a>, LookaheadSet)>) -> Self {
let mut result = Self::default();
for (item, lookaheads) in elements {
result.entries.insert(item, lookaheads);
}
result
}
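// Produce a signature for the items in this set, ignoring their lookaheads.
// This signature is used later to group parse states that may be mergeable.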
pub fn hash_unfinished_items(&self, h: &mut impl Hasher) {
let mut previous_variable_index = u32::MAX;
let mut previous_step_index = u32::MAX;
for item in self.entries.keys() {
if item.step().is_none() && item.variable_index != previous_variable_index
|| item.step_index != previous_step_index
{
h.write_u32(item.variable_index);
h.write_u32(item.step_index);
previous_variable_index = item.variable_index;
previous_step_index = item.step_index;
}
}
}
}
impl<'a> Default for ParseItemSet<'a> {
fn default() -> Self {
Self {
entries: BTreeMap::new(),
}
}
}
#[allow(dead_code)]
impl<'a> fmt::Display for ParseItemDisplay<'a> {
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
if self.0.is_augmented() {
write!(f, "START →")?;
} else {
write!(
f,
"{} →",
&self.1.variables[self.0.variable_index as usize].name
)?;
}
for (i, step) in self.0.production.steps.iter().enumerate() {
if i == self.0.step_index as usize {
write!(f, "")?;
if step.precedence != 0 || step.associativity.is_some() {
write!(
f,
" (prec {:?} assoc {:?})",
step.precedence, step.associativity
)?;
}
}
write!(f, " ")?;
if step.symbol.is_terminal() {
if let Some(variable) = self.2.variables.get(step.symbol.index) {
write!(f, "{}", &variable.name)?;
} else {
write!(f, "{}-{}", "terminal", step.symbol.index)?;
}
} else if step.symbol.is_external() {
write!(f, "{}", &self.1.external_tokens[step.symbol.index].name)?;
} else {
write!(f, "{}", &self.1.variables[step.symbol.index].name)?;
}
if let Some(alias) = &step.alias {
write!(f, " (alias {})", alias.value)?;
}
}
if self.0.is_done() {
write!(f, "")?;
if let Some(step) = self.0.production.steps.last() {
if step.precedence != 0 || step.associativity.is_some() {
write!(
f,
" (prec {:?} assoc {:?})",
step.precedence, step.associativity
)?;
}
}
}
Ok(())
}
}
impl<'a> fmt::Display for LookaheadSetDisplay<'a> {
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
write!(f, "[")?;
for (i, symbol) in self.0.iter().enumerate() {
if i > 0 {
write!(f, ", ")?;
}
if symbol.is_terminal() {
if let Some(variable) = self.2.variables.get(symbol.index) {
write!(f, "{}", &variable.name)?;
} else {
write!(f, "{}-{}", "terminal", symbol.index)?;
}
} else if symbol.is_external() {
write!(f, "{}", &self.1.external_tokens[symbol.index].name)?;
} else {
write!(f, "{}", &self.1.variables[symbol.index].name)?;
}
}
write!(f, "]")?;
Ok(())
}
}
impl<'a> fmt::Display for ParseItemSetDisplay<'a> {
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
for (item, lookaheads) in self.0.entries.iter() {
writeln!(
f,
"{}\t{}",
ParseItemDisplay(item, self.1, self.2),
LookaheadSetDisplay(lookaheads, self.1, self.2)
)?;
}
Ok(())
}
}
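// Parse items are hashed and compared by content rather than by the identity of
// their production. For steps before the current position, only the alias
// matters; steps at or after the position are compared in full.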
impl<'a> Hash for ParseItem<'a> {
fn hash<H: Hasher>(&self, hasher: &mut H) {
hasher.write_u32(self.variable_index);
hasher.write_u32(self.step_index);
hasher.write_i32(self.production.dynamic_precedence);
hasher.write_usize(self.production.steps.len());
hasher.write_i32(self.precedence());
self.associativity().hash(hasher);
for step in &self.production.steps[0..self.step_index as usize] {
step.alias.hash(hasher);
}
for step in &self.production.steps[self.step_index as usize..] {
step.hash(hasher);
}
}
}
impl<'a> PartialEq for ParseItem<'a> {
fn eq(&self, other: &Self) -> bool {
if self.variable_index != other.variable_index
|| self.step_index != other.step_index
|| self.production.dynamic_precedence != other.production.dynamic_precedence
|| self.production.steps.len() != other.production.steps.len()
|| self.precedence() != other.precedence()
|| self.associativity() != other.associativity()
{
return false;
}
for (i, step) in self.production.steps.iter().enumerate() {
if i < self.step_index as usize {
if step.alias != other.production.steps[i].alias {
return false;
}
} else {
if *step != other.production.steps[i] {
return false;
}
}
}
return true;
}
}
impl<'a> Ord for ParseItem<'a> {
fn cmp(&self, other: &Self) -> Ordering {
let o = self.variable_index.cmp(&other.variable_index);
if o != Ordering::Equal {
return o;
}
let o = self.step_index.cmp(&other.step_index);
if o != Ordering::Equal {
return o;
}
let o = self
.production
.dynamic_precedence
.cmp(&other.production.dynamic_precedence);
if o != Ordering::Equal {
return o;
}
let o = self
.production
.steps
.len()
.cmp(&other.production.steps.len());
if o != Ordering::Equal {
return o;
}
let o = self.precedence().cmp(&other.precedence());
if o != Ordering::Equal {
return o;
}
let o = self.associativity().cmp(&other.associativity());
if o != Ordering::Equal {
return o;
}
for (i, step) in self.production.steps.iter().enumerate() {
let o = if i < self.step_index as usize {
step.alias.cmp(&other.production.steps[i].alias)
} else {
step.cmp(&other.production.steps[i])
};
if o != Ordering::Equal {
return o;
}
}
return Ordering::Equal;
}
}
impl<'a> PartialOrd for ParseItem<'a> {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl<'a> Eq for ParseItem<'a> {}
impl<'a> Hash for ParseItemSet<'a> {
fn hash<H: Hasher>(&self, hasher: &mut H) {
hasher.write_usize(self.entries.len());
for (item, lookaheads) in self.entries.iter() {
item.hash(hasher);
lookaheads.hash(hasher);
}
}
}

cli/src/build_tables/item_set_builder.rs Normal file
@@ -0,0 +1,330 @@
use super::item::{LookaheadSet, ParseItem, ParseItemDisplay, ParseItemSet};
use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar};
use crate::rules::Symbol;
use hashbrown::{HashMap, HashSet};
use std::fmt;
#[derive(Clone, Debug, PartialEq, Eq)]
struct TransitiveClosureAddition<'a> {
item: ParseItem<'a>,
info: FollowSetInfo,
}
#[derive(Clone, Debug, PartialEq, Eq)]
struct FollowSetInfo {
lookaheads: LookaheadSet,
propagates_lookaheads: bool,
}
pub(crate) struct ParseItemSetBuilder<'a> {
syntax_grammar: &'a SyntaxGrammar,
lexical_grammar: &'a LexicalGrammar,
first_sets: HashMap<Symbol, LookaheadSet>,
last_sets: HashMap<Symbol, LookaheadSet>,
inlines: &'a InlinedProductionMap,
transitive_closure_additions: Vec<Vec<TransitiveClosureAddition<'a>>>,
}
fn find_or_push<T: Eq>(vector: &mut Vec<T>, value: T) {
if !vector.contains(&value) {
vector.push(value);
}
}
impl<'a> ParseItemSetBuilder<'a> {
pub fn new(
syntax_grammar: &'a SyntaxGrammar,
lexical_grammar: &'a LexicalGrammar,
inlines: &'a InlinedProductionMap,
) -> Self {
let mut result = Self {
syntax_grammar,
lexical_grammar,
first_sets: HashMap::new(),
last_sets: HashMap::new(),
inlines,
transitive_closure_additions: vec![Vec::new(); syntax_grammar.variables.len()],
};
// For each grammar symbol, populate the FIRST and LAST sets: the set of
// terminals that appear at the beginning and end of that symbol's productions,
// respectively.
//
// For a terminal symbol, the FIRST and LAST set just consists of the
// terminal itself.
for i in 0..lexical_grammar.variables.len() {
let symbol = Symbol::terminal(i);
let mut set = LookaheadSet::new();
set.insert(symbol);
result.first_sets.insert(symbol, set.clone());
result.last_sets.insert(symbol, set);
}
for i in 0..syntax_grammar.external_tokens.len() {
let symbol = Symbol::external(i);
let mut set = LookaheadSet::new();
set.insert(symbol);
result.first_sets.insert(symbol, set.clone());
result.last_sets.insert(symbol, set);
}
// The FIRST set of a non-terminal `i` is the union of the following sets:
// * the set of all terminals that appear at the beginnings of i's productions
// * the FIRST sets of all the non-terminals that appear at the beginnings
// of i's productions
//
// Rather than computing these sets using recursion, we use an explicit stack
// called `symbols_to_process`.
let mut symbols_to_process = Vec::new();
let mut processed_non_terminals = HashSet::new();
for i in 0..syntax_grammar.variables.len() {
let symbol = Symbol::non_terminal(i);
let first_set = &mut result
.first_sets
.entry(symbol)
.or_insert(LookaheadSet::new());
processed_non_terminals.clear();
symbols_to_process.clear();
symbols_to_process.push(symbol);
while let Some(current_symbol) = symbols_to_process.pop() {
if current_symbol.is_terminal() || current_symbol.is_external() {
first_set.insert(current_symbol);
} else if processed_non_terminals.insert(current_symbol) {
for production in syntax_grammar.variables[current_symbol.index]
.productions
.iter()
{
if let Some(step) = production.steps.first() {
symbols_to_process.push(step.symbol);
}
}
}
}
// The LAST set is defined in a similar way to the FIRST set.
let last_set = &mut result
.last_sets
.entry(symbol)
.or_insert(LookaheadSet::new());
processed_non_terminals.clear();
symbols_to_process.clear();
symbols_to_process.push(symbol);
while let Some(current_symbol) = symbols_to_process.pop() {
if current_symbol.is_terminal() || current_symbol.is_external() {
last_set.insert(current_symbol);
} else if processed_non_terminals.insert(current_symbol) {
for production in syntax_grammar.variables[current_symbol.index]
.productions
.iter()
{
if let Some(step) = production.steps.last() {
symbols_to_process.push(step.symbol);
}
}
}
}
}
// To compute an item set's transitive closure, we find each item in the set
// whose next symbol is a non-terminal, and we add new items to the set for
// each of that symbol's productions. These productions might themselves begin
// with non-terminals, so the process continues recursively. In this process,
// the total set of entries that get added depends only on two things:
// * the set of non-terminal symbols that occur at each item's current position
// * the set of terminals that occur after each of these non-terminal symbols
//
// So we can avoid a lot of duplicated recursive work by precomputing, for each
// non-terminal symbol `i`, a final list of *additions* that must be made to an
// item set when `i` occurs as the next symbol in one of its core items. The
// structure of an *addition* is as follows:
// * `item` - the new item that must be added as part of the expansion of `i`
// * `lookaheads` - lookahead tokens that can always come after that item in
// the expansion of `i`
// * `propagates_lookaheads` - a boolean indicating whether or not `item` can
// occur at the *end* of the expansion of `i`, so that i's own current
// lookahead tokens can occur after `item`.
//
// Again, rather than computing these additions recursively, we use an explicit
// stack called `entries_to_process`.
for i in 0..syntax_grammar.variables.len() {
let empty_lookaheads = LookaheadSet::new();
let mut entries_to_process = vec![(i, &empty_lookaheads, true)];
// First, build up a map whose keys are all of the non-terminals that can
// appear at the beginning of non-terminal `i`, and whose values store
// information about the tokens that can follow each non-terminal.
let mut follow_set_info_by_non_terminal = HashMap::new();
while let Some(entry) = entries_to_process.pop() {
let (variable_index, lookaheads, propagates_lookaheads) = entry;
let existing_info = follow_set_info_by_non_terminal
.entry(variable_index)
.or_insert_with(|| FollowSetInfo {
lookaheads: LookaheadSet::new(),
propagates_lookaheads: false,
});
let did_add_follow_set_info;
if propagates_lookaheads {
did_add_follow_set_info = !existing_info.propagates_lookaheads;
existing_info.propagates_lookaheads = true;
} else {
did_add_follow_set_info = existing_info.lookaheads.insert_all(lookaheads);
}
if did_add_follow_set_info {
for production in &syntax_grammar.variables[variable_index].productions {
if let Some(symbol) = production.first_symbol() {
if symbol.is_non_terminal() {
if production.steps.len() == 1 {
entries_to_process.push((
symbol.index,
lookaheads,
propagates_lookaheads,
));
} else {
entries_to_process.push((
symbol.index,
&result.first_sets[&production.steps[1].symbol],
false,
));
}
}
}
}
}
}
// Store all of those non-terminals' productions, along with their associated
// lookahead info, as *additions* associated with non-terminal `i`.
let additions_for_non_terminal = &mut result.transitive_closure_additions[i];
for (variable_index, follow_set_info) in follow_set_info_by_non_terminal {
let variable = &syntax_grammar.variables[variable_index];
let non_terminal = Symbol::non_terminal(variable_index);
let variable_index = variable_index as u32;
if syntax_grammar.variables_to_inline.contains(&non_terminal) {
continue;
}
for production in &variable.productions {
let item = ParseItem {
variable_index,
production,
step_index: 0,
};
if let Some(inlined_productions) =
inlines.inlined_productions(item.production, item.step_index)
{
for production in inlined_productions {
find_or_push(
additions_for_non_terminal,
TransitiveClosureAddition {
item: ParseItem {
variable_index,
production,
step_index: item.step_index,
},
info: follow_set_info.clone(),
},
);
}
} else {
find_or_push(
additions_for_non_terminal,
TransitiveClosureAddition {
item,
info: follow_set_info.clone(),
},
);
}
}
}
}
result
}
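// Expand a core item set into its transitive closure, substituting inlined
// productions and applying the precomputed additions for each non-terminal.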
pub(crate) fn transitive_closure(&mut self, item_set: &ParseItemSet<'a>) -> ParseItemSet<'a> {
let mut result = ParseItemSet::default();
for (item, lookaheads) in &item_set.entries {
if let Some(productions) = self
.inlines
.inlined_productions(item.production, item.step_index)
{
for production in productions {
self.add_item(
&mut result,
ParseItem {
variable_index: item.variable_index,
production,
step_index: item.step_index,
},
lookaheads,
);
}
} else {
self.add_item(&mut result, *item, lookaheads);
}
}
result
}
pub fn first_set(&self, symbol: &Symbol) -> &LookaheadSet {
&self.first_sets[symbol]
}
pub fn last_set(&self, symbol: &Symbol) -> &LookaheadSet {
&self.last_sets[symbol]
}
fn add_item(&self, set: &mut ParseItemSet<'a>, item: ParseItem<'a>, lookaheads: &LookaheadSet) {
if let Some(step) = item.step() {
if step.symbol.is_non_terminal() {
let next_step = item.successor().step();
// Determine which tokens can follow this non-terminal.
let following_tokens = if let Some(next_step) = next_step {
self.first_sets.get(&next_step.symbol).unwrap()
} else {
lookaheads
};
// Use the pre-computed *additions* to expand the non-terminal.
for addition in &self.transitive_closure_additions[step.symbol.index] {
let lookaheads = set
.entries
.entry(addition.item)
.or_insert_with(|| LookaheadSet::new());
lookaheads.insert_all(&addition.info.lookaheads);
if addition.info.propagates_lookaheads {
lookaheads.insert_all(following_tokens);
}
}
}
}
set.entries.insert(item, lookaheads.clone());
}
}
impl<'a> fmt::Debug for ParseItemSetBuilder<'a> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "ParseItemSetBuilder {{\n")?;
write!(f, " additions: {{\n")?;
for (i, variable) in self.syntax_grammar.variables.iter().enumerate() {
write!(f, " {}: {{\n", variable.name)?;
for addition in &self.transitive_closure_additions[i] {
write!(
f,
" {}\n",
ParseItemDisplay(&addition.item, self.syntax_grammar, self.lexical_grammar)
)?;
}
write!(f, " }},\n")?;
}
write!(f, " }},")?;
write!(f, "}}")?;
Ok(())
}
}

cli/src/build_tables/minimize_parse_table.rs Normal file
@@ -0,0 +1,281 @@
use super::item::LookaheadSet;
use super::token_conflicts::TokenConflictMap;
use crate::grammars::{SyntaxGrammar, VariableType};
use crate::rules::{AliasMap, Symbol};
use crate::tables::{ParseAction, ParseState, ParseTable, ParseTableEntry};
use hashbrown::{HashMap, HashSet};
pub(crate) fn minimize_parse_table(
parse_table: &mut ParseTable,
syntax_grammar: &SyntaxGrammar,
simple_aliases: &AliasMap,
token_conflict_map: &TokenConflictMap,
keywords: &LookaheadSet,
) {
let mut minimizer = Minimizer {
parse_table,
syntax_grammar,
token_conflict_map,
keywords,
simple_aliases,
};
minimizer.remove_unit_reductions();
minimizer.merge_compatible_states();
minimizer.remove_unused_states();
}
struct Minimizer<'a> {
parse_table: &'a mut ParseTable,
syntax_grammar: &'a SyntaxGrammar,
token_conflict_map: &'a TokenConflictMap<'a>,
keywords: &'a LookaheadSet,
simple_aliases: &'a AliasMap,
}
impl<'a> Minimizer<'a> {
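// Find states whose only actions are unit reductions of hidden, unaliased
// symbols, and redirect every reference to such a state straight to the state
// that would be reached after the reduction.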
fn remove_unit_reductions(&mut self) {
let mut aliased_symbols = HashSet::new();
for variable in &self.syntax_grammar.variables {
for production in &variable.productions {
for step in &production.steps {
if step.alias.is_some() {
aliased_symbols.insert(step.symbol);
}
}
}
}
let mut unit_reduction_symbols_by_state = HashMap::new();
for (i, state) in self.parse_table.states.iter().enumerate() {
let mut only_unit_reductions = true;
let mut unit_reduction_symbol = None;
for (_, entry) in &state.terminal_entries {
for action in &entry.actions {
match action {
ParseAction::ShiftExtra => continue,
ParseAction::Reduce {
child_count: 1,
alias_sequence_id: 0,
symbol,
..
} => {
if !self.simple_aliases.contains_key(&symbol)
&& !aliased_symbols.contains(&symbol)
&& self.syntax_grammar.variables[symbol.index].kind
!= VariableType::Named
&& (unit_reduction_symbol.is_none()
|| unit_reduction_symbol == Some(symbol))
{
unit_reduction_symbol = Some(symbol);
continue;
}
}
_ => {}
}
only_unit_reductions = false;
break;
}
if !only_unit_reductions {
break;
}
}
if let Some(symbol) = unit_reduction_symbol {
if only_unit_reductions {
unit_reduction_symbols_by_state.insert(i, *symbol);
}
}
}
for state in self.parse_table.states.iter_mut() {
let mut done = false;
while !done {
done = true;
state.update_referenced_states(|other_state_id, state| {
if let Some(symbol) = unit_reduction_symbols_by_state.get(&other_state_id) {
done = false;
state.nonterminal_entries[symbol]
} else {
other_state_id
}
})
}
}
}
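// Group states by the signature of their unfinished items, then repeatedly try
// to merge states within each group, rewriting references after every pass.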
fn merge_compatible_states(&mut self) {
let mut state_ids_by_signature = HashMap::new();
for (i, state) in self.parse_table.states.iter().enumerate() {
state_ids_by_signature
.entry(state.unfinished_item_signature)
.or_insert(Vec::new())
.push(i);
}
let mut deleted_states = HashSet::new();
loop {
let mut state_replacements = HashMap::new();
for (_, state_ids) in &state_ids_by_signature {
for i in state_ids {
for j in state_ids {
if j == i {
break;
}
if deleted_states.contains(j) || deleted_states.contains(i) {
continue;
}
if self.merge_parse_state(*j, *i) {
deleted_states.insert(*i);
state_replacements.insert(*i, *j);
}
}
}
}
if state_replacements.is_empty() {
break;
}
for state in self.parse_table.states.iter_mut() {
state.update_referenced_states(|other_state_id, _| {
*state_replacements
.get(&other_state_id)
.unwrap_or(&other_state_id)
});
}
}
}
fn merge_parse_state(&mut self, left: usize, right: usize) -> bool {
let left_state = &self.parse_table.states[left];
let right_state = &self.parse_table.states[right];
if left_state.nonterminal_entries != right_state.nonterminal_entries {
return false;
}
for (symbol, left_entry) in &left_state.terminal_entries {
if let Some(right_entry) = right_state.terminal_entries.get(symbol) {
if right_entry.actions != left_entry.actions {
return false;
}
} else if !self.can_add_entry_to_state(right_state, *symbol, left_entry) {
return false;
}
}
let mut symbols_to_add = Vec::new();
for (symbol, right_entry) in &right_state.terminal_entries {
if !left_state.terminal_entries.contains_key(&symbol) {
if !self.can_add_entry_to_state(left_state, *symbol, right_entry) {
return false;
}
symbols_to_add.push(*symbol);
}
}
for symbol in symbols_to_add {
let entry = self.parse_table.states[right].terminal_entries[&symbol].clone();
self.parse_table.states[left]
.terminal_entries
.insert(symbol, entry);
}
true
}
fn can_add_entry_to_state(
&self,
state: &ParseState,
token: Symbol,
entry: &ParseTableEntry,
) -> bool {
// Do not add external tokens; they could conflict lexically with any of the state's
// existing lookahead tokens.
if token.is_external() {
return false;
}
// Only merge parse states by allowing existing reductions to happen with
// additional lookahead tokens. Do not alter parse states in ways that allow
// entirely new types of actions to happen.
if state.terminal_entries.iter().all(|(_, e)| e != entry) {
return false;
}
match entry.actions.last() {
Some(ParseAction::Reduce { .. }) => {}
_ => return false,
}
// Do not add tokens which are both internal and external. Their validity could
// influence the behavior of the external scanner.
if self
.syntax_grammar
.external_tokens
.iter()
.any(|t| t.corresponding_internal_token == Some(token))
{
return false;
}
let is_word_token = self.syntax_grammar.word_token == Some(token);
let is_keyword = self.keywords.contains(&token);
// Do not add a token if it conflicts with an existing token.
if token.is_terminal() {
for existing_token in state.terminal_entries.keys() {
if (is_word_token && self.keywords.contains(existing_token))
|| is_keyword && self.syntax_grammar.word_token.as_ref() == Some(existing_token)
{
continue;
}
if self
.token_conflict_map
.does_conflict(token.index, existing_token.index)
|| self
.token_conflict_map
.does_match_same_string(token.index, existing_token.index)
{
return false;
}
}
}
true
}
fn remove_unused_states(&mut self) {
let mut state_usage_map = vec![false; self.parse_table.states.len()];
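// The error state (0) and the start state (1) are always considered used.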
state_usage_map[0] = true;
state_usage_map[1] = true;
for state in &self.parse_table.states {
for referenced_state in state.referenced_states() {
state_usage_map[referenced_state] = true;
}
}
let mut removed_predecessor_count = 0;
let mut state_replacement_map = vec![0; self.parse_table.states.len()];
for state_id in 0..self.parse_table.states.len() {
state_replacement_map[state_id] = state_id - removed_predecessor_count;
if !state_usage_map[state_id] {
removed_predecessor_count += 1;
}
}
let mut state_id = 0;
let mut original_state_id = 0;
while state_id < self.parse_table.states.len() {
if state_usage_map[original_state_id] {
self.parse_table.states[state_id].update_referenced_states(|other_state_id, _| {
state_replacement_map[other_state_id]
});
state_id += 1;
} else {
self.parse_table.states.remove(state_id);
}
original_state_id += 1;
}
}
}

cli/src/build_tables/mod.rs Normal file
@@ -0,0 +1,285 @@
mod build_lex_table;
mod build_parse_table;
mod coincident_tokens;
mod item;
mod item_set_builder;
mod minimize_parse_table;
mod token_conflicts;
use self::build_lex_table::build_lex_table;
use self::build_parse_table::build_parse_table;
use self::coincident_tokens::CoincidentTokenIndex;
use self::item::LookaheadSet;
use self::minimize_parse_table::minimize_parse_table;
use self::token_conflicts::TokenConflictMap;
use crate::error::Result;
use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar};
use crate::nfa::{CharacterSet, NfaCursor};
use crate::rules::{AliasMap, Symbol};
use crate::tables::{LexTable, ParseAction, ParseTable, ParseTableEntry};
pub(crate) fn build_tables(
syntax_grammar: &SyntaxGrammar,
lexical_grammar: &LexicalGrammar,
simple_aliases: &AliasMap,
inlines: &InlinedProductionMap,
minimize: bool,
state_ids_to_log: Vec<usize>,
) -> Result<(ParseTable, LexTable, LexTable, Option<Symbol>)> {
let (mut parse_table, following_tokens) =
build_parse_table(syntax_grammar, lexical_grammar, inlines, state_ids_to_log)?;
let token_conflict_map = TokenConflictMap::new(lexical_grammar, following_tokens);
let coincident_token_index = CoincidentTokenIndex::new(&parse_table, lexical_grammar);
let keywords = identify_keywords(
lexical_grammar,
&parse_table,
syntax_grammar.word_token,
&token_conflict_map,
&coincident_token_index,
);
populate_error_state(
&mut parse_table,
syntax_grammar,
lexical_grammar,
&coincident_token_index,
&token_conflict_map,
);
mark_fragile_tokens(
&mut parse_table,
lexical_grammar,
&token_conflict_map,
);
if minimize {
minimize_parse_table(
&mut parse_table,
syntax_grammar,
simple_aliases,
&token_conflict_map,
&keywords,
);
}
let (main_lex_table, keyword_lex_table) = build_lex_table(
&mut parse_table,
syntax_grammar,
lexical_grammar,
&keywords,
minimize,
);
Ok((
parse_table,
main_lex_table,
keyword_lex_table,
syntax_grammar.word_token,
))
}
fn populate_error_state(
parse_table: &mut ParseTable,
syntax_grammar: &SyntaxGrammar,
lexical_grammar: &LexicalGrammar,
coincident_token_index: &CoincidentTokenIndex,
token_conflict_map: &TokenConflictMap,
) {
let state = &mut parse_table.states[0];
let n = lexical_grammar.variables.len();
// First identify the *conflict-free tokens*: tokens that do not overlap with
// any other token in any way.
let conflict_free_tokens = LookaheadSet::with((0..n).into_iter().filter_map(|i| {
let conflicts_with_other_tokens = (0..n).into_iter().any(|j| {
j != i
&& !coincident_token_index.contains(Symbol::terminal(i), Symbol::terminal(j))
&& token_conflict_map.does_conflict(i, j)
});
if conflicts_with_other_tokens {
None
} else {
info!(
"error recovery - token {} has no conflicts",
lexical_grammar.variables[i].name
);
Some(Symbol::terminal(i))
}
}));
let recover_entry = ParseTableEntry {
reusable: false,
actions: vec![ParseAction::Recover],
};
// Exclude from the error-recovery state any token that conflicts with one of
// the *conflict-free tokens* identified above.
for i in 0..n {
let symbol = Symbol::terminal(i);
if !conflict_free_tokens.contains(&symbol) {
if syntax_grammar.word_token != Some(symbol) {
if let Some(t) = conflict_free_tokens.iter().find(|t| {
!coincident_token_index.contains(symbol, *t)
&& token_conflict_map.does_conflict(symbol.index, t.index)
}) {
info!(
"error recovery - exclude token {} because of conflict with {}",
lexical_grammar.variables[i].name, lexical_grammar.variables[t.index].name
);
continue;
}
}
}
info!(
"error recovery - include token {}",
lexical_grammar.variables[i].name
);
state
.terminal_entries
.entry(symbol)
.or_insert_with(|| recover_entry.clone());
}
for (i, external_token) in syntax_grammar.external_tokens.iter().enumerate() {
if external_token.corresponding_internal_token.is_none() {
state
.terminal_entries
.entry(Symbol::external(i))
.or_insert_with(|| recover_entry.clone());
}
}
state.terminal_entries.insert(Symbol::end(), recover_entry);
}
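/// Find the tokens that can be lexed by the keyword lex function instead of the
/// main lex function. Candidates must consume only letters and underscores and
/// must match some string that the word token also matches; candidates are then
/// dropped if they shadow another candidate, or if replacing them with the word
/// token would change their conflict status with some non-keyword token.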
fn identify_keywords(
lexical_grammar: &LexicalGrammar,
parse_table: &ParseTable,
word_token: Option<Symbol>,
token_conflict_map: &TokenConflictMap,
coincident_token_index: &CoincidentTokenIndex,
) -> LookaheadSet {
if word_token.is_none() {
return LookaheadSet::new();
}
let word_token = word_token.unwrap();
let mut cursor = NfaCursor::new(&lexical_grammar.nfa, Vec::new());
// First find all of the candidate keyword tokens: tokens that start with
// letters or underscore and can match the same string as a word token.
let keywords = LookaheadSet::with(lexical_grammar.variables.iter().enumerate().filter_map(
|(i, variable)| {
cursor.reset(vec![variable.start_state]);
if all_chars_are_alphabetical(&cursor)
&& token_conflict_map.does_match_same_string(i, word_token.index)
{
info!(
"Keywords - add candidate {}",
lexical_grammar.variables[i].name
);
Some(Symbol::terminal(i))
} else {
None
}
},
));
// Exclude keyword candidates that shadow another keyword candidate.
let keywords = LookaheadSet::with(keywords.iter().filter(|token| {
for other_token in keywords.iter() {
if other_token != *token
&& token_conflict_map.does_match_same_string(token.index, other_token.index)
{
info!(
"Keywords - exclude {} because it matches the same string as {}",
lexical_grammar.variables[token.index].name,
lexical_grammar.variables[other_token.index].name
);
return false;
}
}
true
}));
// Exclude keyword candidates for which substituting the keyword capture
// token would introduce new lexical conflicts with other tokens.
let keywords = LookaheadSet::with(keywords.iter().filter(|token| {
for other_index in 0..lexical_grammar.variables.len() {
if keywords.contains(&Symbol::terminal(other_index)) {
continue;
}
// If the word token was already valid in every state containing
// this keyword candidate, then substituting the word token won't
// introduce any new lexical conflicts.
if coincident_token_index
.states_with(*token, Symbol::terminal(other_index))
.iter()
.all(|state_id| {
parse_table.states[*state_id]
.terminal_entries
.contains_key(&word_token)
})
{
continue;
}
if !token_conflict_map.has_same_conflict_status(
token.index,
word_token.index,
other_index,
) {
info!(
"Keywords - exclude {} because of conflict with {}",
lexical_grammar.variables[token.index].name,
lexical_grammar.variables[other_index].name
);
return false;
}
}
info!(
"Keywords - include {}",
lexical_grammar.variables[token.index].name,
);
true
}));
keywords
}
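/// Mark a parse-table entry as non-reusable if its token overlaps with some
/// other token that is valid in the same state, since the token could then be
/// scanned differently depending on which lookahead tokens are valid.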
fn mark_fragile_tokens(
parse_table: &mut ParseTable,
lexical_grammar: &LexicalGrammar,
token_conflict_map: &TokenConflictMap,
) {
let n = lexical_grammar.variables.len();
let mut valid_tokens_mask = Vec::with_capacity(n);
for state in parse_table.states.iter_mut() {
valid_tokens_mask.clear();
valid_tokens_mask.resize(n, false);
for token in state.terminal_entries.keys() {
if token.is_terminal() {
valid_tokens_mask[token.index] = true;
}
}
for (token, entry) in state.terminal_entries.iter_mut() {
for i in 0..n {
if token_conflict_map.does_overlap(i, token.index) {
if valid_tokens_mask[i] {
entry.reusable = false;
break;
}
}
}
}
}
}
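// True if every non-separator transition from the cursor's states consumes only
// alphabetic characters or underscores. Negated (Exclude) character sets are
// conservatively rejected.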
fn all_chars_are_alphabetical(cursor: &NfaCursor) -> bool {
cursor.transition_chars().all(|(chars, is_sep)| {
if is_sep {
true
} else if let CharacterSet::Include(chars) = chars {
chars.iter().all(|c| c.is_alphabetic() || *c == '_')
} else {
false
}
})
}

@@ -0,0 +1,382 @@
use crate::build_tables::item::LookaheadSet;
use crate::grammars::LexicalGrammar;
use crate::nfa::{CharacterSet, NfaCursor, NfaTransition};
use hashbrown::HashSet;
use std::cmp::Ordering;
use std::fmt;
#[derive(Clone, Debug, Default, PartialEq, Eq)]
struct TokenConflictStatus {
does_overlap: bool,
does_match_valid_continuation: bool,
does_match_separators: bool,
matches_same_string: bool,
}
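/// Pairwise conflict information for all tokens in a lexical grammar.
/// `status_matrix` is an n x n row-major matrix: the entry at (i, j) describes
/// how token i can interfere with token j. The two character-set vectors record
/// which characters can start each token and which characters can follow it.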
pub(crate) struct TokenConflictMap<'a> {
n: usize,
status_matrix: Vec<TokenConflictStatus>,
starting_chars_by_index: Vec<CharacterSet>,
following_chars_by_index: Vec<CharacterSet>,
grammar: &'a LexicalGrammar,
}
impl<'a> TokenConflictMap<'a> {
pub fn new(grammar: &'a LexicalGrammar, following_tokens: Vec<LookaheadSet>) -> Self {
let mut cursor = NfaCursor::new(&grammar.nfa, Vec::new());
let starting_chars = get_starting_chars(&mut cursor, grammar);
let following_chars = get_following_chars(&starting_chars, following_tokens);
let n = grammar.variables.len();
let mut status_matrix = vec![TokenConflictStatus::default(); n * n];
for i in 0..grammar.variables.len() {
for j in 0..i {
let status = compute_conflict_status(&mut cursor, grammar, &following_chars, i, j);
status_matrix[matrix_index(n, i, j)] = status.0;
status_matrix[matrix_index(n, j, i)] = status.1;
}
}
TokenConflictMap {
n,
status_matrix,
starting_chars_by_index: starting_chars,
following_chars_by_index: following_chars,
grammar,
}
}
pub fn has_same_conflict_status(&self, a: usize, b: usize, other: usize) -> bool {
let left = &self.status_matrix[matrix_index(self.n, a, other)];
let right = &self.status_matrix[matrix_index(self.n, b, other)];
left == right
}
pub fn does_match_same_string(&self, i: usize, j: usize) -> bool {
self.status_matrix[matrix_index(self.n, i, j)].matches_same_string
}
pub fn does_conflict(&self, i: usize, j: usize) -> bool {
let entry = &self.status_matrix[matrix_index(self.n, i, j)];
entry.does_match_valid_continuation || entry.does_match_separators
}
pub fn does_overlap(&self, i: usize, j: usize) -> bool {
self.status_matrix[matrix_index(self.n, i, j)].does_overlap
}
pub fn prefer_token(grammar: &LexicalGrammar, left: (i32, usize), right: (i32, usize)) -> bool {
if left.0 > right.0 {
return true;
} else if left.0 < right.0 {
return false;
}
match grammar.variables[left.1]
.implicit_precedence
.cmp(&grammar.variables[right.1].implicit_precedence)
{
Ordering::Less => false,
Ordering::Greater => true,
Ordering::Equal => left.1 < right.1,
}
}
}
impl<'a> fmt::Debug for TokenConflictMap<'a> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "TokenConflictMap {{\n")?;
write!(f, " starting_characters: {{\n")?;
for i in 0..self.n {
write!(f, " {}: {:?},\n", i, self.starting_chars_by_index[i])?;
}
write!(f, " }},\n")?;
write!(f, " following_characters: {{\n")?;
for i in 0..self.n {
write!(
f,
" {}: {:?},\n",
self.grammar.variables[i].name, self.following_chars_by_index[i]
)?;
}
write!(f, " }},\n")?;
write!(f, " status_matrix: {{\n")?;
for i in 0..self.n {
write!(f, " {}: {{\n", self.grammar.variables[i].name)?;
for j in 0..self.n {
write!(
f,
" {}: {:?},\n",
self.grammar.variables[j].name,
self.status_matrix[matrix_index(self.n, i, j)]
)?;
}
write!(f, " }},\n")?;
}
write!(f, " }},")?;
write!(f, "}}")?;
Ok(())
}
}
fn matrix_index(variable_count: usize, i: usize, j: usize) -> usize {
variable_count * i + j
}
fn get_starting_chars(cursor: &mut NfaCursor, grammar: &LexicalGrammar) -> Vec<CharacterSet> {
let mut result = Vec::with_capacity(grammar.variables.len());
for variable in &grammar.variables {
cursor.reset(vec![variable.start_state]);
let mut all_chars = CharacterSet::empty();
for (chars, _) in cursor.transition_chars() {
all_chars = all_chars.add(chars);
}
result.push(all_chars);
}
result
}
fn get_following_chars(
starting_chars: &Vec<CharacterSet>,
following_tokens: Vec<LookaheadSet>,
) -> Vec<CharacterSet> {
following_tokens
.into_iter()
.map(|following_tokens| {
let mut chars = CharacterSet::empty();
for token in following_tokens.iter() {
if token.is_terminal() {
chars = chars.add(&starting_chars[token.index]);
}
}
chars
})
.collect()
}
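// Walk the NFAs for tokens `i` and `j` simultaneously, visiting every reachable
// set of states. Along the way, record in each direction whether the two tokens
// can match the same string, and whether one token can keep matching beyond the
// point where the other would finish, via separators or via characters that
// could validly follow the finished token.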
fn compute_conflict_status(
cursor: &mut NfaCursor,
grammar: &LexicalGrammar,
following_chars: &Vec<CharacterSet>,
i: usize,
j: usize,
) -> (TokenConflictStatus, TokenConflictStatus) {
let mut visited_state_sets = HashSet::new();
let mut state_set_queue = vec![vec![
grammar.variables[i].start_state,
grammar.variables[j].start_state,
]];
let mut result = (
TokenConflictStatus::default(),
TokenConflictStatus::default(),
);
while let Some(state_set) = state_set_queue.pop() {
// Don't pursue states where there's no potential for conflict.
if variable_ids_for_states(&state_set, grammar).count() > 1 {
cursor.reset(state_set);
} else {
continue;
}
let mut completion = None;
for (id, precedence) in cursor.completions() {
if let Some((prev_id, prev_precedence)) = completion {
if id == prev_id {
continue;
}
// Prefer tokens with higher precedence. For tokens with equal precedence,
// prefer those listed earlier in the grammar.
let winning_id;
if TokenConflictMap::prefer_token(
grammar,
(prev_precedence, prev_id),
(precedence, id),
) {
winning_id = prev_id;
} else {
winning_id = id;
completion = Some((id, precedence));
}
if winning_id == i {
result.0.matches_same_string = true;
result.0.does_overlap = true;
} else {
result.1.matches_same_string = true;
result.1.does_overlap = true;
}
} else {
completion = Some((id, precedence));
}
}
for NfaTransition {
characters,
precedence,
states,
is_separator,
} in cursor.transitions()
{
let mut can_advance = true;
if let Some((completed_id, completed_precedence)) = completion {
let mut other_id = None;
let mut successor_contains_completed_id = false;
for variable_id in variable_ids_for_states(&states, grammar) {
if variable_id == completed_id {
successor_contains_completed_id = true;
break;
} else {
other_id = Some(variable_id);
}
}
if let (Some(other_id), false) = (other_id, successor_contains_completed_id) {
let winning_id;
if precedence < completed_precedence {
winning_id = completed_id;
can_advance = false;
} else {
winning_id = other_id;
}
if winning_id == i {
result.0.does_overlap = true;
if characters.does_intersect(&following_chars[j]) {
result.0.does_match_valid_continuation = true;
}
if is_separator {
result.0.does_match_separators = true;
}
} else {
result.1.does_overlap = true;
if characters.does_intersect(&following_chars[i]) {
result.1.does_match_valid_continuation = true;
}
}
}
}
if can_advance && visited_state_sets.insert(states.clone()) {
state_set_queue.push(states);
}
}
}
result
}
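// Yield the ids of the distinct lexical variables that own the given sorted
// list of NFA state ids, skipping consecutive duplicates.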
fn variable_ids_for_states<'a>(
state_ids: &'a Vec<u32>,
grammar: &'a LexicalGrammar,
) -> impl Iterator<Item = usize> + 'a {
let mut prev = None;
state_ids.iter().filter_map(move |state_id| {
let variable_id = grammar.variable_index_for_nfa_state(*state_id);
if prev != Some(variable_id) {
prev = Some(variable_id);
prev
} else {
None
}
})
}
#[cfg(test)]
mod tests {
use super::*;
use crate::grammars::{Variable, VariableType};
use crate::prepare_grammar::{expand_tokens, ExtractedLexicalGrammar};
use crate::rules::{Rule, Symbol};
#[test]
fn test_starting_characters() {
let grammar = expand_tokens(ExtractedLexicalGrammar {
separators: Vec::new(),
variables: vec![
Variable {
name: "token_0".to_string(),
kind: VariableType::Named,
rule: Rule::pattern("[a-f]1|0x\\d"),
},
Variable {
name: "token_1".to_string(),
kind: VariableType::Named,
rule: Rule::pattern("d*ef"),
},
],
})
.unwrap();
let token_map = TokenConflictMap::new(&grammar, Vec::new());
assert_eq!(
token_map.starting_chars_by_index[0],
CharacterSet::empty().add_range('a', 'f').add_char('0')
);
assert_eq!(
token_map.starting_chars_by_index[1],
CharacterSet::empty().add_range('d', 'e')
);
}
#[test]
fn test_token_conflicts() {
let grammar = expand_tokens(ExtractedLexicalGrammar {
separators: Vec::new(),
variables: vec![
Variable {
name: "in".to_string(),
kind: VariableType::Named,
rule: Rule::string("in"),
},
Variable {
name: "identifier".to_string(),
kind: VariableType::Named,
rule: Rule::pattern("\\w+"),
},
Variable {
name: "instanceof".to_string(),
kind: VariableType::Named,
rule: Rule::string("instanceof"),
},
],
})
.unwrap();
let var = |name| index_of_var(&grammar, name);
let token_map = TokenConflictMap::new(
&grammar,
vec![
LookaheadSet::with([Symbol::terminal(var("identifier"))].iter().cloned()),
LookaheadSet::with([Symbol::terminal(var("in"))].iter().cloned()),
LookaheadSet::with([Symbol::terminal(var("identifier"))].iter().cloned()),
],
);
// Given the string "in", the `in` token is preferred over the `identifier` token
assert!(token_map.does_match_same_string(var("in"), var("identifier")));
assert!(!token_map.does_match_same_string(var("identifier"), var("in")));
// Depending on what character follows, the string "in" may be treated as part of an
// `identifier` token.
assert!(token_map.does_conflict(var("identifier"), var("in")));
// Depending on what character follows, the string "instanceof" may be treated as part of
// an `identifier` token.
assert!(token_map.does_conflict(var("identifier"), var("instanceof")));
assert!(token_map.does_conflict(var("instanceof"), var("in")));
}
fn index_of_var(grammar: &LexicalGrammar, name: &str) -> usize {
grammar
.variables
.iter()
.position(|v| v.name == name)
.unwrap()
}
}

cli/src/error.rs Normal file
@@ -0,0 +1,24 @@
#[derive(Debug)]
pub struct Error(pub String);
pub type Result<T> = std::result::Result<T, Error>;
impl Error {
pub fn grammar(message: &str) -> Self {
Error(format!("Grammar error: {}", message))
}
pub fn regex(message: &str) -> Self {
Error(format!("Regex error: {}", message))
}
pub fn undefined_symbol(name: &str) -> Self {
Error(format!("Undefined symbol `{}`", name))
}
}
impl From<serde_json::Error> for Error {
fn from(error: serde_json::Error) -> Self {
Error(error.to_string())
}
}

cli/src/generate.rs Normal file
@@ -0,0 +1,34 @@
use crate::build_tables::build_tables;
use crate::error::Result;
use crate::parse_grammar::parse_grammar;
use crate::prepare_grammar::prepare_grammar;
use crate::render::render_c_code;
pub fn generate_parser_for_grammar(
input: &str,
minimize: bool,
state_ids_to_log: Vec<usize>,
) -> Result<String> {
let input_grammar = parse_grammar(input)?;
let (syntax_grammar, lexical_grammar, inlines, simple_aliases) =
prepare_grammar(&input_grammar)?;
let (parse_table, main_lex_table, keyword_lex_table, keyword_capture_token) = build_tables(
&syntax_grammar,
&lexical_grammar,
&simple_aliases,
&inlines,
minimize,
state_ids_to_log,
)?;
let c_code = render_c_code(
&input_grammar.name,
parse_table,
main_lex_table,
keyword_lex_table,
keyword_capture_token,
syntax_grammar,
lexical_grammar,
simple_aliases,
);
Ok(c_code)
}

cli/src/grammars.rs Normal file
@@ -0,0 +1,204 @@
use crate::nfa::Nfa;
use crate::rules::{Alias, Associativity, Rule, Symbol};
use hashbrown::HashMap;
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) enum VariableType {
Hidden,
Auxiliary,
Anonymous,
Named,
}
// Input grammar
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) struct Variable {
pub name: String,
pub kind: VariableType,
pub rule: Rule,
}
#[derive(Debug, PartialEq, Eq)]
pub(crate) struct InputGrammar {
pub name: String,
pub variables: Vec<Variable>,
pub extra_tokens: Vec<Rule>,
pub expected_conflicts: Vec<Vec<String>>,
pub external_tokens: Vec<Rule>,
pub variables_to_inline: Vec<String>,
pub word_token: Option<String>,
}
// Extracted lexical grammar
#[derive(Debug, PartialEq, Eq)]
pub(crate) struct LexicalVariable {
pub name: String,
pub kind: VariableType,
pub implicit_precedence: i32,
pub start_state: u32,
}
#[derive(Debug, Default, PartialEq, Eq)]
pub(crate) struct LexicalGrammar {
pub nfa: Nfa,
pub variables: Vec<LexicalVariable>,
}
// Extracted syntax grammar
#[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub(crate) struct ProductionStep {
pub symbol: Symbol,
pub precedence: i32,
pub associativity: Option<Associativity>,
pub alias: Option<Alias>,
}
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) struct Production {
pub steps: Vec<ProductionStep>,
pub dynamic_precedence: i32,
}
pub(crate) struct InlinedProductionMap {
pub productions: Vec<Production>,
pub production_map: HashMap<(*const Production, u32), Vec<usize>>,
}
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) struct SyntaxVariable {
pub name: String,
pub kind: VariableType,
pub productions: Vec<Production>,
}
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) struct ExternalToken {
pub name: String,
pub kind: VariableType,
pub corresponding_internal_token: Option<Symbol>,
}
#[derive(Debug)]
pub(crate) struct SyntaxGrammar {
pub variables: Vec<SyntaxVariable>,
pub extra_tokens: Vec<Symbol>,
pub expected_conflicts: Vec<Vec<Symbol>>,
pub external_tokens: Vec<ExternalToken>,
pub variables_to_inline: Vec<Symbol>,
pub word_token: Option<Symbol>,
}
#[cfg(test)]
impl ProductionStep {
pub(crate) fn new(symbol: Symbol) -> Self {
Self {
symbol,
precedence: 0,
associativity: None,
alias: None,
}
}
pub(crate) fn with_prec(self, precedence: i32, associativity: Option<Associativity>) -> Self {
Self {
symbol: self.symbol,
precedence,
associativity,
alias: self.alias,
}
}
pub(crate) fn with_alias(self, value: &str, is_named: bool) -> Self {
Self {
symbol: self.symbol,
precedence: self.precedence,
associativity: self.associativity,
alias: Some(Alias {
value: value.to_string(),
is_named,
}),
}
}
}
impl Production {
pub fn first_symbol(&self) -> Option<Symbol> {
self.steps.first().map(|s| s.symbol.clone())
}
}
impl Default for Production {
fn default() -> Self {
Production {
dynamic_precedence: 0,
steps: Vec::new(),
}
}
}
#[cfg(test)]
impl Variable {
pub fn named(name: &str, rule: Rule) -> Self {
Self {
name: name.to_string(),
kind: VariableType::Named,
rule,
}
}
pub fn auxiliary(name: &str, rule: Rule) -> Self {
Self {
name: name.to_string(),
kind: VariableType::Auxiliary,
rule,
}
}
pub fn hidden(name: &str, rule: Rule) -> Self {
Self {
name: name.to_string(),
kind: VariableType::Hidden,
rule,
}
}
pub fn anonymous(name: &str, rule: Rule) -> Self {
Self {
name: name.to_string(),
kind: VariableType::Anonymous,
rule,
}
}
}
impl LexicalGrammar {
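// Each variable's NFA states occupy a contiguous range of ids ending at its
// `start_state` (states are pushed sequentially in `expand_tokens`), so a state
// belongs to the first variable whose `start_state` is at least the given id.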
pub fn variable_index_for_nfa_state(&self, state_id: u32) -> usize {
self.variables.iter().position(|v| v.start_state >= state_id).unwrap()
}
}
impl SyntaxVariable {
pub fn is_auxiliary(&self) -> bool {
self.kind == VariableType::Auxiliary
}
}
impl InlinedProductionMap {
pub fn inlined_productions<'a>(
&'a self,
production: &Production,
step_index: u32,
) -> Option<impl Iterator<Item = &'a Production> + 'a> {
self.production_map
.get(&(production as *const Production, step_index))
.map(|production_indices| {
production_indices
.iter()
.cloned()
.map(move |index| &self.productions[index])
})
}
}

cli/src/js/dsl.js Normal file
@@ -0,0 +1,334 @@
const UNICODE_ESCAPE_PATTERN = /\\u([0-9a-f]{4})/gi;
const DELIMITER_ESCAPE_PATTERN = /\\\//g;
function alias(rule, value) {
const result = {
type: "ALIAS",
content: normalize(rule),
named: false,
value: null
};
switch (value.constructor) {
case String:
result.named = false;
result.value = value;
return result;
case ReferenceError:
result.named = true;
result.value = value.symbol.name;
return result;
case Object:
if (typeof value.type === 'string' && value.type === 'SYMBOL') {
result.named = true;
result.value = value.name;
return result;
}
}
throw new Error('Invalid alias value ' + value);
}
function blank() {
return {
type: "BLANK"
};
}
function choice(...elements) {
return {
type: "CHOICE",
members: elements.map(normalize)
};
}
function optional(value) {
return choice(value, blank());
}
function prec(number, rule) {
if (rule == null) {
rule = number;
number = 0;
}
return {
type: "PREC",
value: number,
content: normalize(rule)
};
}
prec.left = function(number, rule) {
if (rule == null) {
rule = number;
number = 0;
}
return {
type: "PREC_LEFT",
value: number,
content: normalize(rule)
};
}
prec.right = function(number, rule) {
if (rule == null) {
rule = number;
number = 0;
}
return {
type: "PREC_RIGHT",
value: number,
content: normalize(rule)
};
}
prec.dynamic = function(number, rule) {
return {
type: "PREC_DYNAMIC",
value: number,
content: normalize(rule)
};
}
function repeat(rule) {
return {
type: "REPEAT",
content: normalize(rule)
};
}
function repeat1(rule) {
return {
type: "REPEAT1",
content: normalize(rule)
};
}
function seq(...elements) {
return {
type: "SEQ",
members: elements.map(normalize)
};
}
function sym(name) {
return {
type: "SYMBOL",
name: name
};
}
function token(value) {
return {
type: "TOKEN",
content: normalize(value)
};
}
token.immediate = function(value) {
return {
type: "IMMEDIATE_TOKEN",
content: normalize(value)
};
}
function normalize(value) {
if (typeof value == "undefined")
throw new Error("Undefined symbol");
switch (value.constructor) {
case String:
return {
type: 'STRING',
value
};
case RegExp:
return {
type: 'PATTERN',
value: value.source
.replace(
DELIMITER_ESCAPE_PATTERN,
'/'
)
.replace(
UNICODE_ESCAPE_PATTERN,
(match, group) => String.fromCharCode(parseInt(group, 16))
)
};
case ReferenceError:
throw value
default:
if (typeof value.type === 'string') {
return value;
} else {
throw new TypeError("Invalid rule: " + value.toString());
}
}
}
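// A Proxy that turns property accesses like `$.statement` into SYMBOL rule
// nodes. When a rule map is supplied and the name is not a known rule, it
// returns a ReferenceError carrying the symbol; `normalize` will throw that
// error unless `alias` consumes it first.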
function RuleBuilder(ruleMap) {
return new Proxy({}, {
get(target, propertyName) {
const symbol = {
type: 'SYMBOL',
name: propertyName
};
if (!ruleMap || ruleMap.hasOwnProperty(propertyName)) {
return symbol;
} else {
const error = new ReferenceError(`Undefined symbol '${propertyName}'`);
error.symbol = symbol;
return error;
}
}
})
}
function grammar(baseGrammar, options) {
if (!options) {
options = baseGrammar;
baseGrammar = {
name: null,
rules: {},
extras: [normalize(/\s/)],
conflicts: [],
externals: [],
inline: []
};
}
let externals = baseGrammar.externals;
if (options.externals) {
if (typeof options.externals !== "function") {
throw new Error("Grammar's 'externals' property must be a function.");
}
const externalsRuleBuilder = RuleBuilder(null)
const externalRules = options.externals.call(externalsRuleBuilder, externalsRuleBuilder, baseGrammar.externals);
if (!Array.isArray(externalRules)) {
throw new Error("Grammar's 'externals' property must return an array of rules.");
}
externals = externalRules.map(normalize);
}
const ruleMap = {};
for (const key in options.rules) {
ruleMap[key] = true;
}
for (const key in baseGrammar.rules) {
ruleMap[key] = true;
}
for (const external of externals) {
if (typeof external.name === 'string') {
ruleMap[external.name] = true;
}
}
const ruleBuilder = RuleBuilder(ruleMap);
const name = options.name;
if (typeof name !== "string") {
throw new Error("Grammar's 'name' property must be a string.");
}
if (!/^[a-zA-Z_]\w*$/.test(name)) {
throw new Error("Grammar's 'name' property must not start with a digit and cannot contain non-word characters.");
}
let rules = Object.assign({}, baseGrammar.rules);
if (options.rules) {
if (typeof options.rules !== "object") {
throw new Error("Grammar's 'rules' property must be an object.");
}
for (const ruleName in options.rules) {
const ruleFn = options.rules[ruleName];
if (typeof ruleFn !== "function") {
throw new Error("Grammar rules must all be functions. '" + ruleName + "' rule is not.");
}
rules[ruleName] = normalize(ruleFn.call(ruleBuilder, ruleBuilder, baseGrammar.rules[ruleName]));
}
}
let extras = baseGrammar.extras.slice();
if (options.extras) {
if (typeof options.extras !== "function") {
throw new Error("Grammar's 'extras' property must be a function.");
}
extras = options.extras
.call(ruleBuilder, ruleBuilder, baseGrammar.extras)
.map(normalize);
}
let word = baseGrammar.word;
if (options.word) {
word = options.word.call(ruleBuilder, ruleBuilder).name;
if (typeof word != 'string') {
throw new Error("Grammar's 'word' property must be a named rule.");
}
}
let conflicts = baseGrammar.conflicts;
if (options.conflicts) {
if (typeof options.conflicts !== "function") {
throw new Error("Grammar's 'conflicts' property must be a function.");
}
const baseConflictRules = baseGrammar.conflicts.map(conflict => conflict.map(sym));
const conflictRules = options.conflicts.call(ruleBuilder, ruleBuilder, baseConflictRules);
if (!Array.isArray(conflictRules)) {
throw new Error("Grammar's conflicts must be an array of arrays of rules.");
}
conflicts = conflictRules.map(conflictSet => {
if (!Array.isArray(conflictSet)) {
throw new Error("Grammar's conflicts must be an array of arrays of rules.");
}
return conflictSet.map(symbol => symbol.name);
});
}
let inline = baseGrammar.inline;
if (options.inline) {
if (typeof options.inline !== "function") {
throw new Error("Grammar's 'inline' property must be a function.");
}
const baseInlineRules = baseGrammar.inline.map(sym);
const inlineRules = options.inline.call(ruleBuilder, ruleBuilder, baseInlineRules);
if (!Array.isArray(inlineRules)) {
throw new Error("Grammar's inline must be an array of rules.");
}
inline = inlineRules.map(symbol => symbol.name);
}
if (Object.keys(rules).length == 0) {
throw new Error("Grammar must have at least one rule.");
}
return {name, word, rules, extras, conflicts, externals, inline};
}
global.alias = alias;
global.blank = blank;
global.choice = choice;
global.optional = optional;
global.prec = prec;
global.repeat = repeat;
global.repeat1 = repeat1;
global.seq = seq;
global.sym = sym;
global.token = token;
global.grammar = grammar;

cli/src/logger.rs Normal file
@@ -0,0 +1,29 @@
use log::{LevelFilter, Log, Metadata, Record};
struct Logger {
pub filter: Option<String>,
}
impl Log for Logger {
fn enabled(&self, _: &Metadata) -> bool {
true
}
fn log(&self, record: &Record) {
eprintln!(
"[{}] {}",
record
.module_path()
.unwrap_or_default()
.trim_start_matches("rust_tree_sitter_cli::"),
record.args()
);
}
fn flush(&self) {}
}
pub(crate) fn init() {
log::set_boxed_logger(Box::new(Logger { filter: None })).unwrap();
log::set_max_level(LevelFilter::Info);
}

cli/src/main.rs Normal file
@@ -0,0 +1,119 @@
#[macro_use]
extern crate lazy_static;
#[macro_use]
extern crate log;
#[macro_use]
extern crate serde_derive;
extern crate hashbrown;
extern crate serde_json;
use clap::{App, Arg, SubCommand};
use std::env;
use std::io::Write;
use std::path::PathBuf;
use std::process::{exit, Command, Stdio};
use std::usize;
mod build_tables;
mod error;
mod generate;
mod grammars;
mod logger;
mod nfa;
mod parse_grammar;
mod prepare_grammar;
mod render;
mod rules;
mod tables;
fn main() {
if let Err(e) = run() {
eprintln!("{}", e.0);
exit(1);
}
}
fn run() -> error::Result<()> {
let matches = App::new("tree-sitter")
.version("0.1")
.author("Max Brunsfeld <maxbrunsfeld@gmail.com>")
.about("Generates and tests parsers")
.subcommand(
SubCommand::with_name("generate")
.about("Generate a parser")
.arg(Arg::with_name("log").long("log"))
.arg(
Arg::with_name("state-ids-to-log")
.long("log-state")
.takes_value(true),
)
.arg(Arg::with_name("no-minimize").long("no-minimize")),
)
.subcommand(
SubCommand::with_name("parse")
.about("Parse a file")
.arg(Arg::with_name("path").index(1)),
)
.subcommand(
SubCommand::with_name("test")
.about("Run a parser's tests")
.arg(Arg::with_name("path").index(1).required(true))
.arg(Arg::with_name("line").index(2).required(true))
.arg(Arg::with_name("column").index(3).required(true)),
)
.get_matches();
if let Some(matches) = matches.subcommand_matches("generate") {
if matches.is_present("log") {
logger::init();
}
let minimize = !matches.is_present("no-minimize");
let state_ids_to_log = matches
.values_of("state-ids-to-log")
.map_or(Vec::new(), |ids| {
ids.filter_map(|id| usize::from_str_radix(id, 10).ok())
.collect()
});
let mut grammar_path = env::current_dir().expect("Failed to read CWD");
grammar_path.push("grammar.js");
let grammar_json = load_js_grammar_file(grammar_path);
let code =
generate::generate_parser_for_grammar(&grammar_json, minimize, state_ids_to_log)?;
println!("{}", code);
}
Ok(())
}
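// Spawn `node`, pipe in the rule-building DSL followed by a `require` of the
// grammar file, and read the grammar's JSON representation from stdout.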
fn load_js_grammar_file(grammar_path: PathBuf) -> String {
let mut node_process = Command::new("node")
.stdin(Stdio::piped())
.stdout(Stdio::piped())
.spawn()
.expect("Failed to run `node`");
let js_prelude = include_str!("./js/dsl.js");
let mut node_stdin = node_process
.stdin
.take()
.expect("Failed to open stdin for node");
write!(
node_stdin,
"{}\nconsole.log(JSON.stringify(require(\"{}\"), null, 2));\n",
js_prelude,
grammar_path.to_str().unwrap()
)
.expect("Failed to write to node's stdin");
drop(node_stdin);
let output = node_process
.wait_with_output()
.expect("Failed to read output from node");
match output.status.code() {
None => panic!("Node process was killed"),
Some(0) => {}
Some(code) => panic!("Node process exited with status {}", code),
}
String::from_utf8(output.stdout).expect("Got invalid UTF8 from node")
}

cli/src/nfa.rs Normal file
@@ -0,0 +1,771 @@
use std::char;
use std::cmp::max;
use std::cmp::Ordering;
use std::fmt;
use std::mem::swap;
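// A set of characters, stored either as a sorted list of included characters or
// as the complement of a sorted list of excluded characters.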
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum CharacterSet {
Include(Vec<char>),
Exclude(Vec<char>),
}
#[derive(Debug, PartialEq, Eq)]
pub enum NfaState {
Advance {
chars: CharacterSet,
state_id: u32,
is_sep: bool,
precedence: i32,
},
Split(u32, u32),
Accept {
variable_index: usize,
precedence: i32,
},
}
#[derive(PartialEq, Eq)]
pub struct Nfa {
pub states: Vec<NfaState>,
}
#[derive(Debug)]
pub struct NfaCursor<'a> {
pub(crate) state_ids: Vec<u32>,
nfa: &'a Nfa,
}
#[derive(Debug, PartialEq, Eq)]
pub struct NfaTransition {
pub characters: CharacterSet,
pub is_separator: bool,
pub precedence: i32,
pub states: Vec<u32>,
}
impl Default for Nfa {
fn default() -> Self {
Self { states: Vec::new() }
}
}
impl CharacterSet {
pub fn empty() -> Self {
CharacterSet::Include(Vec::new())
}
pub fn all() -> Self {
CharacterSet::Exclude(Vec::new())
}
pub fn negate(self) -> CharacterSet {
match self {
CharacterSet::Include(chars) => CharacterSet::Exclude(chars),
CharacterSet::Exclude(chars) => CharacterSet::Include(chars),
}
}
pub fn add_char(self, c: char) -> Self {
if let CharacterSet::Include(mut chars) = self {
if let Err(i) = chars.binary_search(&c) {
chars.insert(i, c);
}
CharacterSet::Include(chars)
} else {
panic!("Called add with a negated character set");
}
}
pub fn add_range(self, start: char, end: char) -> Self {
if let CharacterSet::Include(mut chars) = self {
let mut c = start as u32;
while c <= end as u32 {
chars.push(char::from_u32(c).unwrap());
c += 1;
}
chars.sort_unstable();
chars.dedup();
CharacterSet::Include(chars)
} else {
panic!("Called add with a negated character set");
}
}
pub fn add(self, other: &CharacterSet) -> Self {
match self {
CharacterSet::Include(mut chars) => match other {
CharacterSet::Include(other_chars) => {
chars.extend(other_chars);
chars.sort_unstable();
chars.dedup();
CharacterSet::Include(chars)
}
CharacterSet::Exclude(other_chars) => {
let excluded_chars = other_chars
.iter()
.cloned()
.filter(|c| !chars.contains(&c))
.collect();
CharacterSet::Exclude(excluded_chars)
}
},
CharacterSet::Exclude(mut chars) => match other {
CharacterSet::Include(other_chars) => {
chars.retain(|c| !other_chars.contains(&c));
CharacterSet::Exclude(chars)
}
CharacterSet::Exclude(other_chars) => {
chars.retain(|c| other_chars.contains(&c));
CharacterSet::Exclude(chars)
}
},
}
}
pub fn does_intersect(&self, other: &CharacterSet) -> bool {
match self {
CharacterSet::Include(chars) => match other {
CharacterSet::Include(other_chars) => compare_chars(chars, other_chars).common,
CharacterSet::Exclude(other_chars) => compare_chars(chars, other_chars).left_only,
},
CharacterSet::Exclude(chars) => match other {
CharacterSet::Include(other_chars) => compare_chars(chars, other_chars).right_only,
CharacterSet::Exclude(_) => true,
},
}
}
pub fn remove_intersection(&mut self, other: &mut CharacterSet) -> CharacterSet {
match self {
CharacterSet::Include(chars) => match other {
CharacterSet::Include(other_chars) => {
CharacterSet::Include(remove_chars(chars, other_chars, true))
}
CharacterSet::Exclude(other_chars) => {
let mut removed = remove_chars(chars, other_chars, false);
add_chars(other_chars, chars);
swap(&mut removed, chars);
CharacterSet::Include(removed)
}
},
CharacterSet::Exclude(chars) => match other {
CharacterSet::Include(other_chars) => {
let mut removed = remove_chars(other_chars, chars, false);
add_chars(chars, other_chars);
swap(&mut removed, other_chars);
CharacterSet::Include(removed)
}
CharacterSet::Exclude(other_chars) => {
let mut result_exclusion = chars.clone();
result_exclusion.extend(other_chars.iter().cloned());
result_exclusion.sort_unstable();
result_exclusion.dedup();
remove_chars(chars, other_chars, true);
let mut included_characters = Vec::new();
let mut other_included_characters = Vec::new();
swap(&mut included_characters, other_chars);
swap(&mut other_included_characters, chars);
*self = CharacterSet::Include(included_characters);
*other = CharacterSet::Include(other_included_characters);
CharacterSet::Exclude(result_exclusion)
}
},
}
}
pub fn is_empty(&self) -> bool {
if let CharacterSet::Include(c) = self {
c.is_empty()
} else {
false
}
}
pub fn contains(&self, c: char) -> bool {
match self {
CharacterSet::Include(chars) => chars.contains(&c),
CharacterSet::Exclude(chars) => !chars.contains(&c),
}
}
}
impl Ord for CharacterSet {
fn cmp(&self, other: &CharacterSet) -> Ordering {
match self {
CharacterSet::Include(chars) => {
if let CharacterSet::Include(other_chars) = other {
order_chars(chars, other_chars)
} else {
Ordering::Less
}
}
CharacterSet::Exclude(chars) => {
if let CharacterSet::Exclude(other_chars) = other {
order_chars(chars, other_chars)
} else {
Ordering::Greater
}
}
}
}
}
impl PartialOrd for CharacterSet {
fn partial_cmp(&self, other: &CharacterSet) -> Option<Ordering> {
Some(self.cmp(other))
}
}
fn add_chars(left: &mut Vec<char>, right: &Vec<char>) {
for c in right {
if let Err(i) = left.binary_search(c) {
left.insert(i, *c);
}
}
}
fn remove_chars(left: &mut Vec<char>, right: &mut Vec<char>, mutate_right: bool) -> Vec<char> {
let mut result = Vec::new();
right.retain(|right_char| {
if let Some(index) = left.iter().position(|left_char| *left_char == *right_char) {
left.remove(index);
result.push(*right_char);
!mutate_right
} else {
true
}
});
result
}
struct SetComparison {
left_only: bool,
common: bool,
right_only: bool,
}
fn compare_chars(left: &Vec<char>, right: &Vec<char>) -> SetComparison {
let mut result = SetComparison {
left_only: false,
common: false,
right_only: false,
};
let mut left = left.iter().cloned();
let mut right = right.iter().cloned();
let mut i = left.next();
let mut j = right.next();
while let (Some(left_char), Some(right_char)) = (i, j) {
if left_char < right_char {
i = left.next();
result.left_only = true;
} else if left_char > right_char {
j = right.next();
result.right_only = true;
} else {
i = left.next();
j = right.next();
result.common = true;
}
}
result
}
fn order_chars(chars: &Vec<char>, other_chars: &Vec<char>) -> Ordering {
if chars.is_empty() {
if other_chars.is_empty() {
Ordering::Equal
} else {
Ordering::Less
}
} else if other_chars.is_empty() {
Ordering::Greater
} else {
let cmp = chars.len().cmp(&other_chars.len());
if cmp != Ordering::Equal {
return cmp;
}
for (c, other_c) in chars.iter().zip(other_chars.iter()) {
let cmp = c.cmp(other_c);
if cmp != Ordering::Equal {
return cmp;
}
}
Ordering::Equal
}
}
impl Nfa {
pub fn new() -> Self {
Nfa { states: Vec::new() }
}
pub fn last_state_id(&self) -> u32 {
self.states.len() as u32 - 1
}
}
impl fmt::Debug for Nfa {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "Nfa {{ states: {{\n")?;
for (i, state) in self.states.iter().enumerate() {
write!(f, " {}: {:?},\n", i, state)?;
}
write!(f, "}} }}")?;
Ok(())
}
}
impl<'a> NfaCursor<'a> {
pub fn new(nfa: &'a Nfa, mut states: Vec<u32>) -> Self {
let mut result = Self {
nfa,
state_ids: Vec::new(),
};
result.add_states(&mut states);
result
}
pub fn reset(&mut self, mut states: Vec<u32>) {
self.state_ids.clear();
self.add_states(&mut states);
}
pub fn force_reset(&mut self, states: Vec<u32>) {
self.state_ids = states
}
pub fn transition_chars(&self) -> impl Iterator<Item = (&CharacterSet, bool)> {
self.raw_transitions().map(|t| (t.0, t.1))
}
pub fn transitions(&self) -> Vec<NfaTransition> {
Self::group_transitions(self.raw_transitions())
}
fn raw_transitions(&self) -> impl Iterator<Item = (&CharacterSet, bool, i32, u32)> {
self.state_ids.iter().filter_map(move |id| {
if let NfaState::Advance {
chars,
state_id,
precedence,
is_sep,
} = &self.nfa.states[*id as usize]
{
Some((chars, *is_sep, *precedence, *state_id))
} else {
None
}
})
}
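// Combine the raw transitions into transitions over disjoint character sets.
// Wherever two transitions' character sets intersect, the intersection becomes
// its own transition whose successor states are merged and whose precedence is
// the maximum of the two.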
fn group_transitions<'b>(
iter: impl Iterator<Item = (&'b CharacterSet, bool, i32, u32)>,
) -> Vec<NfaTransition> {
let mut result: Vec<NfaTransition> = Vec::new();
for (chars, is_sep, prec, state) in iter {
let mut chars = chars.clone();
let mut i = 0;
while i < result.len() && !chars.is_empty() {
let intersection = result[i].characters.remove_intersection(&mut chars);
if !intersection.is_empty() {
let mut intersection_states = result[i].states.clone();
if let Err(j) = intersection_states.binary_search(&state) {
intersection_states.insert(j, state);
}
let intersection_transition = NfaTransition {
characters: intersection,
is_separator: result[i].is_separator || is_sep,
precedence: max(result[i].precedence, prec),
states: intersection_states,
};
if result[i].characters.is_empty() {
result[i] = intersection_transition;
} else {
result.insert(i, intersection_transition);
i += 1;
}
}
i += 1;
}
if !chars.is_empty() {
result.push(NfaTransition {
characters: chars,
precedence: prec,
states: vec![state],
is_separator: is_sep,
});
}
}
result.sort_unstable_by(|a, b| a.characters.cmp(&b.characters));
result
}
pub fn completions(&self) -> impl Iterator<Item = (usize, i32)> + '_ {
self.state_ids.iter().filter_map(move |state_id| {
if let NfaState::Accept {
variable_index,
precedence,
} = self.nfa.states[*state_id as usize]
{
Some((variable_index, precedence))
} else {
None
}
})
}
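// Insert new states into the cursor's sorted list of state ids, expanding each
// `Split` state into both of its targets (an epsilon-closure over splits).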
pub fn add_states(&mut self, new_state_ids: &mut Vec<u32>) {
let mut i = 0;
while i < new_state_ids.len() {
let state_id = new_state_ids[i];
let state = &self.nfa.states[state_id as usize];
if let NfaState::Split(left, right) = state {
let mut has_left = false;
let mut has_right = false;
for new_state_id in new_state_ids.iter() {
if *new_state_id == *left {
has_left = true;
}
if *new_state_id == *right {
has_right = true;
}
}
if !has_left {
new_state_ids.push(*left);
}
if !has_right {
new_state_ids.push(*right);
}
} else if let Err(i) = self.state_ids.binary_search(&state_id) {
self.state_ids.insert(i, state_id);
}
i += 1;
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_group_transitions() {
let table = [
// overlapping character classes
(
vec![
(CharacterSet::empty().add_range('a', 'f'), false, 0, 1),
(CharacterSet::empty().add_range('d', 'i'), false, 1, 2),
],
vec![
NfaTransition {
characters: CharacterSet::empty().add_range('a', 'c'),
is_separator: false,
precedence: 0,
states: vec![1],
},
NfaTransition {
characters: CharacterSet::empty().add_range('d', 'f'),
is_separator: false,
precedence: 1,
states: vec![1, 2],
},
NfaTransition {
characters: CharacterSet::empty().add_range('g', 'i'),
is_separator: false,
precedence: 1,
states: vec![2],
},
],
),
// large character class followed by many individual characters
(
vec![
(CharacterSet::empty().add_range('a', 'z'), false, 0, 1),
(CharacterSet::empty().add_char('d'), false, 0, 2),
(CharacterSet::empty().add_char('i'), false, 0, 3),
(CharacterSet::empty().add_char('f'), false, 0, 4),
],
vec![
NfaTransition {
characters: CharacterSet::empty().add_char('d'),
is_separator: false,
precedence: 0,
states: vec![1, 2],
},
NfaTransition {
characters: CharacterSet::empty().add_char('f'),
is_separator: false,
precedence: 0,
states: vec![1, 4],
},
NfaTransition {
characters: CharacterSet::empty().add_char('i'),
is_separator: false,
precedence: 0,
states: vec![1, 3],
},
NfaTransition {
characters: CharacterSet::empty()
.add_range('a', 'c')
.add_char('e')
.add_range('g', 'h')
.add_range('j', 'z'),
is_separator: false,
precedence: 0,
states: vec![1],
},
],
),
// negated character class followed by an individual character
(
vec![
(CharacterSet::empty().add_char('0'), false, 0, 1),
(CharacterSet::empty().add_char('b'), false, 0, 2),
(
CharacterSet::empty().add_range('a', 'f').negate(),
false,
0,
3,
),
(CharacterSet::empty().add_char('c'), false, 0, 4),
],
vec![
NfaTransition {
characters: CharacterSet::empty().add_char('0'),
precedence: 0,
states: vec![1, 3],
is_separator: false,
},
NfaTransition {
characters: CharacterSet::empty().add_char('b'),
precedence: 0,
states: vec![2],
is_separator: false,
},
NfaTransition {
characters: CharacterSet::empty().add_char('c'),
precedence: 0,
states: vec![4],
is_separator: false,
},
NfaTransition {
characters: CharacterSet::empty()
.add_range('a', 'f')
.add_char('0')
.negate(),
precedence: 0,
states: vec![3],
is_separator: false,
},
],
),
// multiple negated character classes
(
vec![
(CharacterSet::Include(vec!['a']), false, 0, 1),
(CharacterSet::Exclude(vec!['a', 'b', 'c']), false, 0, 2),
(CharacterSet::Include(vec!['g']), false, 0, 6),
(CharacterSet::Exclude(vec!['d', 'e', 'f']), false, 0, 3),
(CharacterSet::Exclude(vec!['g', 'h', 'i']), false, 0, 4),
(CharacterSet::Include(vec!['g']), false, 0, 5),
],
vec![
NfaTransition {
characters: CharacterSet::Include(vec!['a']),
precedence: 0,
states: vec![1, 3, 4],
is_separator: false,
},
NfaTransition {
characters: CharacterSet::Include(vec!['g']),
precedence: 0,
states: vec![2, 3, 5, 6],
is_separator: false,
},
NfaTransition {
characters: CharacterSet::Include(vec!['b', 'c']),
precedence: 0,
states: vec![3, 4],
is_separator: false,
},
NfaTransition {
characters: CharacterSet::Include(vec!['h', 'i']),
precedence: 0,
states: vec![2, 3],
is_separator: false,
},
NfaTransition {
characters: CharacterSet::Include(vec!['d', 'e', 'f']),
precedence: 0,
states: vec![2, 4],
is_separator: false,
},
NfaTransition {
characters: CharacterSet::Exclude(vec![
'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i',
]),
precedence: 0,
states: vec![2, 3, 4],
is_separator: false,
},
],
),
];
for row in table.iter() {
assert_eq!(
NfaCursor::group_transitions(row.0.iter().map(|(c, sep, p, s)| (c, *sep, *p, *s))),
row.1
);
}
}
#[test]
fn test_character_set_remove_intersection() {
// A whitelist and an overlapping whitelist.
// Both sets contain 'c', 'd', 'e', and 'f'
let mut a = CharacterSet::empty().add_range('a', 'f');
let mut b = CharacterSet::empty().add_range('c', 'h');
assert_eq!(
a.remove_intersection(&mut b),
CharacterSet::empty().add_range('c', 'f')
);
assert_eq!(a, CharacterSet::empty().add_range('a', 'b'));
assert_eq!(b, CharacterSet::empty().add_range('g', 'h'));
let mut a = CharacterSet::empty().add_range('a', 'f');
let mut b = CharacterSet::empty().add_range('c', 'h');
assert_eq!(
b.remove_intersection(&mut a),
CharacterSet::empty().add_range('c', 'f')
);
assert_eq!(a, CharacterSet::empty().add_range('a', 'b'));
assert_eq!(b, CharacterSet::empty().add_range('g', 'h'));
// A whitelist and a larger whitelist.
let mut a = CharacterSet::empty().add_char('c');
let mut b = CharacterSet::empty().add_range('a', 'e');
assert_eq!(
a.remove_intersection(&mut b),
CharacterSet::empty().add_char('c')
);
assert_eq!(a, CharacterSet::empty());
assert_eq!(
b,
CharacterSet::empty()
.add_range('a', 'b')
.add_range('d', 'e')
);
let mut a = CharacterSet::empty().add_char('c');
let mut b = CharacterSet::empty().add_range('a', 'e');
assert_eq!(
b.remove_intersection(&mut a),
CharacterSet::empty().add_char('c')
);
assert_eq!(a, CharacterSet::empty());
assert_eq!(
b,
CharacterSet::empty()
.add_range('a', 'b')
.add_range('d', 'e')
);
// A whitelist and an intersecting blacklist.
// Both sets contain 'e', 'f', and 'm'
let mut a = CharacterSet::empty()
.add_range('c', 'h')
.add_range('k', 'm');
let mut b = CharacterSet::empty()
.add_range('a', 'd')
.add_range('g', 'l')
.negate();
assert_eq!(
a.remove_intersection(&mut b),
CharacterSet::Include(vec!['e', 'f', 'm'])
);
assert_eq!(a, CharacterSet::Include(vec!['c', 'd', 'g', 'h', 'k', 'l']));
assert_eq!(b, CharacterSet::empty().add_range('a', 'm').negate());
let mut a = CharacterSet::empty()
.add_range('c', 'h')
.add_range('k', 'm');
let mut b = CharacterSet::empty()
.add_range('a', 'd')
.add_range('g', 'l')
.negate();
assert_eq!(
b.remove_intersection(&mut a),
CharacterSet::Include(vec!['e', 'f', 'm'])
);
assert_eq!(a, CharacterSet::Include(vec!['c', 'd', 'g', 'h', 'k', 'l']));
assert_eq!(b, CharacterSet::empty().add_range('a', 'm').negate());
// A blacklist and an overlapping blacklist.
// Both sets exclude 'c', 'd', and 'e'
let mut a = CharacterSet::empty().add_range('a', 'e').negate();
let mut b = CharacterSet::empty().add_range('c', 'h').negate();
assert_eq!(
a.remove_intersection(&mut b),
CharacterSet::empty().add_range('a', 'h').negate(),
);
assert_eq!(a, CharacterSet::Include(vec!['f', 'g', 'h']));
assert_eq!(b, CharacterSet::Include(vec!['a', 'b']));
// A blacklist and a larger blacklist.
let mut a = CharacterSet::empty().add_range('b', 'c').negate();
let mut b = CharacterSet::empty().add_range('a', 'd').negate();
assert_eq!(
a.remove_intersection(&mut b),
CharacterSet::empty().add_range('a', 'd').negate(),
);
assert_eq!(a, CharacterSet::empty().add_char('a').add_char('d'));
assert_eq!(b, CharacterSet::empty());
}
#[test]
fn test_character_set_does_intersect() {
let (a, b) = (CharacterSet::empty(), CharacterSet::empty());
assert!(!a.does_intersect(&b));
assert!(!b.does_intersect(&a));
let (a, b) = (
CharacterSet::empty().add_char('a'),
CharacterSet::empty().add_char('a'),
);
assert!(a.does_intersect(&b));
assert!(b.does_intersect(&a));
let (a, b) = (
CharacterSet::empty().add_char('b'),
CharacterSet::empty().add_char('a').add_char('c'),
);
assert!(!a.does_intersect(&b));
assert!(!b.does_intersect(&a));
let (a, b) = (
CharacterSet::Include(vec!['b']),
CharacterSet::Exclude(vec!['a', 'b', 'c']),
);
assert!(!a.does_intersect(&b));
assert!(!b.does_intersect(&a));
let (a, b) = (
CharacterSet::Include(vec!['b']),
CharacterSet::Exclude(vec!['a', 'c']),
);
assert!(a.does_intersect(&b));
assert!(b.does_intersect(&a));
let (a, b) = (
CharacterSet::Exclude(vec!['a']),
CharacterSet::Exclude(vec!['a']),
);
assert!(a.does_intersect(&b));
assert!(b.does_intersect(&a));
}
}

cli/src/parse_grammar.rs Normal file
@@ -0,0 +1,167 @@
use serde_json::{Map, Value};
use crate::error::Result;
use crate::grammars::{InputGrammar, Variable, VariableType};
use crate::rules::Rule;
#[derive(Deserialize)]
#[serde(tag = "type")]
#[allow(non_camel_case_types)]
enum RuleJSON {
ALIAS {
content: Box<RuleJSON>,
named: bool,
value: String,
},
BLANK,
STRING {
value: String,
},
PATTERN {
value: String,
},
SYMBOL {
name: String,
},
CHOICE {
members: Vec<RuleJSON>,
},
SEQ {
members: Vec<RuleJSON>,
},
REPEAT {
content: Box<RuleJSON>,
},
REPEAT1 {
content: Box<RuleJSON>,
},
PREC_DYNAMIC {
value: i32,
content: Box<RuleJSON>,
},
PREC_LEFT {
value: i32,
content: Box<RuleJSON>,
},
PREC_RIGHT {
value: i32,
content: Box<RuleJSON>,
},
PREC {
value: i32,
content: Box<RuleJSON>,
},
TOKEN {
content: Box<RuleJSON>,
},
IMMEDIATE_TOKEN {
content: Box<RuleJSON>,
},
}
#[derive(Deserialize)]
struct GrammarJSON {
name: String,
rules: Map<String, Value>,
conflicts: Option<Vec<Vec<String>>>,
externals: Option<Vec<RuleJSON>>,
extras: Option<Vec<RuleJSON>>,
inline: Option<Vec<String>>,
word: Option<String>,
}
pub(crate) fn parse_grammar(input: &str) -> Result<InputGrammar> {
let grammar_json: GrammarJSON = serde_json::from_str(&input)?;
let mut variables = Vec::with_capacity(grammar_json.rules.len());
for (name, value) in grammar_json.rules {
variables.push(Variable {
name: name.to_owned(),
kind: VariableType::Named,
rule: parse_rule(serde_json::from_value(value)?),
})
}
let extra_tokens = grammar_json.extras
.unwrap_or(Vec::new())
.into_iter()
.map(parse_rule)
.collect();
let external_tokens = grammar_json.externals
.unwrap_or(Vec::new())
.into_iter()
.map(parse_rule)
.collect();
let expected_conflicts = grammar_json.conflicts
.unwrap_or(Vec::new());
let variables_to_inline = grammar_json.inline
.unwrap_or(Vec::new());
Ok(InputGrammar {
name: grammar_json.name,
word_token: grammar_json.word,
variables,
extra_tokens,
expected_conflicts,
external_tokens,
variables_to_inline,
})
}
fn parse_rule(json: RuleJSON) -> Rule {
match json {
RuleJSON::ALIAS { content, value, named } => Rule::alias(parse_rule(*content), value, named),
RuleJSON::BLANK => Rule::Blank,
RuleJSON::STRING { value } => Rule::String(value),
RuleJSON::PATTERN { value } => Rule::Pattern(value),
RuleJSON::SYMBOL { name } => Rule::NamedSymbol(name),
RuleJSON::CHOICE { members } => Rule::choice(members.into_iter().map(parse_rule).collect()),
RuleJSON::SEQ { members } => Rule::seq(members.into_iter().map(parse_rule).collect()),
RuleJSON::REPEAT1 { content } => Rule::repeat(parse_rule(*content)),
RuleJSON::REPEAT { content } => Rule::choice(vec![Rule::repeat(parse_rule(*content)), Rule::Blank]),
RuleJSON::PREC { value, content } => Rule::prec(value, parse_rule(*content)),
RuleJSON::PREC_LEFT { value, content } => Rule::prec_left(value, parse_rule(*content)),
RuleJSON::PREC_RIGHT { value, content } => Rule::prec_right(value, parse_rule(*content)),
RuleJSON::PREC_DYNAMIC { value, content } => Rule::prec_dynamic(value, parse_rule(*content)),
RuleJSON::TOKEN { content } => Rule::token(parse_rule(*content)),
RuleJSON::IMMEDIATE_TOKEN { content } => Rule::immediate_token(parse_rule(*content)),
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_parse_grammar() {
let grammar = parse_grammar(r#"{
"name": "my_lang",
"rules": {
"file": {
"type": "REPEAT1",
"content": {
"type": "SYMBOL",
"name": "statement"
}
},
"statement": {
"type": "STRING",
"value": "foo"
}
}
}"#).unwrap();
assert_eq!(grammar.name, "my_lang");
assert_eq!(grammar.variables, vec![
Variable {
name: "file".to_string(),
kind: VariableType::Named,
rule: Rule::repeat(Rule::NamedSymbol("statement".to_string()))
},
Variable {
name: "statement".to_string(),
kind: VariableType::Named,
rule: Rule::String("foo".to_string())
},
]);
}
}

@@ -0,0 +1,241 @@
use super::ExtractedSyntaxGrammar;
use crate::grammars::{Variable, VariableType};
use crate::rules::{Rule, Symbol};
use hashbrown::HashMap;
use std::mem;
struct Expander {
variable_name: String,
repeat_count_in_variable: usize,
preceding_symbol_count: usize,
auxiliary_variables: Vec<Variable>,
existing_repeats: HashMap<Rule, Symbol>,
}
impl Expander {
fn expand_variable(&mut self, variable: &mut Variable) {
self.variable_name.clear();
self.variable_name.push_str(&variable.name);
self.repeat_count_in_variable = 0;
let mut rule = Rule::Blank;
mem::swap(&mut rule, &mut variable.rule);
variable.rule = self.expand_rule(&rule);
}
fn expand_rule(&mut self, rule: &Rule) -> Rule {
match rule {
Rule::Choice(elements) => Rule::Choice(
elements
.iter()
.map(|element| self.expand_rule(element))
.collect(),
),
Rule::Seq(elements) => Rule::Seq(
elements
.iter()
.map(|element| self.expand_rule(element))
.collect(),
),
Rule::Repeat(content) => {
let inner_rule = self.expand_rule(content);
if let Some(existing_symbol) = self.existing_repeats.get(&inner_rule) {
return Rule::Symbol(*existing_symbol);
}
self.repeat_count_in_variable += 1;
let rule_name = format!(
"{}_repeat{}",
self.variable_name, self.repeat_count_in_variable
);
let repeat_symbol = Symbol::non_terminal(
self.preceding_symbol_count + self.auxiliary_variables.len(),
);
self.existing_repeats
.insert(inner_rule.clone(), repeat_symbol);
self.auxiliary_variables.push(Variable {
name: rule_name,
kind: VariableType::Auxiliary,
rule: Rule::Choice(vec![
Rule::Seq(vec![
Rule::Symbol(repeat_symbol),
Rule::Symbol(repeat_symbol),
]),
inner_rule,
]),
});
Rule::Symbol(repeat_symbol)
}
Rule::Metadata { rule, params } => Rule::Metadata {
rule: Box::new(self.expand_rule(rule)),
params: params.clone(),
},
_ => rule.clone(),
}
}
}
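/// Replace each `Rule::Repeat` in the grammar with a new auxiliary variable
/// named `<rule>_repeat<n>`, defined as a choice between the repeated content
/// and a pair of recursive references to itself. Identical repeated content is
/// expanded only once and the resulting symbol is reused.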
pub(super) fn expand_repeats(mut grammar: ExtractedSyntaxGrammar) -> ExtractedSyntaxGrammar {
let mut expander = Expander {
variable_name: String::new(),
repeat_count_in_variable: 0,
preceding_symbol_count: grammar.variables.len(),
auxiliary_variables: Vec::new(),
existing_repeats: HashMap::new(),
};
for mut variable in grammar.variables.iter_mut() {
expander.expand_variable(&mut variable);
}
grammar
.variables
.extend(expander.auxiliary_variables.into_iter());
grammar
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_basic_repeat_expansion() {
// Repeats nested inside of sequences and choices are expanded.
let grammar = expand_repeats(build_grammar(vec![Variable::named(
"rule0",
Rule::seq(vec![
Rule::terminal(10),
Rule::choice(vec![
Rule::repeat(Rule::terminal(11)),
Rule::repeat(Rule::terminal(12)),
]),
Rule::terminal(13),
]),
)]));
assert_eq!(
grammar.variables,
vec![
Variable::named(
"rule0",
Rule::seq(vec![
Rule::terminal(10),
Rule::choice(vec![Rule::non_terminal(1), Rule::non_terminal(2),]),
Rule::terminal(13),
])
),
Variable::auxiliary(
"rule0_repeat1",
Rule::choice(vec![
Rule::seq(vec![Rule::non_terminal(1), Rule::non_terminal(1),]),
Rule::terminal(11),
])
),
Variable::auxiliary(
"rule0_repeat2",
Rule::choice(vec![
Rule::seq(vec![Rule::non_terminal(2), Rule::non_terminal(2),]),
Rule::terminal(12),
])
),
]
);
}
#[test]
fn test_repeat_deduplication() {
// Terminal 4 appears inside of a repeat in three different places.
let grammar = expand_repeats(build_grammar(vec![
Variable::named(
"rule0",
Rule::choice(vec![
Rule::seq(vec![Rule::terminal(1), Rule::repeat(Rule::terminal(4))]),
Rule::seq(vec![Rule::terminal(2), Rule::repeat(Rule::terminal(4))]),
]),
),
Variable::named(
"rule1",
Rule::seq(vec![Rule::terminal(3), Rule::repeat(Rule::terminal(4))]),
),
]));
// Only one auxiliary rule is created for repeating terminal 4.
assert_eq!(
grammar.variables,
vec![
Variable::named(
"rule0",
Rule::choice(vec![
Rule::seq(vec![Rule::terminal(1), Rule::non_terminal(2)]),
Rule::seq(vec![Rule::terminal(2), Rule::non_terminal(2)]),
])
),
Variable::named(
"rule1",
Rule::seq(vec![Rule::terminal(3), Rule::non_terminal(2),])
),
Variable::auxiliary(
"rule0_repeat1",
Rule::choice(vec![
Rule::seq(vec![Rule::non_terminal(2), Rule::non_terminal(2),]),
Rule::terminal(4),
])
)
]
);
}
#[test]
fn test_expansion_of_nested_repeats() {
let grammar = expand_repeats(build_grammar(vec![Variable::named(
"rule0",
Rule::seq(vec![
Rule::terminal(10),
Rule::repeat(Rule::seq(vec![
Rule::terminal(11),
Rule::repeat(Rule::terminal(12)),
])),
]),
)]));
assert_eq!(
grammar.variables,
vec![
Variable::named(
"rule0",
Rule::seq(vec![Rule::terminal(10), Rule::non_terminal(2),])
),
Variable::auxiliary(
"rule0_repeat1",
Rule::choice(vec![
Rule::seq(vec![Rule::non_terminal(1), Rule::non_terminal(1),]),
Rule::terminal(12),
])
),
Variable::auxiliary(
"rule0_repeat2",
Rule::choice(vec![
Rule::seq(vec![Rule::non_terminal(2), Rule::non_terminal(2),]),
Rule::seq(vec![Rule::terminal(11), Rule::non_terminal(1),]),
])
),
]
);
}
fn build_grammar(variables: Vec<Variable>) -> ExtractedSyntaxGrammar {
ExtractedSyntaxGrammar {
variables,
extra_tokens: Vec::new(),
external_tokens: Vec::new(),
expected_conflicts: Vec::new(),
variables_to_inline: Vec::new(),
word_token: None,
}
}
}

@@ -0,0 +1,611 @@
use super::ExtractedLexicalGrammar;
use crate::error::{Error, Result};
use crate::grammars::{LexicalGrammar, LexicalVariable};
use crate::nfa::{CharacterSet, Nfa, NfaState};
use crate::rules::Rule;
use regex_syntax::ast::{
parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, RepetitionKind, RepetitionRange,
};
use std::i32;
struct NfaBuilder {
nfa: Nfa,
is_sep: bool,
precedence_stack: Vec<i32>,
}
fn get_implicit_precedence(rule: &Rule) -> i32 {
match rule {
Rule::String(_) => 1,
Rule::Metadata { rule, params } => {
if params.is_main_token {
get_implicit_precedence(rule) + 2
} else {
get_implicit_precedence(rule)
}
}
_ => 0,
}
}
fn get_completion_precedence(rule: &Rule) -> i32 {
match rule {
Rule::Metadata { params, .. } => params.precedence.unwrap_or(0),
_ => 0,
}
}
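/// Compile each lexical variable's rule into NFA states. States are built in
/// reverse: an Accept state is pushed first and the rule is expanded from its
/// end toward its beginning, so each variable's `start_state` is the last state
/// pushed. For non-immediate tokens, the separator rule is expanded in front of
/// the token so that separators can be skipped before it.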
pub(crate) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result<LexicalGrammar> {
let mut builder = NfaBuilder {
nfa: Nfa::new(),
is_sep: true,
precedence_stack: vec![0],
};
let separator_rule = if grammar.separators.len() > 0 {
grammar.separators.push(Rule::Blank);
Rule::repeat(Rule::choice(grammar.separators))
} else {
Rule::Blank
};
let mut variables = Vec::new();
for (i, variable) in grammar.variables.into_iter().enumerate() {
let is_immediate_token = match &variable.rule {
Rule::Metadata { params, .. } => params.is_main_token,
_ => false,
};
builder.is_sep = false;
builder.nfa.states.push(NfaState::Accept {
variable_index: i,
precedence: get_completion_precedence(&variable.rule),
});
let last_state_id = builder.nfa.last_state_id();
builder
.expand_rule(&variable.rule, last_state_id)
.map_err(|Error(msg)| Error(format!("Rule {} {}", variable.name, msg)))?;
if !is_immediate_token {
builder.is_sep = true;
let last_state_id = builder.nfa.last_state_id();
builder.expand_rule(&separator_rule, last_state_id)?;
}
variables.push(LexicalVariable {
name: variable.name,
kind: variable.kind,
implicit_precedence: get_implicit_precedence(&variable.rule),
start_state: builder.nfa.last_state_id(),
});
}
Ok(LexicalGrammar {
nfa: builder.nfa,
variables,
})
}
impl NfaBuilder {
fn expand_rule(&mut self, rule: &Rule, mut next_state_id: u32) -> Result<bool> {
match rule {
Rule::Pattern(s) => {
let ast = parse::Parser::new()
.parse(&s)
.map_err(|e| Error(e.to_string()))?;
self.expand_regex(&ast, next_state_id)
}
Rule::String(s) => {
for c in s.chars().rev() {
self.push_advance(CharacterSet::empty().add_char(c), next_state_id);
next_state_id = self.nfa.last_state_id();
}
                Ok(!s.is_empty())
}
Rule::Choice(elements) => {
let mut alternative_state_ids = Vec::new();
for element in elements {
if self.expand_rule(element, next_state_id)? {
alternative_state_ids.push(self.nfa.last_state_id());
} else {
alternative_state_ids.push(next_state_id);
}
}
alternative_state_ids.sort_unstable();
alternative_state_ids.dedup();
alternative_state_ids.retain(|i| *i != self.nfa.last_state_id());
for alternative_state_id in alternative_state_ids {
self.push_split(alternative_state_id);
}
Ok(true)
}
Rule::Seq(elements) => {
let mut result = false;
for element in elements.into_iter().rev() {
if self.expand_rule(element, next_state_id)? {
result = true;
}
next_state_id = self.nfa.last_state_id();
}
Ok(result)
}
Rule::Repeat(rule) => {
self.nfa.states.push(NfaState::Accept {
variable_index: 0,
precedence: 0,
}); // Placeholder for split
let split_state_id = self.nfa.last_state_id();
if self.expand_rule(rule, split_state_id)? {
self.nfa.states[split_state_id as usize] =
NfaState::Split(self.nfa.last_state_id(), next_state_id);
Ok(true)
                } else {
                    // Nothing was expanded, so remove the placeholder state.
                    self.nfa.states.pop();
                    Ok(false)
                }
}
Rule::Metadata { rule, params } => {
if let Some(precedence) = params.precedence {
self.precedence_stack.push(precedence);
}
let result = self.expand_rule(rule, next_state_id);
if params.precedence.is_some() {
self.precedence_stack.pop();
}
result
}
Rule::Blank => Ok(false),
_ => Err(Error::grammar(&format!("Unexpected rule {:?}", rule))),
}
}
fn expand_regex(&mut self, ast: &Ast, mut next_state_id: u32) -> Result<bool> {
match ast {
Ast::Empty(_) => Ok(false),
Ast::Flags(_) => Err(Error::regex("Flags are not supported")),
Ast::Literal(literal) => {
self.push_advance(CharacterSet::Include(vec![literal.c]), next_state_id);
Ok(true)
}
Ast::Dot(_) => {
self.push_advance(CharacterSet::Exclude(vec!['\n']), next_state_id);
Ok(true)
}
Ast::Assertion(_) => Err(Error::regex("Assertions are not supported")),
Ast::Class(class) => match class {
Class::Unicode(_) => {
Err(Error::regex("Unicode character classes are not supported"))
}
Class::Perl(class) => {
let mut chars = self.expand_perl_character_class(&class.kind);
if class.negated {
chars = chars.negate();
}
self.push_advance(chars, next_state_id);
Ok(true)
}
Class::Bracketed(class) => match &class.kind {
ClassSet::Item(item) => {
let mut chars = self.expand_character_class(&item)?;
if class.negated {
chars = chars.negate();
}
self.push_advance(chars, next_state_id);
Ok(true)
}
ClassSet::BinaryOp(_) => Err(Error::regex(
"Binary operators in character classes aren't supported",
)),
},
},
Ast::Repetition(repetition) => match repetition.op.kind {
RepetitionKind::ZeroOrOne => {
self.expand_zero_or_one(&repetition.ast, next_state_id)
}
RepetitionKind::OneOrMore => {
self.expand_one_or_more(&repetition.ast, next_state_id)
}
RepetitionKind::ZeroOrMore => {
self.expand_zero_or_more(&repetition.ast, next_state_id)
}
RepetitionKind::Range(RepetitionRange::Exactly(count)) => {
self.expand_count(&repetition.ast, count, next_state_id)
}
                RepetitionKind::Range(RepetitionRange::AtLeast(min)) => {
                    // `a{n,}` is built (in reverse) as `a{n}` chained into `a*`,
                    // so the required repetitions must advance to the loop's
                    // entry state, not directly to `next_state_id`.
                    if self.expand_zero_or_more(&repetition.ast, next_state_id)? {
                        self.expand_count(&repetition.ast, min, self.nfa.last_state_id())
                    } else {
                        Ok(false)
                    }
                }
RepetitionKind::Range(RepetitionRange::Bounded(min, max)) => {
let mut result = self.expand_count(&repetition.ast, min, next_state_id)?;
for _ in min..max {
if result {
next_state_id = self.nfa.last_state_id();
}
if self.expand_zero_or_one(&repetition.ast, next_state_id)? {
result = true;
}
}
Ok(result)
}
},
            Ast::Group(group) => self.expand_regex(&group.ast, next_state_id),
Ast::Alternation(alternation) => {
let mut alternative_state_ids = Vec::new();
for ast in alternation.asts.iter() {
if self.expand_regex(&ast, next_state_id)? {
alternative_state_ids.push(self.nfa.last_state_id());
} else {
alternative_state_ids.push(next_state_id);
}
}
alternative_state_ids.sort_unstable();
alternative_state_ids.dedup();
alternative_state_ids.retain(|i| *i != self.nfa.last_state_id());
for alternative_state_id in alternative_state_ids {
self.push_split(alternative_state_id);
}
Ok(true)
}
Ast::Concat(concat) => {
let mut result = false;
for ast in concat.asts.iter().rev() {
if self.expand_regex(&ast, next_state_id)? {
result = true;
next_state_id = self.nfa.last_state_id();
}
}
Ok(result)
}
}
}
fn expand_one_or_more(&mut self, ast: &Ast, next_state_id: u32) -> Result<bool> {
self.nfa.states.push(NfaState::Accept {
variable_index: 0,
precedence: 0,
}); // Placeholder for split
let split_state_id = self.nfa.last_state_id();
if self.expand_regex(&ast, split_state_id)? {
self.nfa.states[split_state_id as usize] =
NfaState::Split(self.nfa.last_state_id(), next_state_id);
Ok(true)
} else {
self.nfa.states.pop();
Ok(false)
}
}
fn expand_zero_or_one(&mut self, ast: &Ast, next_state_id: u32) -> Result<bool> {
if self.expand_regex(ast, next_state_id)? {
self.push_split(next_state_id);
Ok(true)
} else {
Ok(false)
}
}
fn expand_zero_or_more(&mut self, ast: &Ast, next_state_id: u32) -> Result<bool> {
if self.expand_one_or_more(&ast, next_state_id)? {
self.push_split(next_state_id);
Ok(true)
} else {
Ok(false)
}
}
fn expand_count(&mut self, ast: &Ast, count: u32, mut next_state_id: u32) -> Result<bool> {
let mut result = false;
for _ in 0..count {
if self.expand_regex(ast, next_state_id)? {
result = true;
next_state_id = self.nfa.last_state_id();
}
}
Ok(result)
}
fn expand_character_class(&self, item: &ClassSetItem) -> Result<CharacterSet> {
match item {
ClassSetItem::Empty(_) => Ok(CharacterSet::Include(Vec::new())),
ClassSetItem::Literal(literal) => Ok(CharacterSet::Include(vec![literal.c])),
ClassSetItem::Range(range) => {
Ok(CharacterSet::empty().add_range(range.start.c, range.end.c))
}
ClassSetItem::Union(union) => {
let mut result = CharacterSet::empty();
for item in &union.items {
result = result.add(&self.expand_character_class(&item)?);
}
Ok(result)
}
ClassSetItem::Perl(class) => Ok(self.expand_perl_character_class(&class.kind)),
_ => Err(Error::regex(&format!(
"Unsupported character class syntax {:?}",
item
))),
}
}
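    // Perl-style classes (`\d`, `\s`, `\w`) are approximated by their ASCII ranges.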
fn expand_perl_character_class(&self, item: &ClassPerlKind) -> CharacterSet {
match item {
ClassPerlKind::Digit => CharacterSet::empty().add_range('0', '9'),
ClassPerlKind::Space => CharacterSet::empty()
.add_char(' ')
.add_char('\t')
.add_char('\r')
.add_char('\n'),
ClassPerlKind::Word => CharacterSet::empty()
.add_char('_')
.add_range('A', 'Z')
.add_range('a', 'z')
.add_range('0', '9'),
}
}
fn push_advance(&mut self, chars: CharacterSet, state_id: u32) {
let precedence = *self.precedence_stack.last().unwrap();
self.nfa.states.push(NfaState::Advance {
chars,
state_id,
precedence,
is_sep: self.is_sep,
});
}
fn push_split(&mut self, state_id: u32) {
let last_state_id = self.nfa.last_state_id();
self.nfa
.states
.push(NfaState::Split(state_id, last_state_id));
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::grammars::Variable;
use crate::nfa::{NfaCursor, NfaTransition};
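    // Walks the NFA over `s` and returns the index of the longest,
    // highest-precedence matching variable along with the matched slice.
    // Byte offsets and character counts are used interchangeably here,
    // which is safe because all of the test inputs are ASCII.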
fn simulate_nfa<'a>(grammar: &'a LexicalGrammar, s: &'a str) -> Option<(usize, &'a str)> {
let start_states = grammar.variables.iter().map(|v| v.start_state).collect();
let mut cursor = NfaCursor::new(&grammar.nfa, start_states);
let mut result = None;
let mut result_precedence = i32::MIN;
let mut start_char = 0;
let mut end_char = 0;
for c in s.chars() {
for (id, precedence) in cursor.completions() {
if result.is_none() || result_precedence <= precedence {
result = Some((id, &s[start_char..end_char]));
result_precedence = precedence;
}
}
if let Some(NfaTransition {
states,
is_separator,
..
}) = cursor
.transitions()
.into_iter()
.find(|t| t.characters.contains(c) && t.precedence >= result_precedence)
{
cursor.reset(states);
end_char += 1;
if is_separator {
start_char = end_char;
}
} else {
break;
}
}
for (id, precedence) in cursor.completions() {
if result.is_none() || result_precedence <= precedence {
result = Some((id, &s[start_char..end_char]));
result_precedence = precedence;
}
}
result
}
#[test]
fn test_rule_expansion() {
struct Row {
rules: Vec<Rule>,
separators: Vec<Rule>,
examples: Vec<(&'static str, Option<(usize, &'static str)>)>,
}
let table = [
// regex with sequences and alternatives
Row {
rules: vec![Rule::pattern("(a|b|c)d(e|f|g)h?")],
separators: vec![],
examples: vec![
("ade1", Some((0, "ade"))),
("bdf1", Some((0, "bdf"))),
("bdfh1", Some((0, "bdfh"))),
("ad1", None),
],
},
// regex with repeats
Row {
rules: vec![Rule::pattern("a*")],
separators: vec![],
examples: vec![("aaa1", Some((0, "aaa"))), ("b", Some((0, "")))],
},
// regex with repeats in sequences
Row {
rules: vec![Rule::pattern("a((bc)+|(de)*)f")],
separators: vec![],
examples: vec![
("af1", Some((0, "af"))),
("adedef1", Some((0, "adedef"))),
("abcbcbcf1", Some((0, "abcbcbcf"))),
("a", None),
],
},
// regex with character ranges
Row {
rules: vec![Rule::pattern("[a-fA-F0-9]+")],
separators: vec![],
examples: vec![("A1ff0.", Some((0, "A1ff0")))],
},
// regex with perl character classes
Row {
rules: vec![Rule::pattern("\\w\\d\\s")],
separators: vec![],
examples: vec![("_0 ", Some((0, "_0 ")))],
},
// string
Row {
rules: vec![Rule::string("abc")],
separators: vec![],
examples: vec![("abcd", Some((0, "abc"))), ("ab", None)],
},
// complex rule containing strings and regexes
Row {
rules: vec![Rule::repeat(Rule::seq(vec![
Rule::string("{"),
Rule::pattern("[a-f]+"),
Rule::string("}"),
]))],
separators: vec![],
examples: vec![
("{a}{", Some((0, "{a}"))),
("{a}{d", Some((0, "{a}"))),
("ab", None),
],
},
// longest match rule
Row {
rules: vec![
Rule::pattern("a|bc"),
Rule::pattern("aa"),
Rule::pattern("bcd"),
],
separators: vec![],
examples: vec![
("a.", Some((0, "a"))),
("bc.", Some((0, "bc"))),
("aa.", Some((1, "aa"))),
("bcd?", Some((2, "bcd"))),
("b.", None),
("c.", None),
],
},
// regex with an alternative including the empty string
Row {
rules: vec![Rule::pattern("a(b|)+c")],
separators: vec![],
examples: vec![
("ac.", Some((0, "ac"))),
("abc.", Some((0, "abc"))),
("abbc.", Some((0, "abbc"))),
],
},
// separators
Row {
rules: vec![Rule::pattern("[a-f]+")],
separators: vec![Rule::string("\\\n"), Rule::pattern("\\s")],
examples: vec![
(" a", Some((0, "a"))),
(" \nb", Some((0, "b"))),
(" \\a", None),
(" \\\na", Some((0, "a"))),
],
},
// shorter tokens with higher precedence
Row {
rules: vec![
Rule::prec(2, Rule::pattern("abc")),
Rule::prec(1, Rule::pattern("ab[cd]e")),
Rule::pattern("[a-e]+"),
],
separators: vec![Rule::string("\\\n"), Rule::pattern("\\s")],
examples: vec![
("abceef", Some((0, "abc"))),
("abdeef", Some((1, "abde"))),
("aeeeef", Some((2, "aeeee"))),
],
},
// immediate tokens with higher precedence
Row {
rules: vec![
Rule::prec(1, Rule::pattern("[^a]+")),
Rule::immediate_token(Rule::prec(2, Rule::pattern("[^ab]+"))),
],
separators: vec![Rule::pattern("\\s")],
examples: vec![("cccb", Some((1, "ccc")))],
},
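            // sequences containing choices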
Row {
rules: vec![Rule::seq(vec![
Rule::string("a"),
Rule::choice(vec![Rule::string("b"), Rule::string("c")]),
Rule::string("d"),
])],
separators: vec![],
examples: vec![
("abd", Some((0, "abd"))),
("acd", Some((0, "acd"))),
("abc", None),
("ad", None),
("d", None),
("a", None),
],
},
// nested choices within sequences
Row {
rules: vec![Rule::seq(vec![
Rule::pattern("[0-9]+"),
Rule::choice(vec![
Rule::Blank,
Rule::choice(vec![Rule::seq(vec![
Rule::choice(vec![Rule::string("e"), Rule::string("E")]),
Rule::choice(vec![
Rule::Blank,
Rule::choice(vec![Rule::string("+"), Rule::string("-")]),
]),
Rule::pattern("[0-9]+"),
])]),
]),
])],
separators: vec![],
examples: vec![
("12", Some((0, "12"))),
("12e", Some((0, "12"))),
("12g", Some((0, "12"))),
("12e3", Some((0, "12e3"))),
("12e+", Some((0, "12"))),
("12E+34 +", Some((0, "12E+34"))),
("12e34", Some((0, "12e34"))),
],
},
];
for Row {
rules,
separators,
examples,
} in &table
{
let grammar = expand_tokens(ExtractedLexicalGrammar {
separators: separators.clone(),
variables: rules
.into_iter()
.map(|rule| Variable::named("", rule.clone()))
.collect(),
})
.unwrap();
for (haystack, needle) in examples.iter() {
assert_eq!(simulate_nfa(&grammar, haystack), *needle);
}
}
}
}

View file

@ -0,0 +1,199 @@
use crate::rules::{Alias, AliasMap, Symbol, SymbolType};
use crate::grammars::{LexicalGrammar, SyntaxGrammar};
#[derive(Clone, Default)]
struct SymbolStatus {
alias: Option<Alias>,
conflicting: bool,
}
pub(super) fn extract_simple_aliases(
syntax_grammar: &mut SyntaxGrammar,
lexical_grammar: &LexicalGrammar
) -> AliasMap {
// Determine which symbols in the grammars are *always* aliased to a single name.
let mut terminal_status_list = vec![SymbolStatus::default(); lexical_grammar.variables.len()];
let mut non_terminal_status_list = vec![SymbolStatus::default(); syntax_grammar.variables.len()];
let mut external_status_list = vec![SymbolStatus::default(); syntax_grammar.external_tokens.len()];
for variable in syntax_grammar.variables.iter() {
for production in variable.productions.iter() {
for step in production.steps.iter() {
                let status = match step.symbol {
Symbol { kind: SymbolType::External, index} => &mut external_status_list[index],
Symbol { kind: SymbolType::NonTerminal, index} => &mut non_terminal_status_list[index],
Symbol { kind: SymbolType::Terminal, index} => &mut terminal_status_list[index],
Symbol { kind: SymbolType::End, .. } => panic!("Unexpected end token"),
};
if step.alias.is_none() {
status.alias = None;
status.conflicting = true;
}
if !status.conflicting {
if status.alias.is_none() {
status.alias = step.alias.clone();
} else if status.alias != step.alias {
status.alias = None;
status.conflicting = true;
}
}
}
}
}
// Remove the aliases for those symbols.
for variable in syntax_grammar.variables.iter_mut() {
for production in variable.productions.iter_mut() {
for step in production.steps.iter_mut() {
let status = match step.symbol {
Symbol { kind: SymbolType::External, index} => &external_status_list[index],
Symbol { kind: SymbolType::NonTerminal, index} => &non_terminal_status_list[index],
Symbol { kind: SymbolType::Terminal, index} => &terminal_status_list[index],
Symbol { kind: SymbolType::End, .. } => panic!("Unexpected end token"),
};
if status.alias.is_some() {
step.alias = None;
}
}
}
}
// Populate a map of the symbols to their aliases.
let mut result = AliasMap::new();
for (i, status) in terminal_status_list.into_iter().enumerate() {
if let Some(alias) = status.alias {
result.insert(Symbol::terminal(i), alias);
}
}
for (i, status) in non_terminal_status_list.into_iter().enumerate() {
if let Some(alias) = status.alias {
result.insert(Symbol::non_terminal(i), alias);
}
}
for (i, status) in external_status_list.into_iter().enumerate() {
if let Some(alias) = status.alias {
result.insert(Symbol::external(i), alias);
}
}
result
}
#[cfg(test)]
mod tests {
use super::*;
use crate::grammars::{LexicalVariable, SyntaxVariable, VariableType, Production, ProductionStep};
use crate::nfa::Nfa;
#[test]
fn test_extract_simple_aliases() {
let mut syntax_grammar = SyntaxGrammar {
variables: vec![
SyntaxVariable {
name: "v1".to_owned(),
kind: VariableType::Named,
productions: vec![
Production {
dynamic_precedence: 0,
steps: vec![
ProductionStep::new(Symbol::terminal(0)).with_alias("a1", true),
ProductionStep::new(Symbol::terminal(1)).with_alias("a2", true),
ProductionStep::new(Symbol::terminal(2)).with_alias("a3", true),
],
},
],
},
SyntaxVariable {
name: "v2".to_owned(),
kind: VariableType::Named,
productions: vec![
Production {
dynamic_precedence: 0,
steps: vec![
// Token 0 is always aliased as "a1".
ProductionStep::new(Symbol::terminal(0)).with_alias("a1", true),
// Token 1 is aliased above, but not here.
ProductionStep::new(Symbol::terminal(1)),
// Token 2 is aliased differently than above.
ProductionStep::new(Symbol::terminal(2)).with_alias("a4", true),
],
},
],
},
],
extra_tokens: Vec::new(),
expected_conflicts: Vec::new(),
variables_to_inline: Vec::new(),
external_tokens: Vec::new(),
word_token: None,
};
let lexical_grammar = LexicalGrammar {
nfa: Nfa::new(),
variables: vec![
LexicalVariable {
name: "t1".to_string(),
kind: VariableType::Anonymous,
implicit_precedence: 0,
start_state: 0,
},
LexicalVariable {
name: "t2".to_string(),
kind: VariableType::Anonymous,
implicit_precedence: 0,
start_state: 0,
},
LexicalVariable {
name: "t3".to_string(),
kind: VariableType::Anonymous,
implicit_precedence: 0,
start_state: 0,
}
],
};
let simple_aliases = extract_simple_aliases(&mut syntax_grammar, &lexical_grammar);
assert_eq!(simple_aliases.len(), 1);
assert_eq!(simple_aliases[&Symbol::terminal(0)], Alias {
value: "a1".to_string(),
is_named: true,
});
assert_eq!(syntax_grammar.variables, vec![
SyntaxVariable {
name: "v1".to_owned(),
kind: VariableType::Named,
productions: vec![
Production {
dynamic_precedence: 0,
steps: vec![
// 'Simple' alias removed
ProductionStep::new(Symbol::terminal(0)),
// Other aliases unchanged
ProductionStep::new(Symbol::terminal(1)).with_alias("a2", true),
ProductionStep::new(Symbol::terminal(2)).with_alias("a3", true),
],
},
],
},
SyntaxVariable {
name: "v2".to_owned(),
kind: VariableType::Named,
productions: vec![
Production {
dynamic_precedence: 0,
steps: vec![
ProductionStep::new(Symbol::terminal(0)),
ProductionStep::new(Symbol::terminal(1)),
ProductionStep::new(Symbol::terminal(2)).with_alias("a4", true),
],
},
],
},
]);
}
}

View file

@ -0,0 +1,525 @@
use super::{ExtractedLexicalGrammar, ExtractedSyntaxGrammar, InternedGrammar};
use crate::error::{Error, Result};
use crate::grammars::{ExternalToken, Variable, VariableType};
use crate::rules::{MetadataParams, Rule, Symbol, SymbolType};
use hashbrown::HashMap;
use std::mem;
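// Move all terminal rules (strings, patterns, and rules wrapped in `token`)
// out of the syntax grammar and into a separate lexical grammar.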
pub(super) fn extract_tokens(
mut grammar: InternedGrammar,
) -> Result<(ExtractedSyntaxGrammar, ExtractedLexicalGrammar)> {
let mut extractor = TokenExtractor {
current_variable_name: String::new(),
current_variable_token_count: 0,
extracted_variables: Vec::new(),
extracted_usage_counts: Vec::new(),
};
    for variable in grammar.variables.iter_mut() {
        extractor.extract_tokens_in_variable(variable);
    }
    for variable in grammar.external_tokens.iter_mut() {
        extractor.extract_tokens_in_variable(variable);
    }
let mut lexical_variables = Vec::with_capacity(extractor.extracted_variables.len());
for variable in extractor.extracted_variables {
lexical_variables.push(Variable {
name: variable.name,
kind: variable.kind,
rule: variable.rule,
});
}
// If a variable's entire rule was extracted as a token and that token didn't
// appear within any other rule, then remove that variable from the syntax
// grammar, giving its name to the token in the lexical grammar. Any symbols
// that pointed to that variable will need to be updated to point to the
// variable in the lexical grammar. Symbols that pointed to later variables
// will need to have their indices decremented.
let mut variables = Vec::new();
let mut symbol_replacer = SymbolReplacer {
replacements: HashMap::new(),
};
for (i, variable) in grammar.variables.into_iter().enumerate() {
if let Rule::Symbol(Symbol {
kind: SymbolType::Terminal,
index,
}) = variable.rule
{
if i > 0 && extractor.extracted_usage_counts[index] == 1 {
                let lexical_variable = &mut lexical_variables[index];
lexical_variable.kind = variable.kind;
lexical_variable.name = variable.name;
symbol_replacer.replacements.insert(i, index);
continue;
}
}
variables.push(variable);
}
for variable in variables.iter_mut() {
variable.rule = symbol_replacer.replace_symbols_in_rule(&variable.rule);
}
let expected_conflicts = grammar
.expected_conflicts
.into_iter()
.map(|conflict| {
let mut result: Vec<_> = conflict
.iter()
.map(|symbol| symbol_replacer.replace_symbol(*symbol))
.collect();
result.sort_unstable();
result.dedup();
result
})
.collect();
let variables_to_inline = grammar
.variables_to_inline
.into_iter()
.map(|symbol| symbol_replacer.replace_symbol(symbol))
.collect();
let mut separators = Vec::new();
let mut extra_tokens = Vec::new();
for rule in grammar.extra_tokens {
if let Rule::Symbol(symbol) = rule {
let new_symbol = symbol_replacer.replace_symbol(symbol);
if new_symbol.is_non_terminal() {
return Err(Error(format!(
"Non-token symbol '{}' cannot be used as an extra token",
&variables[new_symbol.index].name
)));
} else {
extra_tokens.push(new_symbol);
}
} else {
if let Some(index) = lexical_variables.iter().position(|v| v.rule == rule) {
extra_tokens.push(Symbol::terminal(index));
} else {
separators.push(rule);
}
}
}
let mut external_tokens = Vec::new();
for external_token in grammar.external_tokens {
let rule = symbol_replacer.replace_symbols_in_rule(&external_token.rule);
if let Rule::Symbol(symbol) = rule {
if symbol.is_non_terminal() {
return Err(Error(format!(
"Rule '{}' cannot be used as both an external token and a non-terminal rule",
&variables[symbol.index].name,
)));
}
if symbol.is_external() {
external_tokens.push(ExternalToken {
name: external_token.name,
kind: external_token.kind,
corresponding_internal_token: None,
})
} else {
external_tokens.push(ExternalToken {
name: lexical_variables[symbol.index].name.clone(),
kind: external_token.kind,
corresponding_internal_token: Some(symbol),
})
}
        } else {
            return Err(Error(
                "Non-symbol rules cannot be used as external tokens".to_string(),
            ));
        }
}
let mut word_token = None;
if let Some(token) = grammar.word_token {
let token = symbol_replacer.replace_symbol(token);
if token.is_non_terminal() {
return Err(Error(format!(
"Non-terminal symbol '{}' cannot be used as the word token",
&variables[token.index].name
)));
}
word_token = Some(token);
}
Ok((
ExtractedSyntaxGrammar {
variables,
expected_conflicts,
extra_tokens,
variables_to_inline,
external_tokens,
word_token,
},
ExtractedLexicalGrammar {
variables: lexical_variables,
separators,
},
))
}
struct TokenExtractor {
current_variable_name: String,
current_variable_token_count: usize,
extracted_variables: Vec<Variable>,
extracted_usage_counts: Vec<usize>,
}
struct SymbolReplacer {
replacements: HashMap<usize, usize>,
}
impl TokenExtractor {
fn extract_tokens_in_variable(&mut self, variable: &mut Variable) {
self.current_variable_name.clear();
self.current_variable_name.push_str(&variable.name);
self.current_variable_token_count = 0;
let mut rule = Rule::Blank;
mem::swap(&mut rule, &mut variable.rule);
variable.rule = self.extract_tokens_in_rule(&rule);
}
fn extract_tokens_in_rule(&mut self, input: &Rule) -> Rule {
match input {
Rule::String(name) => self.extract_token(input, Some(name)).into(),
Rule::Pattern(..) => self.extract_token(input, None).into(),
Rule::Metadata { params, rule } => {
if params.is_token {
let mut params = params.clone();
params.is_token = false;
let mut string_value = None;
if let Rule::String(value) = rule.as_ref() {
string_value = Some(value);
}
let rule_to_extract = if params == MetadataParams::default() {
rule.as_ref()
} else {
input
};
self.extract_token(rule_to_extract, string_value).into()
} else {
Rule::Metadata {
params: params.clone(),
                    rule: Box::new(self.extract_tokens_in_rule(rule)),
}
}
}
Rule::Repeat(content) => Rule::Repeat(Box::new(self.extract_tokens_in_rule(content))),
Rule::Seq(elements) => Rule::Seq(
elements
.iter()
.map(|e| self.extract_tokens_in_rule(e))
.collect(),
),
Rule::Choice(elements) => Rule::Choice(
elements
.iter()
.map(|e| self.extract_tokens_in_rule(e))
.collect(),
),
_ => input.clone(),
}
}
fn extract_token(&mut self, rule: &Rule, string_value: Option<&String>) -> Symbol {
for (i, variable) in self.extracted_variables.iter_mut().enumerate() {
if variable.rule == *rule {
self.extracted_usage_counts[i] += 1;
return Symbol::terminal(i);
}
}
let index = self.extracted_variables.len();
let variable = if let Some(string_value) = string_value {
Variable {
name: string_value.clone(),
kind: VariableType::Anonymous,
rule: rule.clone()
}
} else {
self.current_variable_token_count += 1;
Variable {
name: format!(
"{}_token{}",
&self.current_variable_name, self.current_variable_token_count
),
kind: VariableType::Auxiliary,
rule: rule.clone(),
}
};
self.extracted_variables.push(variable);
self.extracted_usage_counts.push(1);
Symbol::terminal(index)
}
}
impl SymbolReplacer {
fn replace_symbols_in_rule(&mut self, rule: &Rule) -> Rule {
match rule {
Rule::Symbol(symbol) => self.replace_symbol(*symbol).into(),
Rule::Choice(elements) => Rule::Choice(
elements
.iter()
.map(|e| self.replace_symbols_in_rule(e))
.collect(),
),
Rule::Seq(elements) => Rule::Seq(
elements
.iter()
.map(|e| self.replace_symbols_in_rule(e))
.collect(),
),
Rule::Repeat(content) => Rule::Repeat(Box::new(self.replace_symbols_in_rule(content))),
Rule::Metadata { rule, params } => Rule::Metadata {
params: params.clone(),
rule: Box::new(self.replace_symbols_in_rule(rule)),
},
_ => rule.clone(),
}
}
fn replace_symbol(&self, symbol: Symbol) -> Symbol {
if !symbol.is_non_terminal() {
return symbol;
}
if let Some(replacement) = self.replacements.get(&symbol.index) {
return Symbol::terminal(*replacement);
}
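        // This variable was *not* moved to the lexical grammar, but variables
        // before it may have been, so shift its index down accordingly.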
let mut adjusted_index = symbol.index;
for (replaced_index, _) in self.replacements.iter() {
if *replaced_index < symbol.index {
adjusted_index -= 1;
}
}
        Symbol::non_terminal(adjusted_index)
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::grammars::VariableType;
#[test]
fn test_extraction() {
let (syntax_grammar, lexical_grammar) = extract_tokens(build_grammar(vec![
Variable::named(
"rule_0",
Rule::repeat(Rule::seq(vec![
Rule::string("a"),
Rule::pattern("b"),
Rule::choice(vec![
Rule::non_terminal(1),
Rule::non_terminal(2),
Rule::token(Rule::repeat(Rule::choice(vec![
Rule::string("c"),
Rule::string("d"),
]))),
]),
])),
),
Variable::named("rule_1", Rule::pattern("e")),
Variable::named("rule_2", Rule::pattern("b")),
Variable::named(
"rule_3",
Rule::seq(vec![Rule::non_terminal(2), Rule::Blank]),
),
]))
.unwrap();
assert_eq!(
syntax_grammar.variables,
vec![
Variable::named(
"rule_0",
Rule::repeat(Rule::seq(vec![
// The string "a" was replaced by a symbol referencing the lexical grammar
Rule::terminal(0),
// The pattern "b" was replaced by a symbol referencing the lexical grammar
Rule::terminal(1),
Rule::choice(vec![
// The symbol referencing `rule_1` was replaced by a symbol referencing
// the lexical grammar.
Rule::terminal(3),
// The symbol referencing `rule_2` had its index decremented because
// `rule_1` was moved to the lexical grammar.
Rule::non_terminal(1),
// The rule wrapped in `token` was replaced by a symbol referencing
// the lexical grammar.
Rule::terminal(2),
])
]))
),
// The pattern "e" was only used in once place: as the definition of `rule_1`,
// so that rule was moved to the lexical grammar. The pattern "b" appeared in
// two places, so it was not moved into the lexical grammar.
Variable::named("rule_2", Rule::terminal(1)),
Variable::named(
"rule_3",
Rule::seq(vec![Rule::non_terminal(1), Rule::Blank,])
),
]
);
assert_eq!(
lexical_grammar.variables,
vec![
Variable::anonymous("a", Rule::string("a")),
Variable::auxiliary("rule_0_token1", Rule::pattern("b")),
Variable::auxiliary(
"rule_0_token2",
Rule::repeat(Rule::choice(vec![Rule::string("c"), Rule::string("d"),]))
),
Variable::named("rule_1", Rule::pattern("e")),
]
);
}
#[test]
fn test_start_rule_is_token() {
let (syntax_grammar, lexical_grammar) =
extract_tokens(build_grammar(vec![Variable::named(
"rule_0",
Rule::string("hello"),
)]))
.unwrap();
assert_eq!(
syntax_grammar.variables,
vec![Variable::named("rule_0", Rule::terminal(0)),]
);
assert_eq!(
lexical_grammar.variables,
vec![Variable::anonymous("hello", Rule::string("hello")),]
)
}
#[test]
fn test_extracting_extra_tokens() {
let mut grammar = build_grammar(vec![
Variable::named("rule_0", Rule::string("x")),
Variable::named("comment", Rule::pattern("//.*")),
]);
grammar.extra_tokens = vec![Rule::string(" "), Rule::non_terminal(1)];
let (syntax_grammar, lexical_grammar) = extract_tokens(grammar).unwrap();
assert_eq!(syntax_grammar.extra_tokens, vec![Symbol::terminal(1),]);
assert_eq!(lexical_grammar.separators, vec![Rule::string(" "),]);
}
#[test]
fn test_extract_externals() {
let mut grammar = build_grammar(vec![
Variable::named(
"rule_0",
Rule::seq(vec![
Rule::external(0),
Rule::string("a"),
Rule::non_terminal(1),
Rule::non_terminal(2),
]),
),
Variable::named("rule_1", Rule::string("b")),
Variable::named("rule_2", Rule::string("c")),
]);
grammar.external_tokens = vec![
Variable::named("external_0", Rule::external(0)),
Variable::anonymous("a", Rule::string("a")),
Variable::named("rule_2", Rule::non_terminal(2)),
];
let (syntax_grammar, _) = extract_tokens(grammar).unwrap();
assert_eq!(
syntax_grammar.external_tokens,
vec![
ExternalToken {
name: "external_0".to_string(),
kind: VariableType::Named,
corresponding_internal_token: None,
},
ExternalToken {
name: "a".to_string(),
kind: VariableType::Anonymous,
corresponding_internal_token: Some(Symbol::terminal(0)),
},
ExternalToken {
name: "rule_2".to_string(),
kind: VariableType::Named,
corresponding_internal_token: Some(Symbol::terminal(2)),
},
]
);
}
#[test]
fn test_error_on_non_terminal_symbol_extras() {
let mut grammar = build_grammar(vec![
Variable::named("rule_0", Rule::non_terminal(1)),
Variable::named("rule_1", Rule::non_terminal(2)),
Variable::named("rule_2", Rule::string("x")),
]);
grammar.extra_tokens = vec![Rule::non_terminal(1)];
match extract_tokens(grammar) {
Err(Error(s)) => {
assert_eq!(
s,
"Non-token symbol 'rule_1' cannot be used as an extra token"
);
}
_ => {
panic!("Expected an error but got no error");
}
}
}
#[test]
fn test_error_on_external_with_same_name_as_non_terminal() {
let mut grammar = build_grammar(vec![
Variable::named(
"rule_0",
Rule::seq(vec![Rule::non_terminal(1), Rule::non_terminal(2)]),
),
Variable::named(
"rule_1",
Rule::seq(vec![Rule::non_terminal(2), Rule::non_terminal(2)]),
),
Variable::named("rule_2", Rule::string("a")),
]);
grammar.external_tokens = vec![Variable::named("rule_1", Rule::non_terminal(1))];
match extract_tokens(grammar) {
Err(Error(s)) => {
assert_eq!(s, "Rule 'rule_1' cannot be used as both an external token and a non-terminal rule");
}
_ => {
panic!("Expected an error but got no error");
}
}
}
fn build_grammar(variables: Vec<Variable>) -> InternedGrammar {
InternedGrammar {
variables,
extra_tokens: Vec::new(),
external_tokens: Vec::new(),
expected_conflicts: Vec::new(),
variables_to_inline: Vec::new(),
word_token: None,
}
}
}

View file

@ -0,0 +1,313 @@
use super::ExtractedSyntaxGrammar;
use crate::error::Result;
use crate::grammars::{Production, ProductionStep, SyntaxGrammar, SyntaxVariable, Variable};
use crate::rules::{Alias, Associativity, Rule};
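// Flattens a single choice-free rule tree into one `Production`, using stacks
// to track which precedence, associativity, and alias metadata apply to each
// symbol as it is visited.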
struct RuleFlattener {
production: Production,
precedence_stack: Vec<i32>,
associativity_stack: Vec<Associativity>,
alias_stack: Vec<Alias>,
}
impl RuleFlattener {
fn new() -> Self {
Self {
production: Production {
steps: Vec::new(),
dynamic_precedence: 0,
},
precedence_stack: Vec::new(),
associativity_stack: Vec::new(),
alias_stack: Vec::new(),
}
}
fn flatten(mut self, rule: Rule) -> Production {
self.apply(rule, true);
self.production
}
fn apply(&mut self, rule: Rule, at_end: bool) {
match rule {
Rule::Seq(members) => {
let last_index = members.len() - 1;
for (i, member) in members.into_iter().enumerate() {
self.apply(member, i == last_index && at_end);
}
}
Rule::Metadata { rule, params } => {
let mut has_precedence = false;
if let Some(precedence) = params.precedence {
has_precedence = true;
self.precedence_stack.push(precedence);
}
let mut has_associativity = false;
if let Some(associativity) = params.associativity {
has_associativity = true;
self.associativity_stack.push(associativity);
}
let mut has_alias = false;
if let Some(alias) = params.alias {
has_alias = true;
self.alias_stack.push(alias);
}
if params.dynamic_precedence.abs() > self.production.dynamic_precedence.abs() {
self.production.dynamic_precedence = params.dynamic_precedence;
}
self.apply(*rule, at_end);
if has_precedence {
self.precedence_stack.pop();
if !at_end {
self.production.steps.last_mut().unwrap().precedence =
self.precedence_stack.last().cloned().unwrap_or(0);
}
}
if has_associativity {
self.associativity_stack.pop();
if !at_end {
self.production.steps.last_mut().unwrap().associativity =
self.associativity_stack.last().cloned();
}
}
if has_alias {
self.alias_stack.pop();
}
}
Rule::Symbol(symbol) => {
self.production.steps.push(ProductionStep {
symbol,
precedence: self.precedence_stack.last().cloned().unwrap_or(0),
associativity: self.associativity_stack.last().cloned(),
alias: self.alias_stack.last().cloned(),
});
}
_ => (),
}
}
}
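// Distribute choices over sequences so that every returned rule is choice-free.
// For example, `seq(a, choice(b, c))` yields `seq(a, b)` and `seq(a, c)`.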
fn extract_choices(rule: Rule) -> Vec<Rule> {
match rule {
Rule::Seq(elements) => {
let mut result = vec![Rule::Blank];
for element in elements {
let extraction = extract_choices(element);
let mut next_result = Vec::new();
for entry in result {
for extraction_entry in extraction.iter() {
next_result.push(Rule::Seq(vec![entry.clone(), extraction_entry.clone()]));
}
}
result = next_result;
}
result
}
Rule::Choice(elements) => {
let mut result = Vec::new();
for element in elements {
for rule in extract_choices(element) {
result.push(rule);
}
}
result
}
Rule::Metadata { rule, params } => extract_choices(*rule)
.into_iter()
.map(|rule| Rule::Metadata {
rule: Box::new(rule),
params: params.clone(),
})
.collect(),
_ => vec![rule],
}
}
fn flatten_variable(variable: Variable) -> Result<SyntaxVariable> {
let mut productions = Vec::new();
for rule in extract_choices(variable.rule) {
let production = RuleFlattener::new().flatten(rule);
if !productions.contains(&production) {
productions.push(production);
}
}
Ok(SyntaxVariable {
name: variable.name,
kind: variable.kind,
productions,
})
}
pub(super) fn flatten_grammar(grammar: ExtractedSyntaxGrammar) -> Result<SyntaxGrammar> {
let mut variables = Vec::new();
for variable in grammar.variables {
variables.push(flatten_variable(variable)?);
}
Ok(SyntaxGrammar {
extra_tokens: grammar.extra_tokens,
expected_conflicts: grammar.expected_conflicts,
variables_to_inline: grammar.variables_to_inline,
external_tokens: grammar.external_tokens,
word_token: grammar.word_token,
variables,
})
}
#[cfg(test)]
mod tests {
use super::*;
use crate::grammars::VariableType;
use crate::rules::Symbol;
#[test]
fn test_flatten_grammar() {
let result = flatten_variable(Variable {
name: "test".to_string(),
kind: VariableType::Named,
rule: Rule::seq(vec![
Rule::non_terminal(1),
Rule::prec_left(
101,
Rule::seq(vec![
Rule::non_terminal(2),
Rule::choice(vec![
Rule::prec_right(
102,
Rule::seq(vec![Rule::non_terminal(3), Rule::non_terminal(4)]),
),
Rule::non_terminal(5),
]),
Rule::non_terminal(6),
]),
),
Rule::non_terminal(7),
]),
})
.unwrap();
assert_eq!(
result.productions,
vec![
Production {
dynamic_precedence: 0,
steps: vec![
ProductionStep::new(Symbol::non_terminal(1)),
ProductionStep::new(Symbol::non_terminal(2))
.with_prec(101, Some(Associativity::Left)),
ProductionStep::new(Symbol::non_terminal(3))
.with_prec(102, Some(Associativity::Right)),
ProductionStep::new(Symbol::non_terminal(4))
.with_prec(101, Some(Associativity::Left)),
ProductionStep::new(Symbol::non_terminal(6)),
ProductionStep::new(Symbol::non_terminal(7)),
]
},
Production {
dynamic_precedence: 0,
steps: vec![
ProductionStep::new(Symbol::non_terminal(1)),
ProductionStep::new(Symbol::non_terminal(2))
.with_prec(101, Some(Associativity::Left)),
ProductionStep::new(Symbol::non_terminal(5))
.with_prec(101, Some(Associativity::Left)),
ProductionStep::new(Symbol::non_terminal(6)),
ProductionStep::new(Symbol::non_terminal(7)),
]
},
]
);
}
#[test]
fn test_flatten_grammar_with_maximum_dynamic_precedence() {
let result = flatten_variable(Variable {
name: "test".to_string(),
kind: VariableType::Named,
rule: Rule::seq(vec![
Rule::non_terminal(1),
Rule::prec_dynamic(101, Rule::seq(vec![
Rule::non_terminal(2),
Rule::choice(vec![
Rule::prec_dynamic(102, Rule::seq(vec![
Rule::non_terminal(3),
Rule::non_terminal(4)
])),
Rule::non_terminal(5),
]),
Rule::non_terminal(6),
])),
Rule::non_terminal(7),
])
}).unwrap();
assert_eq!(result.productions, vec![
Production {
dynamic_precedence: 102,
steps: vec![
ProductionStep::new(Symbol::non_terminal(1)),
ProductionStep::new(Symbol::non_terminal(2)),
ProductionStep::new(Symbol::non_terminal(3)),
ProductionStep::new(Symbol::non_terminal(4)),
ProductionStep::new(Symbol::non_terminal(6)),
ProductionStep::new(Symbol::non_terminal(7)),
],
},
Production {
dynamic_precedence: 101,
steps: vec![
ProductionStep::new(Symbol::non_terminal(1)),
ProductionStep::new(Symbol::non_terminal(2)),
ProductionStep::new(Symbol::non_terminal(5)),
ProductionStep::new(Symbol::non_terminal(6)),
ProductionStep::new(Symbol::non_terminal(7)),
],
},
]);
}
#[test]
fn test_flatten_grammar_with_final_precedence() {
let result = flatten_variable(Variable {
name: "test".to_string(),
kind: VariableType::Named,
rule: Rule::prec_left(101, Rule::seq(vec![
Rule::non_terminal(1),
Rule::non_terminal(2),
])),
}).unwrap();
assert_eq!(result.productions, vec![
Production {
dynamic_precedence: 0,
steps: vec![
ProductionStep::new(Symbol::non_terminal(1)).with_prec(101, Some(Associativity::Left)),
ProductionStep::new(Symbol::non_terminal(2)).with_prec(101, Some(Associativity::Left)),
]
}
]);
let result = flatten_variable(Variable {
name: "test".to_string(),
kind: VariableType::Named,
rule: Rule::prec_left(101, Rule::seq(vec![
Rule::non_terminal(1),
])),
}).unwrap();
assert_eq!(result.productions, vec![
Production {
dynamic_precedence: 0,
steps: vec![
ProductionStep::new(Symbol::non_terminal(1)).with_prec(101, Some(Associativity::Left)),
]
}
]);
}
}

View file

@ -0,0 +1,238 @@
use super::InternedGrammar;
use crate::error::{Error, Result};
use crate::grammars::{InputGrammar, Variable, VariableType};
use crate::rules::{Rule, Symbol};
pub(super) fn intern_symbols(grammar: &InputGrammar) -> Result<InternedGrammar> {
let interner = Interner { grammar };
if variable_type_for_name(&grammar.variables[0].name) == VariableType::Hidden {
return Err(Error(
"Grammar's start rule must be visible".to_string(),
));
}
let mut variables = Vec::with_capacity(grammar.variables.len());
for variable in grammar.variables.iter() {
variables.push(Variable {
name: variable.name.clone(),
kind: variable_type_for_name(&variable.name),
rule: interner.intern_rule(&variable.rule)?,
});
}
let mut external_tokens = Vec::with_capacity(grammar.external_tokens.len());
for external_token in grammar.external_tokens.iter() {
let rule = interner.intern_rule(&external_token)?;
let (name, kind) = if let Rule::NamedSymbol(name) = external_token {
(name.clone(), variable_type_for_name(&name))
} else {
(String::new(), VariableType::Anonymous)
};
external_tokens.push(Variable { name, kind, rule });
}
let mut extra_tokens = Vec::with_capacity(grammar.extra_tokens.len());
for extra_token in grammar.extra_tokens.iter() {
extra_tokens.push(interner.intern_rule(extra_token)?);
}
let mut expected_conflicts = Vec::new();
for conflict in grammar.expected_conflicts.iter() {
let mut interned_conflict = Vec::with_capacity(conflict.len());
for name in conflict {
interned_conflict.push(
interner
.intern_name(&name)
.ok_or_else(|| Error::undefined_symbol(name))?,
);
}
expected_conflicts.push(interned_conflict);
}
let mut variables_to_inline = Vec::new();
for name in grammar.variables_to_inline.iter() {
if let Some(symbol) = interner.intern_name(&name) {
variables_to_inline.push(symbol);
}
}
let mut word_token = None;
if let Some(name) = grammar.word_token.as_ref() {
word_token = Some(
interner
.intern_name(&name)
.ok_or_else(|| Error::undefined_symbol(&name))?,
);
}
Ok(InternedGrammar {
variables,
external_tokens,
extra_tokens,
expected_conflicts,
variables_to_inline,
word_token,
})
}
struct Interner<'a> {
grammar: &'a InputGrammar,
}
impl<'a> Interner<'a> {
fn intern_rule(&self, rule: &Rule) -> Result<Rule> {
match rule {
Rule::Choice(elements) => {
let mut result = Vec::with_capacity(elements.len());
for element in elements {
result.push(self.intern_rule(element)?);
}
Ok(Rule::Choice(result))
}
Rule::Seq(elements) => {
let mut result = Vec::with_capacity(elements.len());
for element in elements {
result.push(self.intern_rule(element)?);
}
Ok(Rule::Seq(result))
}
Rule::Repeat(content) => Ok(Rule::Repeat(Box::new(self.intern_rule(content)?))),
Rule::Metadata { rule, params } => Ok(Rule::Metadata {
rule: Box::new(self.intern_rule(rule)?),
params: params.clone(),
}),
Rule::NamedSymbol(name) => {
if let Some(symbol) = self.intern_name(&name) {
Ok(Rule::Symbol(symbol))
} else {
Err(Error::undefined_symbol(name))
}
}
_ => Ok(rule.clone()),
}
}
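    // Non-terminal variables take precedence over external tokens that share
    // the same name; such an external token resolves to the non-terminal's index.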
fn intern_name(&self, symbol: &str) -> Option<Symbol> {
for (i, variable) in self.grammar.variables.iter().enumerate() {
if variable.name == symbol {
return Some(Symbol::non_terminal(i));
}
}
for (i, external_token) in self.grammar.external_tokens.iter().enumerate() {
if let Rule::NamedSymbol(name) = external_token {
if name == symbol {
return Some(Symbol::external(i));
}
}
}
        None
}
}
fn variable_type_for_name(name: &str) -> VariableType {
if name.starts_with("_") {
VariableType::Hidden
} else {
VariableType::Named
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
    fn test_symbol_interning() {
let grammar = intern_symbols(&build_grammar(vec![
Variable::named("x", Rule::choice(vec![Rule::named("y"), Rule::named("_z")])),
Variable::named("y", Rule::named("_z")),
Variable::named("_z", Rule::string("a")),
]))
.unwrap();
assert_eq!(
grammar.variables,
vec![
Variable::named(
"x",
Rule::choice(vec![Rule::non_terminal(1), Rule::non_terminal(2),])
),
Variable::named("y", Rule::non_terminal(2)),
Variable::hidden("_z", Rule::string("a")),
]
);
}
#[test]
fn test_interning_external_token_names() {
// Variable `y` is both an internal and an external token.
// Variable `z` is just an external token.
let mut input_grammar = build_grammar(vec![
Variable::named(
"w",
Rule::choice(vec![Rule::named("x"), Rule::named("y"), Rule::named("z")]),
),
Variable::named("x", Rule::string("a")),
Variable::named("y", Rule::string("b")),
]);
input_grammar
.external_tokens
.extend(vec![Rule::named("y"), Rule::named("z")]);
let grammar = intern_symbols(&input_grammar).unwrap();
// Variable `y` is referred to by its internal index.
// Variable `z` is referred to by its external index.
assert_eq!(
grammar.variables,
vec![
Variable::named(
"w",
Rule::choice(vec![
Rule::non_terminal(1),
Rule::non_terminal(2),
Rule::external(1),
])
),
Variable::named("x", Rule::string("a")),
Variable::named("y", Rule::string("b")),
]
);
// The external token for `y` refers back to its internal index.
assert_eq!(
grammar.external_tokens,
vec![
Variable::named("y", Rule::non_terminal(2)),
Variable::named("z", Rule::external(1)),
]
);
}
#[test]
fn test_grammar_with_undefined_symbols() {
let result = intern_symbols(&build_grammar(vec![Variable::named("x", Rule::named("y"))]));
match result {
Err(Error(message)) => assert_eq!(message, "Undefined symbol 'y'"),
_ => panic!("Expected an error but got none"),
}
}
fn build_grammar(variables: Vec<Variable>) -> InputGrammar {
InputGrammar {
variables,
name: "the_language".to_string(),
extra_tokens: Vec::new(),
external_tokens: Vec::new(),
expected_conflicts: Vec::new(),
variables_to_inline: Vec::new(),
word_token: None,
}
}
}

View file

@ -0,0 +1,57 @@
mod expand_repeats;
mod expand_tokens;
mod extract_simple_aliases;
mod extract_tokens;
mod flatten_grammar;
mod intern_symbols;
mod process_inlines;
use self::expand_repeats::expand_repeats;
pub(crate) use self::expand_tokens::expand_tokens;
use self::extract_simple_aliases::extract_simple_aliases;
use self::extract_tokens::extract_tokens;
use self::flatten_grammar::flatten_grammar;
use self::intern_symbols::intern_symbols;
use self::process_inlines::process_inlines;
use crate::error::Result;
use crate::grammars::{
ExternalToken, InlinedProductionMap, InputGrammar, LexicalGrammar, SyntaxGrammar, Variable,
};
use crate::rules::{AliasMap, Rule, Symbol};
pub(crate) struct IntermediateGrammar<T, U> {
variables: Vec<Variable>,
extra_tokens: Vec<T>,
expected_conflicts: Vec<Vec<Symbol>>,
external_tokens: Vec<U>,
variables_to_inline: Vec<Symbol>,
word_token: Option<Symbol>,
}
pub(crate) type InternedGrammar = IntermediateGrammar<Rule, Variable>;
pub(crate) type ExtractedSyntaxGrammar = IntermediateGrammar<Symbol, ExternalToken>;
#[derive(Debug, PartialEq, Eq)]
pub(crate) struct ExtractedLexicalGrammar {
pub variables: Vec<Variable>,
pub separators: Vec<Rule>,
}
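// The full grammar-preparation pipeline: intern symbol names, extract terminal
// rules into a lexical grammar, expand repeats into auxiliary rules, flatten
// choices into productions, compile token rules into an NFA, and finally
// compute the simple aliases and inlined productions.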
pub(crate) fn prepare_grammar(
input_grammar: &InputGrammar,
) -> Result<(
SyntaxGrammar,
LexicalGrammar,
InlinedProductionMap,
AliasMap,
)> {
let interned_grammar = intern_symbols(input_grammar)?;
let (syntax_grammar, lexical_grammar) = extract_tokens(interned_grammar)?;
let syntax_grammar = expand_repeats(syntax_grammar);
let mut syntax_grammar = flatten_grammar(syntax_grammar)?;
let lexical_grammar = expand_tokens(lexical_grammar)?;
let simple_aliases = extract_simple_aliases(&mut syntax_grammar, &lexical_grammar);
let inlines = process_inlines(&syntax_grammar);
Ok((syntax_grammar, lexical_grammar, inlines, simple_aliases))
}

View file

@ -0,0 +1,479 @@
use crate::grammars::{InlinedProductionMap, Production, ProductionStep, SyntaxGrammar};
use hashbrown::HashMap;
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
struct ProductionStepId {
// A `None` value here means that the production itself was produced via inlining,
    // and is stored in the builder's `productions` vector, as opposed to being
// stored in one of the grammar's variables.
variable_index: Option<usize>,
production_index: usize,
step_index: usize,
}
struct InlinedProductionMapBuilder {
production_indices_by_step_id: HashMap<ProductionStepId, Vec<usize>>,
productions: Vec<Production>,
}
impl InlinedProductionMapBuilder {
fn build<'a>(mut self, grammar: &'a SyntaxGrammar) -> InlinedProductionMap {
let mut step_ids_to_process = Vec::new();
for (variable_index, variable) in grammar.variables.iter().enumerate() {
for production_index in 0..variable.productions.len() {
step_ids_to_process.push(ProductionStepId {
variable_index: Some(variable_index),
production_index,
step_index: 0,
});
while !step_ids_to_process.is_empty() {
let mut i = 0;
while i < step_ids_to_process.len() {
let step_id = step_ids_to_process[i];
if let Some(step) = self.production_step_for_id(step_id, grammar) {
if grammar.variables_to_inline.contains(&step.symbol) {
let inlined_step_ids = self
.inline_production_at_step(step_id, grammar)
.into_iter()
.cloned()
.map(|production_index| ProductionStepId {
variable_index: None,
production_index,
step_index: step_id.step_index,
});
step_ids_to_process.splice(i..i + 1, inlined_step_ids);
} else {
step_ids_to_process[i] = ProductionStepId {
variable_index: step_id.variable_index,
production_index: step_id.production_index,
step_index: step_id.step_index + 1,
};
i += 1;
}
} else {
step_ids_to_process.remove(i);
}
}
}
}
}
let productions = self.productions;
let production_indices_by_step_id = self.production_indices_by_step_id;
let production_map = production_indices_by_step_id
.into_iter()
.map(|(step_id, production_indices)| {
let production = if let Some(variable_index) = step_id.variable_index {
&grammar.variables[variable_index].productions[step_id.production_index]
} else {
&productions[step_id.production_index]
} as *const Production;
((production, step_id.step_index as u32), production_indices)
})
.collect();
InlinedProductionMap {
productions,
production_map,
}
}
fn inline_production_at_step<'a>(
&'a mut self,
step_id: ProductionStepId,
grammar: &'a SyntaxGrammar,
) -> &'a Vec<usize> {
// Build a list of productions produced by inlining rules.
let mut i = 0;
let step_index = step_id.step_index;
let mut productions_to_add = vec![self.production_for_id(step_id, grammar).clone()];
while i < productions_to_add.len() {
if let Some(step) = productions_to_add[i].steps.get(step_index) {
let symbol = step.symbol.clone();
if grammar.variables_to_inline.contains(&symbol) {
// Remove the production from the vector, replacing it with a placeholder.
let production = productions_to_add
.splice(i..i + 1, [Production::default()].iter().cloned())
.next()
.unwrap();
// Replace the placeholder with the inlined productions.
productions_to_add.splice(
i..i + 1,
grammar.variables[symbol.index].productions.iter().map(|p| {
let mut production = production.clone();
let removed_step = production
.steps
.splice(step_index..(step_index + 1), p.steps.iter().cloned())
.next()
.unwrap();
let inserted_steps =
&mut production.steps[step_index..(step_index + p.steps.len())];
if let Some(alias) = removed_step.alias {
for inserted_step in inserted_steps.iter_mut() {
inserted_step.alias = Some(alias.clone());
}
}
if let Some(last_inserted_step) = inserted_steps.last_mut() {
if last_inserted_step.precedence == 0 {
last_inserted_step.precedence = removed_step.precedence;
}
if last_inserted_step.associativity == None {
last_inserted_step.associativity = removed_step.associativity;
}
}
production
}),
);
continue;
}
}
i += 1;
}
// Store all the computed productions.
let result = productions_to_add
.into_iter()
.map(|production| {
self.productions
.iter()
.position(|p| *p == production)
                    .unwrap_or_else(|| {
                        // `unwrap_or` would evaluate its argument eagerly, pushing a
                        // duplicate copy even when the production is already stored.
                        self.productions.push(production);
                        self.productions.len() - 1
                    })
})
.collect();
// Cache these productions based on the original production step.
self.production_indices_by_step_id
.entry(step_id)
.or_insert(result)
}
fn production_for_id<'a>(
&'a self,
id: ProductionStepId,
grammar: &'a SyntaxGrammar,
) -> &'a Production {
if let Some(variable_index) = id.variable_index {
&grammar.variables[variable_index].productions[id.production_index]
} else {
&self.productions[id.production_index]
}
}
fn production_step_for_id<'a>(
&'a self,
id: ProductionStepId,
grammar: &'a SyntaxGrammar,
) -> Option<&'a ProductionStep> {
self.production_for_id(id, grammar).steps.get(id.step_index)
}
}
pub(super) fn process_inlines(grammar: &SyntaxGrammar) -> InlinedProductionMap {
InlinedProductionMapBuilder {
productions: Vec::new(),
production_indices_by_step_id: HashMap::new(),
}
.build(grammar)
}
#[cfg(test)]
mod tests {
use super::*;
use crate::grammars::{ProductionStep, SyntaxVariable, VariableType};
use crate::rules::{Associativity, Symbol};
#[test]
fn test_basic_inlining() {
let grammar = SyntaxGrammar {
expected_conflicts: Vec::new(),
extra_tokens: Vec::new(),
external_tokens: Vec::new(),
word_token: None,
variables_to_inline: vec![Symbol::non_terminal(1)],
variables: vec![
SyntaxVariable {
name: "non-terminal-0".to_string(),
kind: VariableType::Named,
productions: vec![Production {
dynamic_precedence: 0,
steps: vec![
ProductionStep::new(Symbol::terminal(10)),
ProductionStep::new(Symbol::non_terminal(1)), // inlined
ProductionStep::new(Symbol::terminal(11)),
],
}],
},
SyntaxVariable {
name: "non-terminal-1".to_string(),
kind: VariableType::Named,
productions: vec![
Production {
dynamic_precedence: 0,
steps: vec![
ProductionStep::new(Symbol::terminal(12)),
ProductionStep::new(Symbol::terminal(13)),
],
},
Production {
dynamic_precedence: 0,
steps: vec![ProductionStep::new(Symbol::terminal(14))],
},
],
},
],
};
let inline_map = process_inlines(&grammar);
// Nothing to inline at step 0.
assert!(inline_map
.inlined_productions(&grammar.variables[0].productions[0], 0)
.is_none());
// Inlining variable 1 yields two productions.
assert_eq!(
inline_map
.inlined_productions(&grammar.variables[0].productions[0], 1)
.unwrap()
.cloned()
.collect::<Vec<_>>(),
vec![
Production {
dynamic_precedence: 0,
steps: vec![
ProductionStep::new(Symbol::terminal(10)),
ProductionStep::new(Symbol::terminal(12)),
ProductionStep::new(Symbol::terminal(13)),
ProductionStep::new(Symbol::terminal(11)),
],
},
Production {
dynamic_precedence: 0,
steps: vec![
ProductionStep::new(Symbol::terminal(10)),
ProductionStep::new(Symbol::terminal(14)),
ProductionStep::new(Symbol::terminal(11)),
],
},
]
);
}
#[test]
fn test_nested_inlining() {
let grammar = SyntaxGrammar {
variables: vec![
SyntaxVariable {
name: "non-terminal-0".to_string(),
kind: VariableType::Named,
productions: vec![Production {
dynamic_precedence: 0,
steps: vec![
ProductionStep::new(Symbol::terminal(10)),
ProductionStep::new(Symbol::non_terminal(1)), // inlined
ProductionStep::new(Symbol::terminal(11)),
ProductionStep::new(Symbol::non_terminal(2)), // inlined
ProductionStep::new(Symbol::terminal(12)),
],
}],
},
SyntaxVariable {
name: "non-terminal-1".to_string(),
kind: VariableType::Named,
productions: vec![
Production {
dynamic_precedence: 0,
steps: vec![ProductionStep::new(Symbol::terminal(13))],
},
Production {
dynamic_precedence: 0,
steps: vec![
ProductionStep::new(Symbol::non_terminal(3)), // inlined
ProductionStep::new(Symbol::terminal(14)),
],
},
],
},
SyntaxVariable {
name: "non-terminal-2".to_string(),
kind: VariableType::Named,
productions: vec![Production {
dynamic_precedence: 0,
steps: vec![ProductionStep::new(Symbol::terminal(15))],
}],
},
SyntaxVariable {
name: "non-terminal-3".to_string(),
kind: VariableType::Named,
productions: vec![Production {
dynamic_precedence: 0,
steps: vec![ProductionStep::new(Symbol::terminal(16))],
}],
},
],
variables_to_inline: vec![
Symbol::non_terminal(1),
Symbol::non_terminal(2),
Symbol::non_terminal(3),
],
expected_conflicts: Vec::new(),
extra_tokens: Vec::new(),
external_tokens: Vec::new(),
word_token: None,
};
let inline_map = process_inlines(&grammar);
let productions: Vec<&Production> = inline_map
.inlined_productions(&grammar.variables[0].productions[0], 1)
.unwrap()
.collect();
assert_eq!(
productions.iter().cloned().cloned().collect::<Vec<_>>(),
vec![
Production {
dynamic_precedence: 0,
steps: vec![
ProductionStep::new(Symbol::terminal(10)),
ProductionStep::new(Symbol::terminal(13)),
ProductionStep::new(Symbol::terminal(11)),
ProductionStep::new(Symbol::non_terminal(2)),
ProductionStep::new(Symbol::terminal(12)),
],
},
Production {
dynamic_precedence: 0,
steps: vec![
ProductionStep::new(Symbol::terminal(10)),
ProductionStep::new(Symbol::terminal(16)),
ProductionStep::new(Symbol::terminal(14)),
ProductionStep::new(Symbol::terminal(11)),
ProductionStep::new(Symbol::non_terminal(2)),
ProductionStep::new(Symbol::terminal(12)),
],
},
]
);
assert_eq!(
inline_map
.inlined_productions(productions[0], 3)
.unwrap()
.cloned()
.collect::<Vec<_>>(),
vec![Production {
dynamic_precedence: 0,
steps: vec![
ProductionStep::new(Symbol::terminal(10)),
ProductionStep::new(Symbol::terminal(13)),
ProductionStep::new(Symbol::terminal(11)),
ProductionStep::new(Symbol::terminal(15)),
ProductionStep::new(Symbol::terminal(12)),
],
},]
);
}
#[test]
fn test_inlining_with_precedence_and_alias() {
let grammar = SyntaxGrammar {
variables_to_inline: vec![Symbol::non_terminal(1), Symbol::non_terminal(2)],
variables: vec![
SyntaxVariable {
name: "non-terminal-0".to_string(),
kind: VariableType::Named,
productions: vec![Production {
dynamic_precedence: 0,
steps: vec![
// inlined
ProductionStep::new(Symbol::non_terminal(1))
.with_prec(1, Some(Associativity::Left)),
ProductionStep::new(Symbol::terminal(10)),
// inlined
ProductionStep::new(Symbol::non_terminal(2))
.with_alias("outer_alias", true),
],
}],
},
SyntaxVariable {
name: "non-terminal-1".to_string(),
kind: VariableType::Named,
productions: vec![Production {
dynamic_precedence: 0,
steps: vec![
ProductionStep::new(Symbol::terminal(11))
.with_prec(2, None)
.with_alias("inner_alias", true),
ProductionStep::new(Symbol::terminal(12)).with_prec(3, None),
],
}],
},
SyntaxVariable {
name: "non-terminal-2".to_string(),
kind: VariableType::Named,
productions: vec![Production {
dynamic_precedence: 0,
steps: vec![ProductionStep::new(Symbol::terminal(13))],
}],
},
],
expected_conflicts: Vec::new(),
extra_tokens: Vec::new(),
external_tokens: Vec::new(),
word_token: None,
};
let inline_map = process_inlines(&grammar);
let productions: Vec<_> = inline_map
.inlined_productions(&grammar.variables[0].productions[0], 0)
.unwrap()
.collect();
assert_eq!(
productions.iter().cloned().cloned().collect::<Vec<_>>(),
vec![Production {
dynamic_precedence: 0,
steps: vec![
// The first step in the inlined production retains its precedence
// and alias.
ProductionStep::new(Symbol::terminal(11))
.with_prec(2, None)
.with_alias("inner_alias", true),
// The final step of the inlined production inherits the precedence of
// the inlined step.
ProductionStep::new(Symbol::terminal(12))
.with_prec(1, Some(Associativity::Left)),
ProductionStep::new(Symbol::terminal(10)),
ProductionStep::new(Symbol::non_terminal(2)).with_alias("outer_alias", true),
]
}],
);
assert_eq!(
inline_map
.inlined_productions(productions[0], 3)
.unwrap()
.cloned()
.collect::<Vec<_>>(),
vec![Production {
dynamic_precedence: 0,
steps: vec![
ProductionStep::new(Symbol::terminal(11))
.with_prec(2, None)
.with_alias("inner_alias", true),
ProductionStep::new(Symbol::terminal(12))
.with_prec(1, Some(Associativity::Left)),
ProductionStep::new(Symbol::terminal(10)),
// All steps of the inlined production inherit their alias from the
// inlined step.
ProductionStep::new(Symbol::terminal(13)).with_alias("outer_alias", true),
]
}],
);
}
}

1034
cli/src/render/mod.rs Normal file

File diff suppressed because it is too large

234
cli/src/rules.rs Normal file

@@ -0,0 +1,234 @@
use hashbrown::HashMap;
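// The kinds of symbols that can occur in a grammar: tokens provided by an
// external scanner, the end-of-input marker, terminals, and non-terminals.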
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub(crate) enum SymbolType {
External,
End,
Terminal,
NonTerminal,
}
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub(crate) enum Associativity {
Left,
Right,
}
#[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub(crate) struct Alias {
pub value: String,
pub is_named: bool,
}
pub(crate) type AliasMap = HashMap<Symbol, Alias>;
#[derive(Clone, Debug, Default, PartialEq, Eq, Hash)]
pub(crate) struct MetadataParams {
pub precedence: Option<i32>,
pub dynamic_precedence: i32,
pub associativity: Option<Associativity>,
pub is_token: bool,
pub is_string: bool,
pub is_active: bool,
pub is_main_token: bool,
pub alias: Option<Alias>,
}
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub(crate) struct Symbol {
pub kind: SymbolType,
pub index: usize,
}
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub(crate) enum Rule {
Blank,
String(String),
Pattern(String),
NamedSymbol(String),
Symbol(Symbol),
Choice(Vec<Rule>),
Metadata {
params: MetadataParams,
rule: Box<Rule>,
},
Repeat(Box<Rule>),
Seq(Vec<Rule>),
}
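// Builder methods that wrap a rule in `Rule::Metadata`, mirroring the
// `alias`, `token`, `prec`, etc. functions of the grammar DSL.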
impl Rule {
pub fn alias(content: Rule, value: String, is_named: bool) -> Self {
add_metadata(content, move |params| {
params.alias = Some(Alias { value, is_named });
})
}
pub fn token(content: Rule) -> Self {
add_metadata(content, |params| {
params.is_token = true;
})
}
pub fn immediate_token(content: Rule) -> Self {
add_metadata(content, |params| {
params.is_token = true;
params.is_main_token = true;
})
}
pub fn prec(value: i32, content: Rule) -> Self {
add_metadata(content, |params| {
params.precedence = Some(value);
})
}
pub fn prec_left(value: i32, content: Rule) -> Self {
add_metadata(content, |params| {
params.associativity = Some(Associativity::Left);
params.precedence = Some(value);
})
}
pub fn prec_right(value: i32, content: Rule) -> Self {
add_metadata(content, |params| {
params.associativity = Some(Associativity::Right);
params.precedence = Some(value);
})
}
pub fn prec_dynamic(value: i32, content: Rule) -> Self {
add_metadata(content, |params| {
params.dynamic_precedence = value;
})
}
pub fn repeat(rule: Rule) -> Self {
Rule::Repeat(Box::new(rule))
}
pub fn choice(rules: Vec<Rule>) -> Self {
let mut elements = Vec::with_capacity(rules.len());
for rule in rules {
choice_helper(&mut elements, rule);
}
Rule::Choice(elements)
}
pub fn seq(rules: Vec<Rule>) -> Self {
Rule::Seq(rules)
}
}
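// For example, a grammar fragment like `prec.left(1, seq($.a, $.b))` could be
// built up as the following sketch (using the test-only `named` helper
// defined below):
//
//     Rule::prec_left(1, Rule::seq(vec![Rule::named("a"), Rule::named("b")]))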
#[cfg(test)]
impl Rule {
pub fn terminal(index: usize) -> Self {
Rule::Symbol(Symbol::terminal(index))
}
pub fn non_terminal(index: usize) -> Self {
Rule::Symbol(Symbol::non_terminal(index))
}
pub fn external(index: usize) -> Self {
Rule::Symbol(Symbol::external(index))
}
pub fn named(name: &'static str) -> Self {
Rule::NamedSymbol(name.to_string())
}
pub fn string(value: &'static str) -> Self {
Rule::String(value.to_string())
}
pub fn pattern(value: &'static str) -> Self {
Rule::Pattern(value.to_string())
}
}
impl Symbol {
pub fn is_terminal(&self) -> bool {
self.kind == SymbolType::Terminal
}
pub fn is_non_terminal(&self) -> bool {
self.kind == SymbolType::NonTerminal
}
pub fn is_external(&self) -> bool {
self.kind == SymbolType::External
}
pub fn is_eof(&self) -> bool {
self.kind == SymbolType::End
}
pub fn non_terminal(index: usize) -> Self {
Symbol {
kind: SymbolType::NonTerminal,
index,
}
}
pub fn terminal(index: usize) -> Self {
Symbol {
kind: SymbolType::Terminal,
index,
}
}
pub fn external(index: usize) -> Self {
Symbol {
kind: SymbolType::External,
index,
}
}
pub fn end() -> Self {
Symbol {
kind: SymbolType::End,
index: 0,
}
}
}
impl From<Symbol> for Rule {
fn from(symbol: Symbol) -> Self {
Rule::Symbol(symbol)
}
}
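// Applies `f` to a rule's metadata parameters, reusing an existing
// `Rule::Metadata` wrapper when there is one rather than nesting a second layer.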
fn add_metadata<T: FnOnce(&mut MetadataParams)>(input: Rule, f: T) -> Rule {
match input {
Rule::Metadata { rule, mut params } => {
f(&mut params);
Rule::Metadata { rule, params }
}
_ => {
let mut params = MetadataParams::default();
f(&mut params);
Rule::Metadata {
rule: Box::new(input),
params,
}
}
}
}
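// Recursively flattens nested choices into a single list of alternatives,
// skipping duplicates, so that `choice(a, choice(b, a))` yields `Choice([a, b])`.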
fn choice_helper(result: &mut Vec<Rule>, rule: Rule) {
match rule {
Rule::Choice(elements) => {
for element in elements {
choice_helper(result, element);
}
}
_ => {
if !result.contains(&rule) {
result.push(rule);
}
}
}
}

140
cli/src/tables.rs Normal file

@@ -0,0 +1,140 @@
use crate::nfa::CharacterSet;
use crate::rules::{Alias, Associativity, Symbol};
use hashbrown::HashMap;
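// In-memory representations of the generated parse table and lex table,
// produced during table construction and consumed when rendering the parser.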
pub(crate) type AliasSequenceId = usize;
pub(crate) type ParseStateId = usize;
pub(crate) type LexStateId = usize;
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) enum ParseAction {
Accept,
Shift {
state: ParseStateId,
is_repetition: bool,
},
ShiftExtra,
Recover,
Reduce {
symbol: Symbol,
child_count: usize,
precedence: i32,
dynamic_precedence: i32,
associativity: Option<Associativity>,
alias_sequence_id: AliasSequenceId,
},
}
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) struct ParseTableEntry {
pub actions: Vec<ParseAction>,
pub reusable: bool,
}
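// A single state in the parse table: `terminal_entries` maps lookahead tokens
// to actions, `nonterminal_entries` is the goto table, and
// `unfinished_item_signature` is a hash summarizing the state's unfinished items.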
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) struct ParseState {
pub terminal_entries: HashMap<Symbol, ParseTableEntry>,
pub nonterminal_entries: HashMap<Symbol, ParseStateId>,
pub lex_state_id: usize,
pub unfinished_item_signature: u64,
}
#[derive(Debug, PartialEq, Eq)]
pub(crate) struct ParseTable {
pub states: Vec<ParseState>,
pub symbols: Vec<Symbol>,
pub alias_sequences: Vec<Vec<Option<Alias>>>,
pub max_aliased_production_length: usize,
}
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) struct AdvanceAction {
pub state: Option<LexStateId>,
pub in_main_token: bool,
}
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub(crate) struct LexState {
pub advance_actions: Vec<(CharacterSet, AdvanceAction)>,
pub accept_action: Option<Symbol>,
}
#[derive(Debug, PartialEq, Eq)]
pub(crate) struct LexTable {
pub states: Vec<LexState>,
}
impl ParseTableEntry {
pub fn new() -> Self {
Self {
reusable: true,
actions: Vec::new(),
}
}
}
impl Default for LexTable {
fn default() -> Self {
LexTable { states: Vec::new() }
}
}
impl ParseState {
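// Returns the id of every parse state that this state can transition to,
// via shift actions on terminals or goto entries on non-terminals.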
pub fn referenced_states<'a>(&'a self) -> impl Iterator<Item = ParseStateId> + 'a {
self.terminal_entries
.iter()
.flat_map(|(_, entry)| {
entry.actions.iter().filter_map(|action| match action {
ParseAction::Shift { state, .. } => Some(*state),
_ => None,
})
})
.chain(self.nonterminal_entries.iter().map(|(_, state)| *state))
}
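// Rewrites every referenced state id using `f`, so that states can be
// renumbered (e.g. after deduplication) without breaking shift actions or
// goto entries.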
pub fn update_referenced_states<F>(&mut self, mut f: F)
where
F: FnMut(usize, &ParseState) -> usize,
{
let mut updates = Vec::new();
for (symbol, entry) in &self.terminal_entries {
for (i, action) in entry.actions.iter().enumerate() {
if let ParseAction::Shift { state, .. } = action {
let result = f(*state, self);
if result != *state {
updates.push((*symbol, i, result));
}
}
}
}
for (symbol, other_state) in &self.nonterminal_entries {
let result = f(*other_state, self);
if result != *other_state {
updates.push((*symbol, 0, result));
}
}
for (symbol, action_index, new_state) in updates {
if symbol.is_non_terminal() {
self.nonterminal_entries.insert(symbol, new_state);
} else {
let entry = self.terminal_entries.get_mut(&symbol).unwrap();
if let ParseAction::Shift { is_repetition, .. } = entry.actions[action_index] {
entry.actions[action_index] = ParseAction::Shift {
state: new_state,
is_repetition,
};
}
}
}
}
}
impl ParseAction {
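// Only `Reduce` actions carry a static precedence; every other action
// defaults to 0.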
pub fn precedence(&self) -> i32 {
if let ParseAction::Reduce { precedence, .. } = self {
*precedence
} else {
0
}
}
}