diff --git a/src/build_tables/item.rs b/src/build_tables/item.rs
index c99815eb..9208f602 100644
--- a/src/build_tables/item.rs
+++ b/src/build_tables/item.rs
@@ -1,10 +1,10 @@
 use super::inline_variables::InlinedProductionMap;
-use crate::grammars::{Production, ProductionStep, SyntaxGrammar};
-use crate::rules::{Symbol, SymbolType};
+use crate::grammars::{LexicalGrammar, Production, ProductionStep, SyntaxGrammar};
+use crate::rules::{Associativity, Symbol, SymbolType};
 use smallbitvec::SmallBitVec;
-use std::collections::HashMap;
-use std::hash::{Hash, Hasher};
+use std::collections::{BTreeMap, HashMap};
 use std::fmt;
+use std::hash::{Hash, Hasher};
 
 lazy_static! {
     static ref START_PRODUCTION: Production = Production {
@@ -28,7 +28,7 @@ pub(crate) struct LookaheadSet {
     eof: bool,
 }
 
-#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
 pub(crate) enum ParseItem {
     Start {
         step_index: u32,
@@ -47,10 +47,29 @@ pub(crate) enum ParseItem {
 
 #[derive(Clone, Debug, PartialEq, Eq)]
 pub(crate) struct ParseItemSet {
-    pub entries: HashMap<ParseItem, LookaheadSet>,
+    pub entries: BTreeMap<ParseItem, LookaheadSet>,
 }
 
-pub(crate) struct ParseItemDisplay<'a>(&'a ParseItem, &'a SyntaxGrammar, &'a InlinedProductionMap);
+pub(crate) struct ParseItemDisplay<'a>(
+    &'a ParseItem,
+    &'a SyntaxGrammar,
+    &'a LexicalGrammar,
+    &'a InlinedProductionMap,
+);
+
+pub(crate) struct LookaheadSetDisplay<'a>(&'a LookaheadSet, &'a SyntaxGrammar, &'a LexicalGrammar);
+
+pub(crate) struct ParseItemSetDisplay<'a>(
+    &'a ParseItemSet,
+    &'a SyntaxGrammar,
+    &'a LexicalGrammar,
+    &'a InlinedProductionMap,
+);
+
+struct ParseItemSetMapEntry(ParseItemSet, u64);
+pub(crate) struct ParseItemSetMap {
+    map: HashMap<u64, Vec<ParseItemSetMapEntry>>,
+}
 
 impl LookaheadSet {
     pub fn new() -> Self {
@@ -61,12 +80,61 @@ impl LookaheadSet {
         }
     }
 
-    pub fn insert(&mut self, other: Symbol) {
-        match other.kind {
-            SymbolType::NonTerminal => panic!("Cannot store non-terminals in a LookaheadSet"),
-            SymbolType::Terminal => self.terminal_bits.set(other.index, true),
-            SymbolType::External => self.external_bits.set(other.index, true),
+    pub fn iter<'a>(&'a self) -> impl Iterator<Item = Symbol> + 'a {
+        self.terminal_bits
+            .iter()
+            .enumerate()
+            .filter_map(|(i, value)| {
+                if value {
+                    Some(Symbol::terminal(i))
+                } else {
+                    None
+                }
+            })
+            .chain(
+                self.external_bits
+                    .iter()
+                    .enumerate()
+                    .filter_map(|(i, value)| {
+                        if value {
+                            Some(Symbol::external(i))
+                        } else {
+                            None
+                        }
+                    }),
+            )
+            .chain(if self.eof { Some(Symbol::end()) } else { None })
+    }
+
+    pub fn with<'a>(symbols: impl IntoIterator<Item = &'a Symbol>) -> Self {
+        let mut result = Self::new();
+        for symbol in symbols {
+            result.insert(*symbol);
         }
+        result
+    }
+
+    pub fn contains(&self, symbol: &Symbol) -> bool {
+        match symbol.kind {
+            SymbolType::NonTerminal => panic!("Cannot store non-terminals in a LookaheadSet"),
+            SymbolType::Terminal => self.terminal_bits.get(symbol.index).unwrap_or(false),
+            SymbolType::External => self.external_bits.get(symbol.index).unwrap_or(false),
+            SymbolType::End => self.eof,
+        }
+    }
+
+    pub fn insert(&mut self, other: Symbol) {
+        let vec = match other.kind {
+            SymbolType::NonTerminal => panic!("Cannot store non-terminals in a LookaheadSet"),
+            SymbolType::Terminal => &mut self.terminal_bits,
+            SymbolType::External => &mut self.external_bits,
+            SymbolType::End => {
+                self.eof = true;
+                return;
+            }
+        };
+        // Only grow the bit vector when the index is out of range; resizing
+        // unconditionally would truncate bits that are already set.
+        if other.index >= vec.len() {
+            vec.resize(other.index + 1, false);
+        }
+        vec.set(other.index, true);
     }
 
     pub fn insert_all(&mut self, other: &LookaheadSet) -> bool {
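For context on the representation: `LookaheadSet` stores terminals and external tokens as dense bit vectors indexed by symbol index, plus an `eof` flag, so membership tests are O(1) and `insert_all` can union two sets cheaply. A minimal sketch of the same shape using a plain `Vec<bool>` (the `TokenSet` name here is illustrative, not from the crate):

```rust
struct TokenSet {
    bits: Vec<bool>,
}

impl TokenSet {
    fn new() -> Self {
        TokenSet { bits: Vec::new() }
    }

    fn insert(&mut self, index: usize) {
        // Grow before setting, so inserts past the current length work.
        if index >= self.bits.len() {
            self.bits.resize(index + 1, false);
        }
        self.bits[index] = true;
    }

    fn contains(&self, index: usize) -> bool {
        // Out-of-range lookups mean "absent", mirroring get().unwrap_or(false).
        self.bits.get(index).copied().unwrap_or(false)
    }
}

fn main() {
    let mut set = TokenSet::new();
    set.insert(5);
    assert!(set.contains(5));
    assert!(!set.contains(100)); // never inserted, and out of range
}
```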
@@ -95,6 +163,14 @@
         }
         result
     }
+
+    pub fn display_with<'a>(
+        &'a self,
+        syntax_grammar: &'a SyntaxGrammar,
+        lexical_grammar: &'a LexicalGrammar,
+    ) -> LookaheadSetDisplay<'a> {
+        LookaheadSetDisplay(self, syntax_grammar, lexical_grammar)
+    }
 }
 
 impl ParseItem {
@@ -126,18 +202,53 @@ impl ParseItem {
             &grammar.variables[*variable_index as usize].productions[*production_index as usize]
             }
             ParseItem::Inlined {
-                production_index,
-                ..
+                production_index, ..
             } => &inlined_productions.inlined_productions[*production_index as usize],
         }
     }
 
+    pub fn symbol(
+        &self,
+        grammar: &SyntaxGrammar,
+        inlined_productions: &InlinedProductionMap,
+    ) -> Option<Symbol> {
+        self.step(grammar, inlined_productions).map(|s| s.symbol)
+    }
+
     pub fn step<'a>(
         &self,
         grammar: &'a SyntaxGrammar,
        inlined_productions: &'a InlinedProductionMap,
     ) -> Option<&'a ProductionStep> {
-        self.production(grammar, inlined_productions).steps.get(self.step_index())
+        self.production(grammar, inlined_productions)
+            .steps
+            .get(self.step_index())
+    }
+
+    pub fn precedence<'a>(
+        &self,
+        grammar: &'a SyntaxGrammar,
+        inlines: &'a InlinedProductionMap,
+    ) -> i32 {
+        // The precedence of an item is the precedence of the most recently
+        // consumed step. Guard against underflow when the dot is at the very
+        // beginning of the production.
+        let production = self.production(grammar, inlines);
+        self.step_index()
+            .checked_sub(1)
+            .and_then(|i| production.steps.get(i))
+            .map(|s| s.precedence)
+            .unwrap_or(0)
+    }
+
+    pub fn associativity<'a>(
+        &self,
+        grammar: &'a SyntaxGrammar,
+        inlines: &'a InlinedProductionMap,
+    ) -> Option<Associativity> {
+        let production = self.production(grammar, inlines);
+        let step_index = self.step_index();
+        if step_index == production.steps.len() {
+            production.steps.last().and_then(|s| s.associativity)
+        } else {
+            None
+        }
+    }
 
     pub fn variable_index(&self) -> u32 {
@@ -156,6 +267,14 @@
         }
     }
 
+    pub fn is_final(&self) -> bool {
+        if let ParseItem::Start { step_index: 1 } = self {
+            true
+        } else {
+            false
+        }
+    }
+
     fn step_index_mut(&mut self) -> &mut u32 {
         match self {
             ParseItem::Start { step_index }
@@ -164,8 +283,13 @@
         }
     }
 
-    pub fn with<'a>(&'a self, grammar: &'a SyntaxGrammar, inlines: &'a InlinedProductionMap) -> ParseItemDisplay<'a> {
-        ParseItemDisplay(self, grammar, inlines)
+    pub fn display_with<'a>(
+        &'a self,
+        syntax_grammar: &'a SyntaxGrammar,
+        lexical_grammar: &'a LexicalGrammar,
+        inlines: &'a InlinedProductionMap,
+    ) -> ParseItemDisplay<'a> {
+        ParseItemDisplay(self, syntax_grammar, lexical_grammar, inlines)
     }
 
     pub fn successor(&self) -> ParseItem {
@@ -176,33 +300,107 @@
 }
 
 impl ParseItemSet {
-    pub fn new() -> Self {
-        Self { entries: HashMap::new() }
+    pub fn with<'a>(elements: impl IntoIterator<Item = &'a (ParseItem, LookaheadSet)>) -> Self {
+        let mut result = Self::default();
+        for (item, lookaheads) in elements {
+            result.entries.insert(*item, lookaheads.clone());
+        }
+        result
+    }
+
+    pub fn display_with<'a>(
+        &'a self,
+        syntax_grammar: &'a SyntaxGrammar,
+        lexical_grammar: &'a LexicalGrammar,
+        inlines: &'a InlinedProductionMap,
+    ) -> ParseItemSetDisplay<'a> {
+        ParseItemSetDisplay(self, syntax_grammar, lexical_grammar, inlines)
+    }
+}
+
+impl Default for ParseItemSet {
+    fn default() -> Self {
+        Self {
+            entries: BTreeMap::new(),
+        }
     }
 }
 
 impl<'a> fmt::Display for ParseItemDisplay<'a> {
     fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
+        if let ParseItem::Start { .. } = &self.0 {
+            write!(f, "START →")?;
+        } else {
+            write!(
+                f,
+                "{} →",
+                &self.1.variables[self.0.variable_index() as usize].name
+            )?;
+        }
+
         let step_index = self.0.step_index();
-        let production = self.0.production(self.1, self.2);
+        let production = self.0.production(self.1, self.3);
 
         for (i, step) in production.steps.iter().enumerate() {
-            if i > 0 {
-                write!(f, " ")?;
-            }
             if i == step_index {
-                write!(f, "• ")?;
+                write!(f, " •")?;
             }
-            let name = if step.symbol.is_terminal() {
-                "terminal"
+            write!(f, " ")?;
+            if step.symbol.is_terminal() {
+                if let Some(variable) = self.2.variables.get(step.symbol.index) {
+                    write!(f, "{}", &variable.name)?;
+                } else {
+                    write!(f, "terminal-{}", step.symbol.index)?;
+                }
             } else if step.symbol.is_external() {
-                "external"
+                write!(f, "{}", &self.1.external_tokens[step.symbol.index].name)?;
             } else {
-                "non-terminal"
-            };
+                write!(f, "{}", &self.1.variables[step.symbol.index].name)?;
+            }
+        }
 
-            write!(f, "{}-{}", name, step.symbol.index)?;
+        if production.steps.len() == step_index {
+            write!(f, " •")?;
+        }
+
+        Ok(())
+    }
+}
+
+impl<'a> fmt::Display for LookaheadSetDisplay<'a> {
+    fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
+        write!(f, "[")?;
+        for (i, symbol) in self.0.iter().enumerate() {
+            if i > 0 {
+                write!(f, ", ")?;
+            }
+
+            if symbol == Symbol::end() {
+                write!(f, "EOF")?;
+            } else if symbol.is_terminal() {
+                if let Some(variable) = self.2.variables.get(symbol.index) {
+                    write!(f, "{}", &variable.name)?;
+                } else {
+                    write!(f, "terminal-{}", symbol.index)?;
+                }
+            } else if symbol.is_external() {
+                write!(f, "{}", &self.1.external_tokens[symbol.index].name)?;
+            } else {
+                write!(f, "{}", &self.1.variables[symbol.index].name)?;
+            }
+        }
+        write!(f, "]")?;
+        Ok(())
+    }
+}
+
+impl<'a> fmt::Display for ParseItemSetDisplay<'a> {
+    fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
+        for (item, lookaheads) in self.0.entries.iter() {
+            writeln!(
+                f,
+                "{}\t{}",
+                item.display_with(self.1, self.2, self.3),
+                lookaheads.display_with(self.1, self.2)
+            )?;
         }
         Ok(())
     }
 }
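Worth noting why `entries` switched from `HashMap` to `BTreeMap`: item sets are themselves used as keys in `state_ids_by_item_set`, so two sets containing the same items must compare and hash identically regardless of insertion order. `BTreeMap` guarantees that by keeping entries sorted (hence the new `PartialOrd`/`Ord` derives on `ParseItem` and `Symbol`); std's `HashMap` does not implement `Hash` at all. A standalone illustration:

```rust
use std::collections::hash_map::DefaultHasher;
use std::collections::BTreeMap;
use std::hash::{Hash, Hasher};

fn hash_of<T: Hash>(value: &T) -> u64 {
    let mut hasher = DefaultHasher::new();
    value.hash(&mut hasher);
    hasher.finish()
}

fn main() {
    // Build the "same" map in two different insertion orders.
    let mut a = BTreeMap::new();
    a.insert(1, "x");
    a.insert(2, "y");

    let mut b = BTreeMap::new();
    b.insert(2, "y");
    b.insert(1, "x");

    // BTreeMap iterates in key order, so both equality and hashing are
    // independent of insertion order: exactly what a map key needs.
    assert_eq!(a, b);
    assert_eq!(hash_of(&a), hash_of(&b));
}
```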
diff --git a/src/build_tables/item_set_builder.rs b/src/build_tables/item_set_builder.rs
index 61d45ded..530c1f25 100644
--- a/src/build_tables/item_set_builder.rs
+++ b/src/build_tables/item_set_builder.rs
@@ -20,7 +20,7 @@ pub(crate) struct ParseItemSetBuilder {
     first_sets: HashMap<Symbol, LookaheadSet>,
     last_sets: HashMap<Symbol, LookaheadSet>,
     transitive_closure_additions: Vec<Vec<TransitiveClosureAddition>>,
-    inlined_production_map: InlinedProductionMap,
+    pub inlines: InlinedProductionMap,
 }
 
 fn find_or_push<T: Eq>(vector: &mut Vec<T>, value: T) {
@@ -35,7 +35,7 @@ impl ParseItemSetBuilder {
         first_sets: HashMap::new(),
         last_sets: HashMap::new(),
         transitive_closure_additions: vec![Vec::new(); syntax_grammar.variables.len()],
-        inlined_production_map: InlinedProductionMap::new(syntax_grammar),
+        inlines: InlinedProductionMap::new(syntax_grammar),
     };
 
     // For each grammar symbol, populate the FIRST and LAST sets: the set of
@@ -192,6 +192,10 @@ impl ParseItemSetBuilder {
         let additions_for_non_terminal = &mut result.transitive_closure_additions[i];
         for (variable_index, follow_set_info) in follow_set_info_by_non_terminal {
             let variable = &syntax_grammar.variables[variable_index];
+            let non_terminal = Symbol::non_terminal(variable_index);
+            if syntax_grammar.variables_to_inline.contains(&non_terminal) {
+                continue;
+            }
             for production_index in 0..variable.productions.len() {
                 let item = ParseItem::Normal {
                     variable_index: variable_index as u32,
@@ -199,7 +203,7 @@ impl ParseItemSetBuilder {
                     step_index: 0,
                 };
 
-                if let Some(inlined_items) = result.inlined_production_map.inlined_items(item) {
+                if let Some(inlined_items) = result.inlines.inlined_items(item) {
                     for inlined_item in inlined_items {
                         find_or_push(
                             additions_for_non_terminal,
@@ -227,32 +231,36 @@ impl ParseItemSetBuilder {
 
     pub(crate) fn transitive_closure(
         &mut self,
-        item_set: ParseItemSet,
+        item_set: &ParseItemSet,
         grammar: &SyntaxGrammar,
     ) -> ParseItemSet {
-        let mut result = ParseItemSet::new();
-        for (item, lookaheads) in item_set.entries {
-            if let Some(items) = self.inlined_production_map.inlined_items(item) {
+        let mut result = ParseItemSet::default();
+        for (item, lookaheads) in &item_set.entries {
+            if let Some(items) = self.inlines.inlined_items(*item) {
                 for item in items {
-                    self.add_item(&mut result, item, lookaheads.clone(), grammar);
+                    self.add_item(&mut result, item, lookaheads, grammar);
                 }
             } else {
-                self.add_item(&mut result, item, lookaheads, grammar);
+                self.add_item(&mut result, *item, lookaheads, grammar);
            }
         }
         result
     }
 
+    pub fn first_set(&self, symbol: &Symbol) -> &LookaheadSet {
+        &self.first_sets[symbol]
+    }
+
     fn add_item(
         &self,
         set: &mut ParseItemSet,
         item: ParseItem,
-        lookaheads: LookaheadSet,
+        lookaheads: &LookaheadSet,
         grammar: &SyntaxGrammar,
     ) {
-        if let Some(step) = item.step(grammar, &self.inlined_production_map) {
+        if let Some(step) = item.step(grammar, &self.inlines) {
             if step.symbol.is_non_terminal() {
-                let next_step = item.successor().step(grammar, &self.inlined_production_map);
+                let next_step = item.successor().step(grammar, &self.inlines);
 
                 // Determine which tokens can follow this non-terminal.
                 let following_tokens = if let Some(next_step) = next_step {
@@ -274,6 +282,6 @@ impl ParseItemSetBuilder {
             }
         }
 
-        set.entries.insert(item, lookaheads);
+        set.entries.insert(item, lookaheads.clone());
     }
 }
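The FIRST and LAST sets that `ParseItemSetBuilder::new` precomputes above drive the closure step: when an item's dot sits before a non-terminal, the lookaheads added for that non-terminal's productions come from the FIRST set of whatever follows. A self-contained sketch of the standard FIRST-set fixpoint for a toy grammar without empty productions (names are illustrative, not the crate's API):

```rust
use std::collections::{HashMap, HashSet};

// Toy grammar: E -> T '+' E | T,  T -> '(' E ')' | 'n'
fn main() {
    let rules: Vec<(&str, Vec<&str>)> = vec![
        ("E", vec!["T", "+", "E"]),
        ("E", vec!["T"]),
        ("T", vec!["(", "E", ")"]),
        ("T", vec!["n"]),
    ];
    let non_terminals: HashSet<&str> = rules.iter().map(|(lhs, _)| *lhs).collect();

    // Iterate to a fixed point: FIRST(lhs) grows with the FIRST set of each
    // production's leading symbol until nothing changes.
    let mut first: HashMap<&str, HashSet<&str>> = HashMap::new();
    let mut changed = true;
    while changed {
        changed = false;
        for (lhs, steps) in &rules {
            let additions: HashSet<&str> = match steps.first() {
                Some(s) if non_terminals.contains(s) => {
                    first.get(s).cloned().unwrap_or_default()
                }
                Some(s) => std::iter::once(*s).collect(),
                None => HashSet::new(), // empty productions elided in this sketch
            };
            let entry = first.entry(*lhs).or_default();
            for t in additions {
                changed |= entry.insert(t);
            }
        }
    }

    assert_eq!(first["T"], ["(", "n"].iter().copied().collect());
    assert_eq!(first["E"], ["(", "n"].iter().copied().collect());
}
```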
diff --git a/src/build_tables/mod.rs b/src/build_tables/mod.rs
index 01d9219d..091c5486 100644
--- a/src/build_tables/mod.rs
+++ b/src/build_tables/mod.rs
@@ -1,37 +1,611 @@
-mod item;
 mod inline_variables;
 mod item;
 mod item_set_builder;
 
-use std::collections::{HashMap, VecDeque};
-use crate::grammars::{SyntaxGrammar, LexicalGrammar};
-use crate::tables::{ParseTable, LexTable, ParseStateId};
-use crate::rules::{AliasMap, Symbol};
-use crate::error::Result;
-use self::item::ParseItemSet;
+use self::item::{LookaheadSet, ParseItem, ParseItemSet};
+use self::item_set_builder::ParseItemSetBuilder;
+use crate::error::{Error, Result};
+use crate::grammars::{LexicalGrammar, SyntaxGrammar, VariableType};
+use crate::rules::{AliasMap, Associativity, Symbol, SymbolType};
+use crate::tables::{
+    AliasSequenceId, LexTable, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry,
+};
+use core::ops::Range;
+use std::collections::hash_map::Entry;
+use std::collections::{HashMap, HashSet, VecDeque};
+use std::fmt::Write;
+
+#[derive(Clone)]
+struct AuxiliarySymbolInfo {
+    auxiliary_symbol: Symbol,
+    parent_symbols: Vec<Symbol>,
+}
 
 type SymbolSequence = Vec<Symbol>;
+type AuxiliarySymbolSequence = Vec<AuxiliarySymbolInfo>;
 
 struct ParseStateQueueEntry {
     preceding_symbols: SymbolSequence,
-    item_set: ParseItemSet,
+    preceding_auxiliary_symbols: AuxiliarySymbolSequence,
     state_id: ParseStateId,
 }
 
 struct ParseTableBuilder<'a> {
+    item_set_builder: ParseItemSetBuilder,
     syntax_grammar: &'a SyntaxGrammar,
     lexical_grammar: &'a LexicalGrammar,
     simple_aliases: &'a AliasMap,
     state_ids_by_item_set: HashMap<ParseItemSet, ParseStateId>,
-    item_sets_by_state_id: Vec<&'a ParseItemSet>,
+    item_sets_by_state_id: Vec<ParseItemSet>,
     parse_state_queue: VecDeque<ParseStateQueueEntry>,
     parse_table: ParseTable,
 }
 
+impl<'a> ParseTableBuilder<'a> {
+    fn build(mut self) -> Result<(ParseTable, LexTable, LexTable, Option<Symbol>)> {
+        // Ensure that the empty rename sequence has index 0.
+        self.parse_table.alias_sequences.push(Vec::new());
+
+        // Ensure that the error state has index 0.
+        let _error_state_id =
+            self.add_parse_state(&Vec::new(), &Vec::new(), ParseItemSet::default());
+
+        self.add_parse_state(
+            &Vec::new(),
+            &Vec::new(),
+            ParseItemSet::with(&[(ParseItem::start(), LookaheadSet::with(&[Symbol::end()]))]),
+        );
+
+        self.process_parse_state_queue()?;
+        self.populate_used_symbols();
+
+        Err(Error::grammar("oh no"))
+    }
+
+    fn add_parse_state(
+        &mut self,
+        preceding_symbols: &SymbolSequence,
+        preceding_auxiliary_symbols: &AuxiliarySymbolSequence,
+        item_set: ParseItemSet,
+    ) -> ParseStateId {
+        match self.state_ids_by_item_set.entry(item_set) {
+            Entry::Occupied(o) => {
+                // eprintln!("Item set already processed at state {}", *o.get());
+                *o.get()
+            }
+            Entry::Vacant(v) => {
+                // eprintln!("Item set not yet processed");
+                let state_id = self.parse_table.states.len();
+                self.item_sets_by_state_id.push(v.key().clone());
+                self.parse_table.states.push(ParseState {
+                    terminal_entries: HashMap::new(),
+                    nonterminal_entries: HashMap::new(),
+                });
+                self.parse_state_queue.push_back(ParseStateQueueEntry {
+                    state_id,
+                    preceding_symbols: preceding_symbols.clone(),
+                    preceding_auxiliary_symbols: preceding_auxiliary_symbols.clone(),
+                });
+                v.insert(state_id);
+                state_id
+            }
+        }
+    }
+
+    fn process_parse_state_queue(&mut self) -> Result<()> {
+        while let Some(entry) = self.parse_state_queue.pop_front() {
+            println!(
+                "ITEM SET {}:\n{}",
+                entry.state_id,
+                self.item_sets_by_state_id[entry.state_id].display_with(
+                    &self.syntax_grammar,
+                    &self.lexical_grammar,
+                    &self.item_set_builder.inlines
+                )
+            );
+
+            let item_set = self.item_set_builder.transitive_closure(
+                &self.item_sets_by_state_id[entry.state_id],
+                self.syntax_grammar,
+            );
+
+            // println!("TRANSITIVE CLOSURE:");
+            // for item in item_set.entries.keys() {
+            //     println!("{}", item.display_with(&self.syntax_grammar, &self.lexical_grammar, &self.item_set_builder.inlines));
+            // }
+            // println!("");
+
+            self.add_actions(
+                entry.preceding_symbols,
+                entry.preceding_auxiliary_symbols,
+                item_set,
+                entry.state_id,
+            )?;
+        }
+        Ok(())
+    }
+
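+    // Compute the actions for a single parse state. Each item whose dot is
+    // before a symbol contributes to a successor item set (a SHIFT or GOTO
+    // edge); each completed item contributes a REDUCE (or ACCEPT) action for
+    // every symbol in its lookahead set. Lookaheads that end up with more
+    // than one action are recorded and resolved in `handle_conflict`.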
+    fn add_actions(
+        &mut self,
+        mut preceding_symbols: SymbolSequence,
+        mut preceding_auxiliary_symbols: Vec<AuxiliarySymbolInfo>,
+        item_set: ParseItemSet,
+        state_id: ParseStateId,
+    ) -> Result<()> {
+        let mut terminal_successors = HashMap::new();
+        let mut non_terminal_successors = HashMap::new();
+        let mut lookaheads_with_conflicts = HashSet::new();
+
+        for (item, lookaheads) in &item_set.entries {
+            if let Some(next_symbol) =
+                item.symbol(self.syntax_grammar, &self.item_set_builder.inlines)
+            {
+                let successor = item.successor();
+                if next_symbol.is_non_terminal() {
+                    // Keep track of where auxiliary non-terminals (repeat symbols) are
+                    // used within visible symbols. This information may be needed later
+                    // for conflict resolution.
+                    if self.syntax_grammar.variables[next_symbol.index].is_auxiliary() {
+                        preceding_auxiliary_symbols
+                            .push(self.get_auxiliary_node_info(&item_set, next_symbol));
+                    }
+
+                    non_terminal_successors
+                        .entry(next_symbol)
+                        .or_insert_with(|| ParseItemSet::default())
+                        .entries
+                        .entry(successor)
+                        .or_insert_with(|| LookaheadSet::new())
+                        .insert_all(lookaheads);
+                } else {
+                    terminal_successors
+                        .entry(next_symbol)
+                        .or_insert_with(|| ParseItemSet::default())
+                        .entries
+                        .entry(successor)
+                        .or_insert_with(|| LookaheadSet::new())
+                        .insert_all(lookaheads);
+                }
+            } else {
+                let action = if item.is_final() {
+                    ParseAction::Accept
+                } else {
+                    let production =
+                        item.production(&self.syntax_grammar, &self.item_set_builder.inlines);
+                    ParseAction::Reduce {
+                        symbol: Symbol::non_terminal(item.variable_index() as usize),
+                        child_count: item.step_index(),
+                        precedence: production.last_precedence(),
+                        associativity: production.last_associativity(),
+                        dynamic_precedence: production.dynamic_precedence,
+                        alias_sequence_id: self.get_alias_sequence_id(item),
+                    }
+                };
+
+                for lookahead in lookaheads.iter() {
+                    let entry = self.parse_table.states[state_id]
+                        .terminal_entries
+                        .entry(lookahead);
+                    let entry = entry.or_insert_with(|| ParseTableEntry::new());
+                    if entry.actions.is_empty() {
+                        entry.actions.push(action);
+                    } else if action.precedence() > entry.actions[0].precedence() {
+                        entry.actions.clear();
+                        entry.actions.push(action);
+                        lookaheads_with_conflicts.remove(&lookahead);
+                    } else if action.precedence() == entry.actions[0].precedence() {
+                        entry.actions.push(action);
+                        lookaheads_with_conflicts.insert(lookahead);
+                    }
+                }
+            }
+        }
+
+        for (symbol, next_item_set) in terminal_successors {
+            preceding_symbols.push(symbol);
+            let next_state_id = self.add_parse_state(
+                &preceding_symbols,
+                &preceding_auxiliary_symbols,
+                next_item_set,
+            );
+            preceding_symbols.pop();
+
+            let entry = self.parse_table.states[state_id]
+                .terminal_entries
+                .entry(symbol);
+            if let Entry::Occupied(e) = &entry {
+                if !e.get().actions.is_empty() {
+                    lookaheads_with_conflicts.insert(symbol);
+                }
+            }
+
+            entry
+                .or_insert_with(|| ParseTableEntry::new())
+                .actions
+                .push(ParseAction::Shift {
+                    state: next_state_id,
+                    is_repetition: false,
+                });
+        }
+
+        for (symbol, next_item_set) in non_terminal_successors {
+            preceding_symbols.push(symbol);
+            let next_state_id = self.add_parse_state(
+                &preceding_symbols,
+                &preceding_auxiliary_symbols,
+                next_item_set,
+            );
+            preceding_symbols.pop();
+            self.parse_table.states[state_id]
+                .nonterminal_entries
+                .insert(symbol, next_state_id);
+        }
+
+        for symbol in lookaheads_with_conflicts {
+            self.handle_conflict(
+                &item_set,
+                state_id,
+                &preceding_symbols,
+                &preceding_auxiliary_symbols,
+                symbol,
+            )?;
+        }
+
+        Ok(())
+    }
+
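+    // Try to resolve a shift/reduce or reduce/reduce conflict on a single
+    // lookahead token using the precedence and associativity declared in the
+    // grammar. If the conflict survives resolution and was not explicitly
+    // expected, report an error listing every conflicting interpretation.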
+    fn handle_conflict(
+        &mut self,
+        item_set: &ParseItemSet,
+        state_id: ParseStateId,
+        preceding_symbols: &SymbolSequence,
+        preceding_auxiliary_symbols: &Vec<AuxiliarySymbolInfo>,
+        conflicting_lookahead: Symbol,
+    ) -> Result<()> {
+        let entry = self.parse_table.states[state_id]
+            .terminal_entries
+            .get_mut(&conflicting_lookahead)
+            .unwrap();
+
+        // Determine which items in the set conflict with each other, and the
+        // precedences associated with SHIFT vs REDUCE actions. There won't
+        // be multiple REDUCE actions with different precedences; that is
+        // sorted out ahead of time in `add_actions`. But there can still be
+        // REDUCE-REDUCE conflicts where all actions have the *same*
+        // precedence, and there can still be SHIFT/REDUCE conflicts.
+        let reduce_precedence = entry.actions[0].precedence();
+        let mut considered_associativity = false;
+        let mut shift_precedence: Option<Range<i32>> = None;
+        let mut conflicting_items = HashSet::new();
+        for (item, lookaheads) in &item_set.entries {
+            let production = item.production(&self.syntax_grammar, &self.item_set_builder.inlines);
+            let step_index = item.step_index();
+            if let Some(step) = production.steps.get(step_index) {
+                if step_index > 0 {
+                    if self
+                        .item_set_builder
+                        .first_set(&step.symbol)
+                        .contains(&conflicting_lookahead)
+                    {
+                        conflicting_items.insert(item);
+                        let precedence = production.steps[step_index - 1].precedence;
+                        if let Some(range) = &mut shift_precedence {
+                            if precedence < range.start {
+                                range.start = precedence;
+                            } else if precedence > range.end {
+                                range.end = precedence;
+                            }
+                        } else {
+                            shift_precedence = Some(precedence..precedence);
+                        }
+                    }
+                }
+            } else if lookaheads.contains(&conflicting_lookahead) {
+                conflicting_items.insert(item);
+            }
+        }
+
+        if let ParseAction::Shift { is_repetition, .. } = entry.actions.last_mut().unwrap() {
+            let shift_precedence = shift_precedence.unwrap_or(0..0);
+
+            // If all of the items in the conflict have the same parent symbol,
+            // and that parent symbol is auxiliary, then this is just the intentional
+            // ambiguity associated with a repeat rule. Resolve that class of ambiguity
+            // by leaving it in the parse table, but marking the SHIFT action with
+            // an `is_repetition` flag.
+            let conflicting_variable_index =
+                conflicting_items.iter().next().unwrap().variable_index();
+            if self.syntax_grammar.variables[conflicting_variable_index as usize].is_auxiliary() {
+                if conflicting_items
+                    .iter()
+                    .all(|item| item.variable_index() == conflicting_variable_index)
+                {
+                    *is_repetition = true;
+                    return Ok(());
+                }
+            }
+
+            // If the SHIFT action has higher precedence, remove all the REDUCE actions.
+            if shift_precedence.start > reduce_precedence
+                || (shift_precedence.start == reduce_precedence
+                    && shift_precedence.end > reduce_precedence)
+            {
+                entry.actions.drain(0..entry.actions.len() - 1);
+            }
+            // If the REDUCE actions have higher precedence, remove the SHIFT action.
+            else if shift_precedence.end < reduce_precedence
+                || (shift_precedence.end == reduce_precedence
+                    && shift_precedence.start < reduce_precedence)
+            {
+                entry.actions.pop();
+                conflicting_items.retain(|item| {
+                    item.step(&self.syntax_grammar, &self.item_set_builder.inlines)
+                        .is_none()
+                });
+            }
+            // If the SHIFT and REDUCE actions have the same precedence, consider
+            // the REDUCE actions' associativity.
+            else if shift_precedence == (reduce_precedence..reduce_precedence) {
+                considered_associativity = true;
+                let mut has_left = false;
+                let mut has_right = false;
+                let mut has_non = false;
+                for action in &entry.actions {
+                    if let ParseAction::Reduce { associativity, .. } = action {
+                        match associativity {
+                            Some(Associativity::Left) => has_left = true,
+                            Some(Associativity::Right) => has_right = true,
+                            None => has_non = true,
+                        }
+                    }
+                }
+
+                // If all reduce actions are left associative, remove the SHIFT action.
+                // If all reduce actions are right associative, remove the REDUCE actions.
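+                // For example, given `expr: $ => prec.left(seq($.expr, '-', $.expr))`,
+                // the input `a - b - c` reaches this case at `a - b • - c`: left
+                // associativity reduces first, yielding `(a - b) - c`, so the SHIFT
+                // is dropped. Right associativity keeps the SHIFT instead, which
+                // eventually yields `a - (b - c)`.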
+                match (has_left, has_non, has_right) {
+                    (true, false, false) => {
+                        entry.actions.pop();
+                        conflicting_items.retain(|item| {
+                            item.step(&self.syntax_grammar, &self.item_set_builder.inlines)
+                                .is_none()
+                        });
+                    }
+                    (false, false, true) => {
+                        entry.actions.drain(0..entry.actions.len() - 1);
+                    }
+                    _ => {}
+                }
+            }
+        }
+
+        // If all of the actions but one have been eliminated, then there's no problem.
+        let entry = self.parse_table.states[state_id]
+            .terminal_entries
+            .get_mut(&conflicting_lookahead)
+            .unwrap();
+        if entry.actions.len() == 1 {
+            return Ok(());
+        }
+
+        // Determine the set of parent symbols involved in this conflict.
+        let mut actual_conflict = Vec::new();
+        for item in &conflicting_items {
+            let symbol = Symbol::non_terminal(item.variable_index() as usize);
+            if self.syntax_grammar.variables[symbol.index].is_auxiliary() {
+                actual_conflict.extend(
+                    preceding_auxiliary_symbols
+                        .iter()
+                        .rev()
+                        .find_map(|info| {
+                            if info.auxiliary_symbol == symbol {
+                                Some(&info.parent_symbols)
+                            } else {
+                                None
+                            }
+                        })
+                        .unwrap()
+                        .iter(),
+                );
+            } else {
+                actual_conflict.push(symbol);
+            }
+        }
+        actual_conflict.sort_unstable();
+        actual_conflict.dedup();
+
+        // If this set of symbols has been whitelisted, then there's no error.
+        if self
+            .syntax_grammar
+            .expected_conflicts
+            .contains(&actual_conflict)
+        {
+            return Ok(());
+        }
+
+        let mut msg = "Unresolved conflict for symbol sequence:\n\n".to_string();
+        for symbol in preceding_symbols {
+            write!(&mut msg, " {}", self.symbol_name(symbol)).unwrap();
+        }
+
+        write!(
+            &mut msg,
+            " • {} …\n\n",
+            self.symbol_name(&conflicting_lookahead)
+        )
+        .unwrap();
+        write!(&mut msg, "Possible interpretations:\n").unwrap();
+        for (i, item) in conflicting_items.iter().enumerate() {
+            write!(&mut msg, "\n {}:", i).unwrap();
+
+            for preceding_symbol in preceding_symbols
+                .iter()
+                .take(preceding_symbols.len() - item.step_index())
+            {
+                write!(&mut msg, " {}", self.symbol_name(preceding_symbol)).unwrap();
+            }
+
+            write!(
+                &mut msg,
+                " ({}",
+                &self.syntax_grammar.variables[item.variable_index() as usize].name
+            )
+            .unwrap();
+
+            for (j, step) in item
+                .production(&self.syntax_grammar, &self.item_set_builder.inlines)
+                .steps
+                .iter()
+                .enumerate()
+            {
+                if j == item.step_index() {
+                    write!(&mut msg, " •").unwrap();
+                }
+                write!(&mut msg, " {}", self.symbol_name(&step.symbol)).unwrap();
+            }
+
+            write!(&mut msg, ")").unwrap();
+
+            if item
+                .step(&self.syntax_grammar, &self.item_set_builder.inlines)
+                .is_none()
+            {
+                write!(
+                    &mut msg,
+                    " • {}",
+                    self.symbol_name(&conflicting_lookahead)
+                )
+                .unwrap();
+            }
+
+            let precedence = item.precedence(&self.syntax_grammar, &self.item_set_builder.inlines);
+            let associativity =
+                item.associativity(&self.syntax_grammar, &self.item_set_builder.inlines);
+            if precedence != 0 || associativity.is_some() {
+                write!(
+                    &mut msg,
+                    " (precedence: {}, associativity: {:?})",
+                    precedence, associativity
+                )
+                .unwrap();
+            }
+        }
+
+        // TODO - generate suggested resolutions
+
+        Err(Error::ConflictError(msg))
+    }
+
+    fn get_auxiliary_node_info(
+        &self,
+        item_set: &ParseItemSet,
+        symbol: Symbol,
+    ) -> AuxiliarySymbolInfo {
+        let parent_symbols = item_set
+            .entries
+            .keys()
+            .filter_map(|item| {
+                if item.symbol(&self.syntax_grammar, &self.item_set_builder.inlines) == Some(symbol)
+                {
+                    // Record the variable that uses the auxiliary symbol; returning
+                    // None on both branches here would leave every parent list empty.
+                    Some(Symbol::non_terminal(item.variable_index() as usize))
+                } else {
+                    None
+                }
+            })
+            .collect();
+        AuxiliarySymbolInfo {
+            auxiliary_symbol: symbol,
+            parent_symbols,
+        }
+    }
+
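+    // Record which terminal, non-terminal, and external symbols actually
+    // appear in some parse state, so that later passes (symbol ids, rendered
+    // tables) only have to deal with reachable symbols.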
+    fn populate_used_symbols(&mut self) {
+        let mut terminal_usages = vec![false; self.lexical_grammar.variables.len()];
+        let mut non_terminal_usages = vec![false; self.syntax_grammar.variables.len()];
+        let mut external_usages = vec![false; self.syntax_grammar.external_tokens.len()];
+        for state in &self.parse_table.states {
+            for symbol in state.terminal_entries.keys() {
+                match symbol.kind {
+                    SymbolType::Terminal => terminal_usages[symbol.index] = true,
+                    SymbolType::External => external_usages[symbol.index] = true,
+                    _ => {}
+                }
+            }
+            for symbol in state.nonterminal_entries.keys() {
+                non_terminal_usages[symbol.index] = true;
+            }
+        }
+        for (i, value) in terminal_usages.into_iter().enumerate() {
+            if value {
+                self.parse_table.symbols.push(Symbol::terminal(i));
+            }
+        }
+        for (i, value) in non_terminal_usages.into_iter().enumerate() {
+            if value {
+                self.parse_table.symbols.push(Symbol::non_terminal(i));
+            }
+        }
+        for (i, value) in external_usages.into_iter().enumerate() {
+            if value {
+                self.parse_table.symbols.push(Symbol::external(i));
+            }
+        }
+    }
+
+    fn get_alias_sequence_id(&mut self, item: &ParseItem) -> AliasSequenceId {
+        let production = item.production(&self.syntax_grammar, &self.item_set_builder.inlines);
+        let alias_sequence = production.steps.iter().map(|s| s.alias.clone()).collect();
+        if let Some(index) = self
+            .parse_table
+            .alias_sequences
+            .iter()
+            .position(|seq| *seq == alias_sequence)
+        {
+            index
+        } else {
+            self.parse_table.alias_sequences.push(alias_sequence);
+            self.parse_table.alias_sequences.len() - 1
+        }
+    }
+
+    fn symbol_name(&self, symbol: &Symbol) -> String {
+        match symbol.kind {
+            SymbolType::End => "EOF".to_string(),
+            SymbolType::External => self.syntax_grammar.external_tokens[symbol.index]
+                .name
+                .clone(),
+            SymbolType::NonTerminal => self.syntax_grammar.variables[symbol.index].name.clone(),
+            SymbolType::Terminal => {
+                let variable = &self.lexical_grammar.variables[symbol.index];
+                if variable.kind == VariableType::Named {
+                    variable.name.clone()
+                } else {
+                    format!("\"{}\"", &variable.name)
+                }
+            }
+        }
+    }
+}
+
 pub(crate) fn build_tables(
     syntax_grammar: &SyntaxGrammar,
     lexical_grammar: &LexicalGrammar,
-    simple_aliases: &AliasMap
+    simple_aliases: &AliasMap,
 ) -> Result<(ParseTable, LexTable, LexTable, Option<Symbol>)> {
-    unimplemented!();
+    ParseTableBuilder {
+        syntax_grammar,
+        lexical_grammar,
+        simple_aliases,
+        item_set_builder: ParseItemSetBuilder::new(syntax_grammar, lexical_grammar),
+        state_ids_by_item_set: HashMap::new(),
+        item_sets_by_state_id: Vec::new(),
+        parse_state_queue: VecDeque::new(),
+        parse_table: ParseTable {
+            states: Vec::new(),
+            alias_sequences: Vec::new(),
+            symbols: Vec::new(),
+        },
+    }
+    .build()
 }
diff --git a/src/error.rs b/src/error.rs
index 49064c22..b03efa93 100644
--- a/src/error.rs
+++ b/src/error.rs
@@ -3,6 +3,7 @@ pub enum Error {
     GrammarError(String),
     SymbolError(String),
     RegexError(String),
+    ConflictError(String),
 }
 
 pub type Result<T> = std::result::Result<T, Error>;
diff --git a/src/generate.rs b/src/generate.rs
index 4507fb6f..dc3d5176 100644
--- a/src/generate.rs
+++ b/src/generate.rs
@@ -4,8 +4,8 @@ use crate::prepare_grammar::prepare_grammar;
 use crate::build_tables::build_tables;
 use crate::render::render_c_code;
 
-pub fn generate_parser_for_grammar(input: String) -> Result<String> {
-    let input_grammar = parse_grammar(&input)?;
+pub fn generate_parser_for_grammar(input: &str) -> Result<String> {
+    let input_grammar = parse_grammar(input)?;
     let (syntax_grammar, lexical_grammar, simple_aliases) = prepare_grammar(&input_grammar)?;
     let (parse_table, main_lex_table, keyword_lex_table, keyword_capture_token) =
         build_tables(
             &syntax_grammar,
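The `last_precedence`/`last_associativity` helpers added to `Production` in grammars.rs below encode the usual LR convention: a REDUCE action inherits the precedence and associativity attached to the production's final step, and an empty production reduces with the neutral precedence 0. A standalone sketch of that convention (toy types, not the crate's):

```rust
#[derive(Clone, Copy, Debug, PartialEq)]
enum Assoc {
    Left,
    Right,
}

struct Step {
    precedence: i32,
    associativity: Option<Assoc>,
}

struct Production {
    steps: Vec<Step>,
}

impl Production {
    // An empty production reduces with the neutral precedence 0.
    fn last_precedence(&self) -> i32 {
        self.steps.last().map(|s| s.precedence).unwrap_or(0)
    }

    fn last_associativity(&self) -> Option<Assoc> {
        self.steps.last().and_then(|s| s.associativity)
    }
}

fn main() {
    let production = Production {
        steps: vec![
            Step { precedence: 0, associativity: None },
            Step { precedence: 1, associativity: Some(Assoc::Left) },
        ],
    };
    assert_eq!(production.last_precedence(), 1);
    assert_eq!(production.last_associativity(), Some(Assoc::Left));
}
```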
diff --git a/src/grammars.rs b/src/grammars.rs
index 8abdad24..7512ec03 100644
--- a/src/grammars.rs
+++ b/src/grammars.rs
@@ -38,7 +38,7 @@ pub(crate) struct LexicalVariable {
     pub start_state: u32,
 }
 
-#[derive(Debug, PartialEq, Eq)]
+#[derive(Debug, Default, PartialEq, Eq)]
 pub(crate) struct LexicalGrammar {
     pub nfa: Nfa,
     pub variables: Vec<LexicalVariable>,
@@ -112,6 +112,14 @@ impl Production {
     pub fn first_symbol(&self) -> Option<Symbol> {
         self.steps.first().map(|s| s.symbol.clone())
     }
+
+    pub fn last_precedence(&self) -> i32 {
+        self.steps.last().map(|s| s.precedence).unwrap_or(0)
+    }
+
+    pub fn last_associativity(&self) -> Option<Associativity> {
+        self.steps.last().and_then(|s| s.associativity)
+    }
 }
 
 impl Default for Production {
@@ -137,3 +145,9 @@ impl Variable {
         Self { name: name.to_string(), kind: VariableType::Anonymous, rule }
     }
 }
+
+impl SyntaxVariable {
+    pub fn is_auxiliary(&self) -> bool {
+        self.kind == VariableType::Auxiliary
+    }
+}
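The JS DSL added below exists to turn a `grammar.js` file into a JSON rule tree that the Rust side (`parse_grammar`) consumes. For instance, `seq('a', choice('b', 'c'))` normalizes to nested objects tagged with a `type` field. Sketched here with `serde_json`'s `json!` macro, which is already a dependency of this crate:

```rust
#[macro_use]
extern crate serde_json;

fn main() {
    // What dsl.js's normalize() produces for: seq("a", choice("b", "c"))
    let rule = json!({
        "type": "SEQ",
        "members": [
            { "type": "STRING", "value": "a" },
            {
                "type": "CHOICE",
                "members": [
                    { "type": "STRING", "value": "b" },
                    { "type": "STRING", "value": "c" },
                ]
            }
        ]
    });
    println!("{}", serde_json::to_string_pretty(&rule).unwrap());
}
```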
diff --git a/src/js/dsl.js b/src/js/dsl.js
new file mode 100644
index 00000000..ba3962cd
--- /dev/null
+++ b/src/js/dsl.js
@@ -0,0 +1,334 @@
+const UNICODE_ESCAPE_PATTERN = /\\u([0-9a-f]{4})/gi;
+const DELIMITER_ESCAPE_PATTERN = /\\\//g;
+
+function alias(rule, value) {
+  const result = {
+    type: "ALIAS",
+    content: normalize(rule),
+    named: false,
+    value: null
+  };
+
+  switch (value.constructor) {
+    case String:
+      result.named = false;
+      result.value = value;
+      return result;
+    case ReferenceError:
+      result.named = true;
+      result.value = value.symbol.name;
+      return result;
+    case Object:
+      if (typeof value.type === 'string' && value.type === 'SYMBOL') {
+        result.named = true;
+        result.value = value.name;
+        return result;
+      }
+  }
+
+  throw new Error('Invalid alias value ' + value);
+}
+
+function blank() {
+  return {
+    type: "BLANK"
+  };
+}
+
+function choice(...elements) {
+  return {
+    type: "CHOICE",
+    members: elements.map(normalize)
+  };
+}
+
+function optional(value) {
+  return choice(value, blank());
+}
+
+function prec(number, rule) {
+  if (rule == null) {
+    rule = number;
+    number = 0;
+  }
+
+  return {
+    type: "PREC",
+    value: number,
+    content: normalize(rule)
+  };
+}
+
+prec.left = function(number, rule) {
+  if (rule == null) {
+    rule = number;
+    number = 0;
+  }
+
+  return {
+    type: "PREC_LEFT",
+    value: number,
+    content: normalize(rule)
+  };
+}
+
+prec.right = function(number, rule) {
+  if (rule == null) {
+    rule = number;
+    number = 0;
+  }
+
+  return {
+    type: "PREC_RIGHT",
+    value: number,
+    content: normalize(rule)
+  };
+}
+
+prec.dynamic = function(number, rule) {
+  return {
+    type: "PREC_DYNAMIC",
+    value: number,
+    content: normalize(rule)
+  };
+}
+
+function repeat(rule) {
+  return {
+    type: "REPEAT",
+    content: normalize(rule)
+  };
+}
+
+function repeat1(rule) {
+  return {
+    type: "REPEAT1",
+    content: normalize(rule)
+  };
+}
+
+function seq(...elements) {
+  return {
+    type: "SEQ",
+    members: elements.map(normalize)
+  };
+}
+
+function sym(name) {
+  return {
+    type: "SYMBOL",
+    name: name
+  };
+}
+
+function token(value) {
+  return {
+    type: "TOKEN",
+    content: normalize(value)
+  };
+}
+
+token.immediate = function(value) {
+  return {
+    type: "IMMEDIATE_TOKEN",
+    content: normalize(value)
+  };
+}
+
+function normalize(value) {
+  if (typeof value == "undefined")
+    throw new Error("Undefined symbol");
+
+  switch (value.constructor) {
+    case String:
+      return {
+        type: 'STRING',
+        value
+      };
+    case RegExp:
+      return {
+        type: 'PATTERN',
+        value: value.source
+          .replace(
+            DELIMITER_ESCAPE_PATTERN,
+            '/'
+          )
+          .replace(
+            UNICODE_ESCAPE_PATTERN,
+            (match, group) => String.fromCharCode(parseInt(group, 16))
+          )
+      };
+    case ReferenceError:
+      throw value;
+    default:
+      if (typeof value.type === 'string') {
+        return value;
+      } else {
+        throw new TypeError("Invalid rule: " + value.toString());
+      }
+  }
+}
+
+function RuleBuilder(ruleMap) {
+  return new Proxy({}, {
+    get(target, propertyName) {
+      const symbol = {
+        type: 'SYMBOL',
+        name: propertyName
+      };
+
+      if (!ruleMap || ruleMap.hasOwnProperty(propertyName)) {
+        return symbol;
+      } else {
+        const error = new ReferenceError(`Undefined symbol '${propertyName}'`);
+        error.symbol = symbol;
+        return error;
+      }
+    }
+  })
+}
+
+function grammar(baseGrammar, options) {
+  if (!options) {
+    options = baseGrammar;
+    baseGrammar = {
+      name: null,
+      rules: {},
+      extras: [normalize(/\s/)],
+      conflicts: [],
+      externals: [],
+      inline: []
+    };
+  }
+
+  let externals = baseGrammar.externals;
+  if (options.externals) {
+    if (typeof options.externals !== "function") {
+      throw new Error("Grammar's 'externals' property must be a function.");
+    }
+
+    const externalsRuleBuilder = RuleBuilder(null);
+    const externalRules = options.externals.call(externalsRuleBuilder, externalsRuleBuilder, baseGrammar.externals);
+
+    if (!Array.isArray(externalRules)) {
+      throw new Error("Grammar's 'externals' property must return an array of rules.");
+    }
+
+    externals = externalRules.map(normalize);
+  }
+
+  const ruleMap = {};
+  for (const key in options.rules) {
+    ruleMap[key] = true;
+  }
+  for (const key in baseGrammar.rules) {
+    ruleMap[key] = true;
+  }
+  for (const external of externals) {
+    if (typeof external.name === 'string') {
+      ruleMap[external.name] = true;
+    }
+  }
+
+  const ruleBuilder = RuleBuilder(ruleMap);
+
+  const name = options.name;
+  if (typeof name !== "string") {
+    throw new Error("Grammar's 'name' property must be a string.");
+  }
+
+  if (!/^[a-zA-Z_]\w*$/.test(name)) {
+    throw new Error("Grammar's 'name' property must not start with a digit and cannot contain non-word characters.");
+  }
+
+  let rules = Object.assign({}, baseGrammar.rules);
+  if (options.rules) {
+    if (typeof options.rules !== "object") {
+      throw new Error("Grammar's 'rules' property must be an object.");
+    }
+
+    for (const ruleName in options.rules) {
+      const ruleFn = options.rules[ruleName];
+      if (typeof ruleFn !== "function") {
+        throw new Error("Grammar rules must all be functions. '" + ruleName + "' rule is not.");
+      }
+      rules[ruleName] = normalize(ruleFn.call(ruleBuilder, ruleBuilder, baseGrammar.rules[ruleName]));
+    }
+  }
+
+  let extras = baseGrammar.extras.slice();
+  if (options.extras) {
+    if (typeof options.extras !== "function") {
+      throw new Error("Grammar's 'extras' property must be a function.");
+    }
+
+    extras = options.extras
+      .call(ruleBuilder, ruleBuilder, baseGrammar.extras)
+      .map(normalize);
+  }
+
+  let word = baseGrammar.word;
+  if (options.word) {
+    word = options.word.call(ruleBuilder, ruleBuilder).name;
+    if (typeof word != 'string') {
+      throw new Error("Grammar's 'word' property must be a named rule.");
+    }
+  }
+
+  let conflicts = baseGrammar.conflicts;
+  if (options.conflicts) {
+    if (typeof options.conflicts !== "function") {
+      throw new Error("Grammar's 'conflicts' property must be a function.");
+    }
+
+    const baseConflictRules = baseGrammar.conflicts.map(conflict => conflict.map(sym));
+    const conflictRules = options.conflicts.call(ruleBuilder, ruleBuilder, baseConflictRules);
+
+    if (!Array.isArray(conflictRules)) {
+      throw new Error("Grammar's conflicts must be an array of arrays of rules.");
+    }
+
+    conflicts = conflictRules.map(conflictSet => {
+      if (!Array.isArray(conflictSet)) {
+        throw new Error("Grammar's conflicts must be an array of arrays of rules.");
+      }
+
+      return conflictSet.map(symbol => symbol.name);
+    });
+  }
+
+  let inline = baseGrammar.inline;
+  if (options.inline) {
+    if (typeof options.inline !== "function") {
+      throw new Error("Grammar's 'inline' property must be a function.");
+    }
+
+    const baseInlineRules = baseGrammar.inline.map(sym);
+    const inlineRules = options.inline.call(ruleBuilder, ruleBuilder, baseInlineRules);
+
+    if (!Array.isArray(inlineRules)) {
+      throw new Error("Grammar's inline must be an array of rules.");
+    }
+
+    inline = inlineRules.map(symbol => symbol.name);
+  }
+
+  if (Object.keys(rules).length == 0) {
+    throw new Error("Grammar must have at least one rule.");
+  }
+
+  return {name, word, rules, extras, conflicts, externals, inline};
+}
+
+global.alias = alias;
+global.blank = blank;
+global.choice = choice;
+global.optional = optional;
+global.prec = prec;
+global.repeat = repeat;
+global.repeat1 = repeat1;
+global.seq = seq;
+global.sym = sym;
+global.token = token;
+global.grammar = grammar;
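main.rs below shells out to `node`, piping this DSL prelude plus a `require()` of the user's grammar.js into stdin and reading the JSON from stdout. The subprocess pattern in isolation (requires `node` on PATH; the script here is a trivial stand-in for the real prelude):

```rust
use std::io::Write;
use std::process::{Command, Stdio};

fn main() {
    // Spawn node with piped stdin/stdout, like load_js_grammar_file does.
    let mut node = Command::new("node")
        .stdin(Stdio::piped())
        .stdout(Stdio::piped())
        .spawn()
        .expect("Failed to run `node`");

    // Write a script to stdin, then drop the handle so node sees EOF.
    let mut stdin = node.stdin.take().expect("Failed to open stdin");
    write!(stdin, "console.log(JSON.stringify({{a: 1}}));").expect("Failed to write");
    drop(stdin);

    let output = node.wait_with_output().expect("Failed to read output");
    assert!(output.status.success());
    assert_eq!(String::from_utf8(output.stdout).unwrap().trim(), "{\"a\":1}");
}
```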
'" + ruleName + "' rule is not."); + } + rules[ruleName] = normalize(ruleFn.call(ruleBuilder, ruleBuilder, baseGrammar.rules[ruleName])); + } + } + + let extras = baseGrammar.extras.slice(); + if (options.extras) { + if (typeof options.extras !== "function") { + throw new Error("Grammar's 'extras' property must be a function."); + } + + extras = options.extras + .call(ruleBuilder, ruleBuilder, baseGrammar.extras) + .map(normalize); + } + + let word = baseGrammar.word; + if (options.word) { + word = options.word.call(ruleBuilder, ruleBuilder).name; + if (typeof word != 'string') { + throw new Error("Grammar's 'word' property must be a named rule."); + } + } + + let conflicts = baseGrammar.conflicts; + if (options.conflicts) { + if (typeof options.conflicts !== "function") { + throw new Error("Grammar's 'conflicts' property must be a function."); + } + + const baseConflictRules = baseGrammar.conflicts.map(conflict => conflict.map(sym)); + const conflictRules = options.conflicts.call(ruleBuilder, ruleBuilder, baseConflictRules); + + if (!Array.isArray(conflictRules)) { + throw new Error("Grammar's conflicts must be an array of arrays of rules."); + } + + conflicts = conflictRules.map(conflictSet => { + if (!Array.isArray(conflictSet)) { + throw new Error("Grammar's conflicts must be an array of arrays of rules."); + } + + return conflictSet.map(symbol => symbol.name); + }); + } + + let inline = baseGrammar.inline; + if (options.inline) { + if (typeof options.inline !== "function") { + throw new Error("Grammar's 'inline' property must be a function."); + } + + const baseInlineRules = baseGrammar.inline.map(sym); + const inlineRules = options.inline.call(ruleBuilder, ruleBuilder, baseInlineRules); + + if (!Array.isArray(inlineRules)) { + throw new Error("Grammar's inline must be an array of rules."); + } + + inline = inlineRules.map(symbol => symbol.name); + } + + if (Object.keys(rules).length == 0) { + throw new Error("Grammar must have at least one rule."); + } + + return {name, word, rules, extras, conflicts, externals, inline}; + } + +global.alias = alias; +global.blank = blank; +global.choice = choice; +global.optional = optional; +global.prec = prec; +global.repeat = repeat; +global.repeat1 = repeat1; +global.seq = seq; +global.sym = sym; +global.token = token; +global.grammar = grammar; diff --git a/src/main.rs b/src/main.rs index 9dc9efb2..c7ca2ca5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,8 +1,15 @@ -use clap::{App, Arg, SubCommand}; +#[macro_use] +extern crate serde_derive; +#[macro_use] +extern crate serde_json; +#[macro_use] +extern crate lazy_static; -#[macro_use] extern crate serde_derive; -#[macro_use] extern crate serde_json; -#[macro_use] extern crate lazy_static; +use std::path::PathBuf; +use clap::{App, Arg, SubCommand}; +use std::env; +use std::io::Write; +use std::process::{Command, Stdio}; mod build_tables; mod error; @@ -20,25 +27,59 @@ fn main() -> error::Result<()> { .version("0.1") .author("Max Brunsfeld ") .about("Generates and tests parsers") + .subcommand(SubCommand::with_name("generate").about("Generate a parser")) .subcommand( - SubCommand::with_name("generate") - .about("Generate a parser") - ).subcommand( SubCommand::with_name("parse") .about("Parse a file") - .arg(Arg::with_name("path").index(1)) - ).subcommand( + .arg(Arg::with_name("path").index(1)), + ) + .subcommand( SubCommand::with_name("test") .about("Run a parser's tests") .arg(Arg::with_name("path").index(1).required(true)) .arg(Arg::with_name("line").index(2).required(true)) - 
diff --git a/src/nfa.rs b/src/nfa.rs
index bc084ede..f6acb67a 100644
--- a/src/nfa.rs
+++ b/src/nfa.rs
@@ -23,6 +23,12 @@ pub struct Nfa {
     pub states: Vec<NfaState>,
 }
 
+impl Default for Nfa {
+    fn default() -> Self {
+        Self { states: Vec::new() }
+    }
+}
+
 #[derive(Debug)]
 pub struct NfaCursor<'a> {
     pub(crate) state_ids: Vec<u32>,
diff --git a/src/prepare_grammar/extract_simple_aliases.rs b/src/prepare_grammar/extract_simple_aliases.rs
index 8b87ea2e..ff7204a0 100644
--- a/src/prepare_grammar/extract_simple_aliases.rs
+++ b/src/prepare_grammar/extract_simple_aliases.rs
@@ -22,6 +22,7 @@ pub(super) fn extract_simple_aliases(
             Symbol { kind: SymbolType::External, index } => &mut external_status_list[index],
             Symbol { kind: SymbolType::NonTerminal, index } => &mut non_terminal_status_list[index],
             Symbol { kind: SymbolType::Terminal, index } => &mut terminal_status_list[index],
+            Symbol { kind: SymbolType::End, .. } => panic!("Unexpected end token"),
         };
 
         if step.alias.is_none() {
@@ -49,6 +50,7 @@ pub(super) fn extract_simple_aliases(
             Symbol { kind: SymbolType::External, index } => &external_status_list[index],
             Symbol { kind: SymbolType::NonTerminal, index } => &non_terminal_status_list[index],
             Symbol { kind: SymbolType::Terminal, index } => &terminal_status_list[index],
+            Symbol { kind: SymbolType::End, .. } => panic!("Unexpected end token"),
         };
 
         if status.alias.is_some() {
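extract_tokens.rs below starts sorting and deduplicating each expected-conflict set. That matters because `handle_conflict` canonicalizes the symbols it finds the same way (`actual_conflict.sort_unstable(); actual_conflict.dedup();`) before checking membership in `expected_conflicts`, so both sides must agree on a normal form. The normalization in isolation:

```rust
fn main() {
    // Two declarations of the "same" conflict, in different orders and with
    // a duplicated symbol index.
    let mut declared = vec![3, 1, 3, 2];
    let mut actual = vec![1, 2, 3];

    // Canonicalize both sides identically, so Vec equality means set equality.
    declared.sort_unstable();
    declared.dedup();
    actual.sort_unstable();
    actual.dedup();

    assert_eq!(declared, actual);
}
```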
diff --git a/src/prepare_grammar/extract_tokens.rs b/src/prepare_grammar/extract_tokens.rs
index d53555af..eaeede90 100644
--- a/src/prepare_grammar/extract_tokens.rs
+++ b/src/prepare_grammar/extract_tokens.rs
@@ -67,10 +67,13 @@ pub(super) fn extract_tokens(
         .expected_conflicts
         .into_iter()
         .map(|conflict| {
-            conflict
+            let mut result: Vec<_> = conflict
                 .iter()
                 .map(|symbol| symbol_replacer.replace_symbol(*symbol))
-                .collect()
+                .collect();
+            result.sort_unstable();
+            result.dedup();
+            result
         })
         .collect();
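render/mod.rs below drives C code generation through an `add_line!` macro that prefixes each emitted line with the Generator's current indent level. The mechanism in miniature (a standalone toy, not the crate's actual macro):

```rust
use std::fmt::Write;

macro_rules! add_line {
    ($this:tt, $($arg:tt)*) => {
        // Indent, append the formatted text, then terminate the line.
        for _ in 0..$this.indent_level {
            write!(&mut $this.buffer, "  ").unwrap();
        }
        $this.buffer.write_fmt(format_args!($($arg)*)).unwrap();
        $this.buffer += "\n";
    };
}

struct Emitter {
    buffer: String,
    indent_level: usize,
}

fn main() {
    let mut e = Emitter { buffer: String::new(), indent_level: 0 };
    add_line!(e, "enum {{");
    e.indent_level += 1;
    add_line!(e, "sym_{} = {},", "identifier", 1);
    e.indent_level -= 1;
    add_line!(e, "}};");
    assert_eq!(e.buffer, "enum {\n  sym_identifier = 1,\n};\n");
}
```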
diff --git a/src/render/mod.rs b/src/render/mod.rs
index 5bd11a34..2ca610a6 100644
--- a/src/render/mod.rs
+++ b/src/render/mod.rs
@@ -1,6 +1,188 @@
-use crate::rules::{Symbol, AliasMap};
-use crate::grammars::{SyntaxGrammar, LexicalGrammar};
-use crate::tables::{ParseTable, LexTable};
+use crate::grammars::{LexicalGrammar, SyntaxGrammar, VariableType};
+use crate::rules::{Alias, AliasMap, Symbol, SymbolType};
+use crate::tables::{LexTable, ParseTable, ParseTableEntry};
+use std::collections::{HashMap, HashSet};
+use std::fmt::Write;
+
+macro_rules! add_line {
+    ($this: tt, $($arg: tt)*) => {
+        for _ in 0..$this.indent_level {
+            write!(&mut $this.buffer, "  ").unwrap();
+        }
+        $this.buffer.write_fmt(format_args!($($arg)*)).unwrap();
+        $this.buffer += "\n";
+    }
+}
+
+struct Generator {
+    buffer: String,
+    indent_level: usize,
+
+    language_name: String,
+    parse_table: ParseTable,
+    main_lex_table: LexTable,
+    keyword_lex_table: LexTable,
+    keyword_capture_token: Option<Symbol>,
+    syntax_grammar: SyntaxGrammar,
+    lexical_grammar: LexicalGrammar,
+    simple_aliases: AliasMap,
+    symbol_ids: HashMap<Symbol, String>,
+    parse_table_entries: Vec<(usize, ParseTableEntry)>,
+    next_parse_action_list_index: usize,
+    unique_aliases: HashSet<Alias>,
+}
+
+impl Generator {
+    fn generate(mut self) -> String {
+        self.add_includes();
+        self.add_pragmas();
+        self.add_stats();
+        self.add_symbol_enum();
+        self.add_symbol_names_list();
+        self.buffer
+    }
+
+    fn add_includes(&mut self) {
+        add_line!(self, "#include <tree_sitter/parser.h>");
+        add_line!(self, "");
+    }
+
+    fn add_pragmas(&mut self) {
+        add_line!(self, "#if defined(__GNUC__) || defined(__clang__)");
+        add_line!(self, "#pragma GCC diagnostic push");
+        add_line!(self, "#pragma GCC diagnostic ignored \"-Wmissing-field-initializers\"");
+        add_line!(self, "#endif");
+        add_line!(self, "");
+
+        // Compiling large lexer functions can be very slow, especially when
+        // using Visual Studio on Windows. Disabling optimizations is not
+        // ideal, but only a very small fraction of overall parse time is
+        // spent lexing, so the performance impact of this is pretty small.
+        if self.main_lex_table.states.len() > 500 {
+            add_line!(self, "#ifdef _MSC_VER");
+            add_line!(self, "#pragma optimize(\"\", off)");
+            add_line!(self, "#endif");
+            add_line!(self, "");
+        }
+    }
+
+    fn add_stats(&mut self) {
+        let mut token_count = 0;
+
+        for symbol in &self.parse_table.symbols {
+            if symbol.is_terminal() {
+                token_count += 1;
+            } else if symbol.is_external() {
+                let external_token = &self.syntax_grammar.external_tokens[symbol.index];
+                if external_token.corresponding_internal_token.is_none() {
+                    token_count += 1;
+                }
+            }
+        }
+
+        for alias_sequence in &self.parse_table.alias_sequences {
+            for entry in alias_sequence {
+                if let Some(alias) = entry {
+                    self.unique_aliases.insert(alias.clone());
+                }
+            }
+        }
+
+        let mut symbol_id_values = HashSet::new();
+        for i in 0..self.parse_table.symbols.len() {
+            self.assign_symbol_id(self.parse_table.symbols[i], &mut symbol_id_values);
+        }
+
+        add_line!(self, "#define LANGUAGE_VERSION {}", 6);
+        add_line!(self, "#define STATE_COUNT {}", self.parse_table.states.len());
+        add_line!(self, "#define SYMBOL_COUNT {}", self.parse_table.symbols.len());
+        add_line!(self, "#define ALIAS_COUNT {}", self.unique_aliases.len());
+        add_line!(self, "#define TOKEN_COUNT {}", token_count);
+        add_line!(self, "#define EXTERNAL_TOKEN_COUNT {}", self.syntax_grammar.external_tokens.len());
+        // add_line!(self, "#define MAX_ALIAS_SEQUENCE_LENGTH {}\n", self.parse_table.max_alias_sequence_length);
+        add_line!(self, "");
+    }
+
+    fn add_symbol_enum(&mut self) {
+        add_line!(self, "enum {{");
+        self.indent();
+        for i in 0..self.parse_table.symbols.len() {
+            let symbol = self.parse_table.symbols[i];
+            if symbol != Symbol::end() {
+                add_line!(self, "{} = {},", self.symbol_ids[&symbol], i);
+            }
+        }
+        self.dedent();
+        add_line!(self, "}};");
+        add_line!(self, "");
+    }
+
+    fn add_symbol_names_list(&mut self) {
+        add_line!(self, "static const char *ts_symbol_names[] = {{");
+        self.indent();
+        self.dedent();
+        add_line!(self, "}};");
+        add_line!(self, "");
+    }
+
+    fn assign_symbol_id(&mut self, symbol: Symbol, used_ids: &mut HashSet<String>) {
+        let mut id;
+        if symbol == Symbol::end() {
+            id = "ts_builtin_sym_end".to_string();
+        } else {
+            let (name, kind) = self.metadata_for_symbol(symbol);
+            id = match kind {
+                VariableType::Auxiliary => format!("aux_sym_{}", self.sanitize_name(name)),
+                VariableType::Anonymous => format!("anon_sym_{}", self.sanitize_name(name)),
+                VariableType::Hidden | VariableType::Named => {
+                    format!("sym_{}", self.sanitize_name(name))
+                }
+            };
+
+            let mut suffix_number = 1;
+            let mut suffix = String::new();
+            while used_ids.contains(&id) {
+                id.drain(id.len() - suffix.len()..);
+                suffix_number += 1;
+                suffix = suffix_number.to_string();
+                id += &suffix;
+            }
+        }
+
+        used_ids.insert(id.clone());
+        self.symbol_ids.insert(symbol, id);
+    }
+
+    fn metadata_for_symbol(&self, symbol: Symbol) -> (&str, VariableType) {
+        match symbol.kind {
+            SymbolType::End => ("end", VariableType::Auxiliary),
+            SymbolType::NonTerminal => {
+                let variable = &self.syntax_grammar.variables[symbol.index];
+                (&variable.name, variable.kind)
+            }
+            SymbolType::Terminal => {
+                let variable = &self.lexical_grammar.variables[symbol.index];
+                (&variable.name, variable.kind)
+            }
+            SymbolType::External => {
+                let token = &self.syntax_grammar.external_tokens[symbol.index];
+                (&token.name, token.kind)
+            }
+        }
+    }
+
+    fn sanitize_name(&self, name: &str) -> String {
+        name.to_string()
+    }
+
+    fn indent(&mut self) {
+        self.indent_level += 1;
+    }
+
+    fn dedent(&mut self) {
+        self.indent_level -= 1;
+    }
+}
 
 pub(crate) fn render_c_code(
     name: &str,
@@ -12,5 +194,21 @@ pub(crate) fn render_c_code(
     lexical_grammar: LexicalGrammar,
     simple_aliases: AliasMap,
 ) -> String {
-    unimplemented!();
+    Generator {
+        buffer: String::new(),
+        indent_level: 0,
+        language_name: name.to_string(),
+        parse_table,
+        main_lex_table,
+        keyword_lex_table,
+        keyword_capture_token,
+        syntax_grammar,
+        lexical_grammar,
+        simple_aliases,
+        symbol_ids: HashMap::new(),
+        parse_table_entries: Vec::new(),
+        next_parse_action_list_index: 0,
+        unique_aliases: HashSet::new(),
+    }
+    .generate()
 }
diff --git a/src/rules.rs b/src/rules.rs
index 9374a283..34f4c8b9 100644
--- a/src/rules.rs
+++ b/src/rules.rs
@@ -1,10 +1,11 @@
 use std::collections::HashMap;
 
-#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
 pub(crate) enum SymbolType {
     External,
     Terminal,
     NonTerminal,
+    End,
 }
 
 #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
@@ -33,7 +34,7 @@ pub(crate) struct MetadataParams {
     pub alias: Option<Alias>,
 }
 
-#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
 pub(crate) struct Symbol {
     pub kind: SymbolType,
     pub index: usize,
@@ -56,6 +57,15 @@ pub(crate) enum Rule {
 }
 
 impl Rule {
+    pub fn alias(content: Rule, value: String, is_named: bool) -> Self {
+        add_metadata(content, move |params| {
+            params.alias = Some(Alias { is_named, value });
+        })
+    }
+
     pub fn token(content: Rule) -> Self {
         add_metadata(content, |params| {
             params.is_token = true;
@@ -169,6 +179,13 @@ impl Symbol {
             index,
         }
     }
+
+    pub fn end() -> Self {
+        Symbol {
+            kind: SymbolType::End,
+            index: 0,
+        }
+    }
 }
 
 impl From<Symbol> for Rule {
@@ -177,7 +194,7 @@ impl From<Symbol> for Rule {
     }
 }
 
-fn add_metadata<T: Fn(&mut MetadataParams)>(input: Rule, f: T) -> Rule {
+fn add_metadata<T: FnOnce(&mut MetadataParams)>(input: Rule, f: T) -> Rule {
     match input {
         Rule::Metadata { rule, mut params } => {
             f(&mut params);
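One detail worth calling out (the `Fn` to `FnOnce` change on `add_metadata` is reconstructed here from context): the new `Rule::alias` passes a `move` closure that consumes its captured `value: String`, and a closure that moves a capture out can only be called once, so it satisfies `FnOnce` but not `Fn`. A standalone illustration:

```rust
struct Params {
    alias: Option<String>,
}

// The bound must be FnOnce: callers may pass closures that consume captures.
fn apply<T: FnOnce(&mut Params)>(params: &mut Params, f: T) {
    f(params);
}

fn main() {
    let value = String::from("identifier");
    let mut params = Params { alias: None };
    // This move closure gives `value` away, so it is FnOnce-only;
    // with a `Fn` bound this call would not compile.
    apply(&mut params, move |p| p.alias = Some(value));
    assert_eq!(params.alias.as_deref(), Some("identifier"));
}
```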
diff --git a/src/tables.rs b/src/tables.rs
index de66253c..9100b81e 100644
--- a/src/tables.rs
+++ b/src/tables.rs
@@ -6,20 +6,13 @@
 pub(crate) type AliasSequenceId = usize;
 pub(crate) type ParseStateId = usize;
 pub(crate) type LexStateId = usize;
 
-#[derive(Clone, Copy, Debug, PartialEq, Eq)]
-pub(crate) enum ParseActionType {
-    Error,
-    Shift,
-    Reduce,
-    Accept,
-    Recover,
-}
-
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
 pub(crate) enum ParseAction {
     Accept,
-    Error,
-    Shift(ParseStateId),
+    Shift {
+        state: ParseStateId,
+        is_repetition: bool,
+    },
     ShiftExtra,
     Recover,
     Reduce {
@@ -28,50 +21,69 @@ pub(crate) enum ParseAction {
         precedence: i32,
         dynamic_precedence: i32,
         associativity: Option<Associativity>,
-        alias_sequence_id: Option<AliasSequenceId>,
-        is_repetition: bool,
+        alias_sequence_id: AliasSequenceId,
     },
 }
 
 #[derive(Clone, Debug, PartialEq, Eq)]
 pub(crate) struct ParseTableEntry {
-    actions: Vec<ParseAction>,
-    reusable: bool,
+    pub actions: Vec<ParseAction>,
+    pub reusable: bool,
 }
 
 #[derive(Clone, Debug, PartialEq, Eq)]
 pub(crate) struct ParseState {
-    terminal_entries: HashMap<Symbol, ParseTableEntry>,
-    nonterminal_entries: HashMap<Symbol, ParseStateId>,
+    pub terminal_entries: HashMap<Symbol, ParseTableEntry>,
+    pub nonterminal_entries: HashMap<Symbol, ParseStateId>,
 }
 
 #[derive(Debug, PartialEq, Eq)]
 pub(crate) struct ParseTable {
-    states: Vec<ParseState>,
-    alias_sequences: Vec<Vec<Alias>>,
+    pub states: Vec<ParseState>,
+    pub symbols: Vec<Symbol>,
+    pub alias_sequences: Vec<Vec<Option<Alias>>>,
 }
 
 #[derive(Clone, Debug, PartialEq, Eq)]
 pub(crate) struct AdvanceAction {
-    state: LexStateId,
-    precedence: Range<i32>,
-    in_main_token: bool,
+    pub state: LexStateId,
+    pub precedence: Range<i32>,
+    pub in_main_token: bool,
 }
 
 #[derive(Clone, Debug, PartialEq, Eq)]
 pub(crate) struct AcceptTokenAction {
-    symbol: Symbol,
-    precedence: i32,
-    implicit_precedence: i32,
+    pub symbol: Symbol,
+    pub precedence: i32,
+    pub implicit_precedence: i32,
 }
 
 #[derive(Clone, Debug, PartialEq, Eq)]
 pub(crate) struct LexState {
-    advance_actions: HashMap<char, AdvanceAction>,
-    accept_action: Option<AcceptTokenAction>,
+    pub advance_actions: HashMap<char, AdvanceAction>,
+    pub accept_action: Option<AcceptTokenAction>,
 }
 
 #[derive(Debug, PartialEq, Eq)]
 pub(crate) struct LexTable {
-    states: Vec<LexState>,
+    pub states: Vec<LexState>,
+}
+
+impl ParseTableEntry {
+    pub fn new() -> Self {
+        Self {
+            reusable: true,
+            actions: Vec::new(),
+        }
+    }
+}
+
+impl ParseAction {
+    pub fn precedence(&self) -> i32 {
+        if let ParseAction::Reduce { precedence, .. } = self {
+            *precedence
+        } else {
+            0
+        }
+    }
 }
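Taken together, `ParseAction::precedence` (REDUCE actions carry their production's precedence, everything else defaults to 0) and the logic in `handle_conflict` implement classic precedence/associativity conflict resolution. A self-contained miniature of the whole policy (toy types; the real code additionally tracks precedence ranges and repetition ambiguities):

```rust
#[derive(Clone, Copy, Debug, PartialEq)]
enum Assoc {
    Left,
    Right,
}

#[derive(Clone, Copy, Debug, PartialEq)]
enum Action {
    Shift,
    Reduce,
}

// Resolve one shift/reduce conflict, mirroring handle_conflict's order of
// checks: precedence first, associativity only on a tie.
fn resolve(shift_prec: i32, reduce_prec: i32, assoc: Option<Assoc>) -> Option<Action> {
    if shift_prec > reduce_prec {
        Some(Action::Shift)
    } else if shift_prec < reduce_prec {
        Some(Action::Reduce)
    } else {
        match assoc {
            Some(Assoc::Left) => Some(Action::Reduce),
            Some(Assoc::Right) => Some(Action::Shift),
            None => None, // genuine conflict: report unless whitelisted
        }
    }
}

fn main() {
    // `a * b • + c` with prec(*) = 2, prec(+) = 1: reduce `a * b` first.
    assert_eq!(resolve(1, 2, None), Some(Action::Reduce));
    // `a - b • - c` with equal precedence and left associativity: reduce.
    assert_eq!(resolve(1, 1, Some(Assoc::Left)), Some(Action::Reduce));
    // Equal precedence, no associativity: unresolved, so a ConflictError.
    assert_eq!(resolve(1, 1, None), None);
}
```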