diff --git a/cli/src/generate/build_tables/build_parse_table.rs b/cli/src/generate/build_tables/build_parse_table.rs
index bd790b29..5351f72e 100644
--- a/cli/src/generate/build_tables/build_parse_table.rs
+++ b/cli/src/generate/build_tables/build_parse_table.rs
@@ -41,12 +41,11 @@ struct ParseTableBuilder<'a> {
     item_sets_by_state_id: Vec<ParseItemSet<'a>>,
     parse_state_queue: VecDeque<ParseStateQueueEntry>,
     parse_table: ParseTable,
-    following_tokens: Vec<TokenSet>,
     state_ids_to_log: Vec<usize>,
 }
 
 impl<'a> ParseTableBuilder<'a> {
-    fn build(mut self) -> Result<(ParseTable, Vec<TokenSet>)> {
+    fn build(mut self) -> Result<ParseTable> {
         // Ensure that the empty alias sequence has index 0.
         self.parse_table.alias_sequences.push(Vec::new());
 
@@ -99,7 +98,7 @@ impl<'a> ParseTableBuilder<'a> {
 
         self.remove_precedences();
 
-        Ok((self.parse_table, self.following_tokens))
+        Ok(self.parse_table)
     }
 
     fn add_parse_state(
@@ -108,20 +107,6 @@ impl<'a> ParseTableBuilder<'a> {
         preceding_auxiliary_symbols: &AuxiliarySymbolSequence,
         item_set: ParseItemSet<'a>,
     ) -> ParseStateId {
-        if preceding_symbols.len() > 1 {
-            let left_tokens = self
-                .item_set_builder
-                .last_set(&preceding_symbols[preceding_symbols.len() - 2]);
-            let right_tokens = self
-                .item_set_builder
-                .first_set(&preceding_symbols[preceding_symbols.len() - 1]);
-            for left_token in left_tokens.iter() {
-                if left_token.is_terminal() {
-                    self.following_tokens[left_token.index].insert_all(right_tokens);
-                }
-            }
-        }
-
         let mut hasher = DefaultHasher::new();
         item_set.hash_unfinished_items(&mut hasher);
         let unfinished_item_signature = hasher.finish();
@@ -705,17 +690,50 @@ impl<'a> ParseTableBuilder<'a> {
     }
 }
 
+fn populate_following_tokens(
+    result: &mut Vec<TokenSet>,
+    grammar: &SyntaxGrammar,
+    inlines: &InlinedProductionMap,
+    builder: &ParseItemSetBuilder,
+) {
+    let productions = grammar
+        .variables
+        .iter()
+        .flat_map(|v| &v.productions)
+        .chain(&inlines.productions);
+    for production in productions {
+        for i in 1..production.steps.len() {
+            let left_tokens = builder.last_set(&production.steps[i - 1].symbol);
+            let right_tokens = builder.first_set(&production.steps[i].symbol);
+            for left_token in left_tokens.iter() {
+                if left_token.is_terminal() {
+                    result[left_token.index].insert_all_terminals(right_tokens);
+                }
+            }
+        }
+    }
+}
+
 pub(crate) fn build_parse_table(
     syntax_grammar: &SyntaxGrammar,
     lexical_grammar: &LexicalGrammar,
     inlines: &InlinedProductionMap,
     state_ids_to_log: Vec<usize>,
 ) -> Result<(ParseTable, Vec<TokenSet>)> {
-    ParseTableBuilder {
+    let item_set_builder = ParseItemSetBuilder::new(syntax_grammar, lexical_grammar, inlines);
+    let mut following_tokens = vec![TokenSet::new(); lexical_grammar.variables.len()];
+    populate_following_tokens(
+        &mut following_tokens,
+        syntax_grammar,
+        inlines,
+        &item_set_builder,
+    );
+
+    let table = ParseTableBuilder {
         syntax_grammar,
         lexical_grammar,
         state_ids_to_log,
-        item_set_builder: ParseItemSetBuilder::new(syntax_grammar, lexical_grammar, inlines),
+        item_set_builder,
         state_ids_by_item_set: HashMap::new(),
         item_sets_by_state_id: Vec::new(),
         parse_state_queue: VecDeque::new(),
@@ -725,7 +743,8 @@ pub(crate) fn build_parse_table(
             alias_sequences: Vec::new(),
             max_aliased_production_length: 0,
         },
-        following_tokens: vec![TokenSet::new(); lexical_grammar.variables.len()],
     }
-    .build()
+    .build()?;
+
+    Ok((table, following_tokens))
 }
diff --git a/cli/src/generate/build_tables/item.rs b/cli/src/generate/build_tables/item.rs
index 6c74d465..9f3307dd 100644
--- a/cli/src/generate/build_tables/item.rs
+++ b/cli/src/generate/build_tables/item.rs
@@ -48,7 +48,11 @@ pub(crate) struct ParseItemDisplay<'a>(
     pub &'a LexicalGrammar,
 );
 
-pub(crate) struct TokenSetDisplay<'a>(&'a TokenSet, &'a SyntaxGrammar, &'a LexicalGrammar);
+pub(crate) struct TokenSetDisplay<'a>(
+    pub &'a TokenSet,
+    pub &'a SyntaxGrammar,
+    pub &'a LexicalGrammar,
+);
 
 #[allow(dead_code)]
 pub(crate) struct ParseItemSetDisplay<'a>(
@@ -134,30 +138,42 @@ impl TokenSet {
         vec.set(other.index, true);
     }
 
-    pub fn insert_all(&mut self, other: &TokenSet) -> bool {
+    pub fn insert_all_terminals(&mut self, other: &TokenSet) -> bool {
         let mut result = false;
         if other.terminal_bits.len() > self.terminal_bits.len() {
             self.terminal_bits.resize(other.terminal_bits.len(), false);
         }
-        if other.external_bits.len() > self.external_bits.len() {
-            self.external_bits.resize(other.external_bits.len(), false);
-        }
         for (i, element) in other.terminal_bits.iter().enumerate() {
             if element {
                 result |= !self.terminal_bits[i];
                 self.terminal_bits.set(i, element);
             }
         }
+        result
+    }
+
+    fn insert_all_externals(&mut self, other: &TokenSet) -> bool {
+        let mut result = false;
+        if other.external_bits.len() > self.external_bits.len() {
+            self.external_bits.resize(other.external_bits.len(), false);
+        }
         for (i, element) in other.external_bits.iter().enumerate() {
             if element {
                 result |= !self.external_bits[i];
                 self.external_bits.set(i, element);
             }
         }
+        result
+    }
+
+    pub fn insert_all(&mut self, other: &TokenSet) -> bool {
+        let mut result = false;
         if other.eof {
             result |= !self.eof;
             self.eof = true;
         }
+        result |= self.insert_all_terminals(other);
+        result |= self.insert_all_externals(other);
         result
     }
 }
diff --git a/cli/src/generate/build_tables/item_set_builder.rs b/cli/src/generate/build_tables/item_set_builder.rs
index b941b179..9a929f05 100644
--- a/cli/src/generate/build_tables/item_set_builder.rs
+++ b/cli/src/generate/build_tables/item_set_builder.rs
@@ -1,6 +1,6 @@
-use super::item::{ParseItem, ParseItemDisplay, ParseItemSet, TokenSet};
+use super::item::{ParseItem, ParseItemDisplay, ParseItemSet, TokenSet, TokenSetDisplay};
 use crate::generate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar};
-use crate::generate::rules::Symbol;
+use crate::generate::rules::{Symbol, SymbolType};
 use hashbrown::{HashMap, HashSet};
 use std::fmt;
 
@@ -268,7 +268,7 @@ impl<'a> ParseItemSetBuilder<'a> {
     }
 
     pub fn last_set(&self, symbol: &Symbol) -> &TokenSet {
-        &self.first_sets[symbol]
+        &self.last_sets[symbol]
     }
 
     fn add_item(&self, set: &mut ParseItemSet<'a>, item: ParseItem<'a>, lookaheads: &TokenSet) {
@@ -300,6 +300,40 @@ impl<'a> fmt::Debug for ParseItemSetBuilder<'a> {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         write!(f, "ParseItemSetBuilder {{\n")?;
 
+        write!(f, "  first_sets: {{\n")?;
+        for (symbol, first_set) in &self.first_sets {
+            let name = match symbol.kind {
+                SymbolType::NonTerminal => &self.syntax_grammar.variables[symbol.index].name,
+                SymbolType::External => &self.syntax_grammar.external_tokens[symbol.index].name,
+                SymbolType::Terminal => &self.lexical_grammar.variables[symbol.index].name,
+                SymbolType::End => "END",
+            };
+            write!(
+                f,
+                "    first({:?}): {}\n",
+                name,
+                TokenSetDisplay(first_set, &self.syntax_grammar, &self.lexical_grammar)
+            )?;
+        }
+        write!(f, "  }}\n")?;
+
+        write!(f, "  last_sets: {{\n")?;
+        for (symbol, last_set) in &self.last_sets {
+            let name = match symbol.kind {
+                SymbolType::NonTerminal => &self.syntax_grammar.variables[symbol.index].name,
+                SymbolType::External => &self.syntax_grammar.external_tokens[symbol.index].name,
+                SymbolType::Terminal => &self.lexical_grammar.variables[symbol.index].name,
+                SymbolType::End => "END",
+            };
+            write!(
+                f,
+                "    last({:?}): {}\n",
+                name,
+                TokenSetDisplay(last_set, &self.syntax_grammar, &self.lexical_grammar)
+            )?;
+        }
+        write!(f, "  }}\n")?;
+
         write!(f, "  additions: {{\n")?;
         for (i, variable) in self.syntax_grammar.variables.iter().enumerate() {
             write!(f, "    {}: {{\n", variable.name)?;
diff --git a/cli/src/generate/build_tables/token_conflicts.rs b/cli/src/generate/build_tables/token_conflicts.rs
index 1f89022a..1c4fc753 100644
--- a/cli/src/generate/build_tables/token_conflicts.rs
+++ b/cli/src/generate/build_tables/token_conflicts.rs
@@ -1,5 +1,5 @@
-use crate::generate::build_tables::item::TokenSet;
-use crate::generate::grammars::LexicalGrammar;
+use crate::generate::build_tables::item::{TokenSet, TokenSetDisplay};
+use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar};
 use crate::generate::nfa::{CharacterSet, NfaCursor, NfaTransition};
 use hashbrown::HashSet;
 use std::cmp::Ordering;
@@ -16,6 +16,7 @@ struct TokenConflictStatus {
 pub(crate) struct TokenConflictMap<'a> {
     n: usize,
     status_matrix: Vec<TokenConflictStatus>,
+    following_tokens: Vec<TokenSet>,
     starting_chars_by_index: Vec<CharacterSet>,
     following_chars_by_index: Vec<CharacterSet>,
     grammar: &'a LexicalGrammar,
@@ -25,7 +26,7 @@ impl<'a> TokenConflictMap<'a> {
     pub fn new(grammar: &'a LexicalGrammar, following_tokens: Vec<TokenSet>) -> Self {
         let mut cursor = NfaCursor::new(&grammar.nfa, Vec::new());
         let starting_chars = get_starting_chars(&mut cursor, grammar);
-        let following_chars = get_following_chars(&starting_chars, following_tokens);
+        let following_chars = get_following_chars(&starting_chars, &following_tokens);
 
         let n = grammar.variables.len();
         let mut status_matrix = vec![TokenConflictStatus::default(); n * n];
@@ -40,6 +41,7 @@ impl<'a> TokenConflictMap<'a> {
         TokenConflictMap {
             n,
             status_matrix,
+            following_tokens,
             starting_chars_by_index: starting_chars,
             following_chars_by_index: following_chars,
             grammar,
@@ -115,9 +117,27 @@ impl<'a> fmt::Debug for TokenConflictMap<'a> {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         write!(f, "TokenConflictMap {{\n")?;
 
+        let syntax_grammar = SyntaxGrammar::default();
+
+        write!(f, "  following_tokens: {{\n")?;
+        for (i, following_tokens) in self.following_tokens.iter().enumerate() {
+            write!(
+                f,
+                "    follow({:?}): {},\n",
+                self.grammar.variables[i].name,
+                TokenSetDisplay(following_tokens, &syntax_grammar, &self.grammar)
+            )?;
+        }
+        write!(f, "  }},\n")?;
+
         write!(f, "  starting_characters: {{\n")?;
         for i in 0..self.n {
-            write!(f, "    {}: {:?},\n", i, self.starting_chars_by_index[i])?;
+            write!(
+                f,
+                "    {:?}: {:?},\n",
+                self.grammar.variables[i].name,
+                self.starting_chars_by_index[i]
+            )?;
         }
         write!(f, "  }},\n")?;
 
@@ -169,10 +189,10 @@ fn get_starting_chars(cursor: &mut NfaCursor, grammar: &LexicalGrammar) -> Vec<CharacterSet> {
 
 fn get_following_chars(
     starting_chars: &Vec<CharacterSet>,
-    following_tokens: Vec<TokenSet>,
+    following_tokens: &Vec<TokenSet>,
 ) -> Vec<CharacterSet> {
     following_tokens
-        .into_iter()
+        .iter()
        .map(|following_tokens| {
            let mut chars = CharacterSet::empty();
            for token in following_tokens.iter() {
diff --git a/cli/src/generate/grammars.rs b/cli/src/generate/grammars.rs
index 3cedcd42..c9282da3 100644
--- a/cli/src/generate/grammars.rs
+++ b/cli/src/generate/grammars.rs
@@ -81,7 +81,7 @@ pub(crate) struct ExternalToken {
     pub corresponding_internal_token: Option<Symbol>,
 }
 
-#[derive(Debug)]
+#[derive(Debug, Default)]
 pub(crate) struct SyntaxGrammar {
     pub variables: Vec<SyntaxVariable>,
     pub extra_tokens: Vec<Symbol>,